使用 BeautifulSoup 和 Mechanize 登录网页
Login into web page using BeautifulSoup and Mechanize
我正在尝试使用 BeautifulSoup 和 Mechanize 以编程方式登录网页。
这是我的代码:
#import urllib2
from mechanize import Browser, _http, urlopen
from BeautifulSoup import BeautifulSoup
import cookielib
# URL the script opens first; the returned page is then inspected for the login form.
data_url = "http://data.theice.com/ViewData/EndOfDay/LdnOptions.aspx?p=AER"
def are_we_logged_on(html):
    """Return True when *html* does not contain the login form's user-name input.

    The markup is parsed with BeautifulSoup and searched for the ``<input>``
    element whose id is ``ctl00_ContentPlaceHolder1_LoginControl_m_userName``.
    If that element is absent the page is treated as "already logged on".
    """
    soup = BeautifulSoup(html)
    elem = soup.find("input", {"id": "ctl00_ContentPlaceHolder1_LoginControl_m_userName"})
    return elem is None
# Open the data page with a mechanize Browser and, if the login form is
# shown, try to log in; abort the script when the login check still fails.

# Browser
br = Browser()

# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Browser options
br.set_handle_equiv(True)
#br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)

# Follow refresh 0, but do not hang on refresh > 0
br.set_handle_refresh(_http.HTTPRefreshProcessor(), max_time=1)

# User-Agent (this is cheating, ok?)
br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0')]

# The site we will navigate into, handling its session
response = br.open(data_url)
html = response.get_data()

# Do we need to log in?
logged_on = are_we_logged_on(html)

if not logged_on:
    print("DEBUG: Attempting to log in ...")

    # Select the first (index zero) form
    br.select_form(nr=0)

    # User credentials
    br.form['ctl00$ContentPlaceHolder1$LoginControl$m_userName'] = 'username'
    br.form['ctl00$ContentPlaceHolder1$LoginControl$m_password'] = 'password'

    # Login: build the POST request data from the form and dump it for debugging
    post_url, post_data, headers = br.form.click_request_data()

    print(post_url)
    print(post_data)
    print(headers)

    # NOTE(review): the POST is sent with the module-level urlopen rather than
    # through br -- presumably it does not share br's cookie jar or headers;
    # confirm against the mechanize documentation.
    resp = urlopen(post_url, post_data)

    # Check if login was successful
    html2 = resp.read()
    logged_on = are_we_logged_on(html2)

    if not logged_on:
        # Keep the returned page on disk so the failure can be inspected
        with open("icedump_fail.html", "w") as f:
            f.write(html2)
        print("DEBUG: Failed to logon. Aborting script ...!")
        exit(-1)

# If we got this far, then we are logged in ...
当我运行脚本时,执行路径总是导致 "Failed to logon" 消息被打印到屏幕上。
有人能看出我可能做错了什么吗?我已经没有头绪了,也许需要一双新的眼睛。
打开 "debug" 模式(br.set_debug_http(True))帮助我检查了 mechanize 为提交登录表单而发送的底层请求,并将其与使用浏览器登录时实际发送的请求进行了比较。
这表明 __EVENTTARGET 参数被发送为空,但它不应该是空的。
这是帮助我解决问题的修正后的代码部分:
# Work on the first form of the page and lift the read-only flag from its
# controls (presumably so that __EVENTTARGET can be assigned -- confirm
# against the mechanize form documentation).
br.select_form(nr=0)
br.form.set_all_readonly(False)

# Credentials first, then the postback target that was being sent empty.
for field_name, field_value in (
    ('ctl00$ContentPlaceHolder1$LoginControl$m_userName', 'username'),
    ('ctl00$ContentPlaceHolder1$LoginControl$m_password', 'password'),
    ('__EVENTTARGET', 'ctl00$ContentPlaceHolder1$LoginControl$LoginButton'),
):
    br.form[field_name] = field_value

# Login: submit through the browser, then re-run the logged-in check
response = br.submit()
html2 = response.read()
logged_on = are_we_logged_on(html2)
作为旁注,请确保您没有违反在 "ICE" 注册时 "数字签署" 的协议:
Scraping:
The scraping of this website for the purpose of extracting data automatically from this website is strictly prohibited
by ICE and it should be noted that this process could result in a
drain on ICE's system resources. ICE (or its affiliates, agents or
contractors) may monitor usage of this website for scraping purposes
and may take all necessary actions to ensure that access to this
website is removed from entities carrying out or reasonably believed
to be carrying out web scraping activities.
我会使用 Selenium,因为它功能齐全而且功能更强大。您实际上也可以看到结果:
from selenium import webdriver
from selenium.webdriver.common.by import By

# Drive a Chrome window through the login page so the result can be watched.
chrome = webdriver.Chrome()
chrome.get('http://data.theice.com/ViewData/EndOfDay/LdnOptions.aspx?p=AER')

# The two text inputs are located by their name attribute ("$"-separated).
user = chrome.find_element(By.NAME, 'ctl00$ContentPlaceHolder1$LoginControl$m_userName')
pswd = chrome.find_element(By.NAME, 'ctl00$ContentPlaceHolder1$LoginControl$m_password')
# The "_"-separated identifier matches the page's id pattern (names use "$"),
# so the login button is looked up by id rather than by name.
form = chrome.find_element(By.ID, 'ctl00_ContentPlaceHolder1_LoginControl_LoginButton')

# your_username_string / your_password_string are placeholders for real credentials.
user.send_keys(your_username_string)
pswd.send_keys(your_password_string)
form.click()  # hit the login button
我正在尝试使用 BeautifulSoup 和 Mechanize 以编程方式登录网页。
这是我的代码:
#import urllib2
from mechanize import Browser, _http, urlopen
from BeautifulSoup import BeautifulSoup
import cookielib
# URL the script opens first; the returned page is then inspected for the login form.
data_url = "http://data.theice.com/ViewData/EndOfDay/LdnOptions.aspx?p=AER"
def are_we_logged_on(html):
    """Return True when *html* does not contain the login form's user-name input.

    The markup is parsed with BeautifulSoup and searched for the ``<input>``
    element whose id is ``ctl00_ContentPlaceHolder1_LoginControl_m_userName``.
    If that element is absent the page is treated as "already logged on".
    """
    soup = BeautifulSoup(html)
    elem = soup.find("input", {"id": "ctl00_ContentPlaceHolder1_LoginControl_m_userName"})
    return elem is None
# Open the data page with a mechanize Browser and, if the login form is
# shown, try to log in; abort the script when the login check still fails.

# Browser
br = Browser()

# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Browser options
br.set_handle_equiv(True)
#br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)

# Follow refresh 0, but do not hang on refresh > 0
br.set_handle_refresh(_http.HTTPRefreshProcessor(), max_time=1)

# User-Agent (this is cheating, ok?)
br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0')]

# The site we will navigate into, handling its session
response = br.open(data_url)
html = response.get_data()

# Do we need to log in?
logged_on = are_we_logged_on(html)

if not logged_on:
    print("DEBUG: Attempting to log in ...")

    # Select the first (index zero) form
    br.select_form(nr=0)

    # User credentials
    br.form['ctl00$ContentPlaceHolder1$LoginControl$m_userName'] = 'username'
    br.form['ctl00$ContentPlaceHolder1$LoginControl$m_password'] = 'password'

    # Login: build the POST request data from the form and dump it for debugging
    post_url, post_data, headers = br.form.click_request_data()

    print(post_url)
    print(post_data)
    print(headers)

    # NOTE(review): the POST is sent with the module-level urlopen rather than
    # through br -- presumably it does not share br's cookie jar or headers;
    # confirm against the mechanize documentation.
    resp = urlopen(post_url, post_data)

    # Check if login was successful
    html2 = resp.read()
    logged_on = are_we_logged_on(html2)

    if not logged_on:
        # Keep the returned page on disk so the failure can be inspected
        with open("icedump_fail.html", "w") as f:
            f.write(html2)
        print("DEBUG: Failed to logon. Aborting script ...!")
        exit(-1)

# If we got this far, then we are logged in ...
当我运行脚本时,执行路径总是导致 "Failed to logon" 消息被打印到屏幕上。
有人能看出我可能做错了什么吗?我已经没有头绪了,也许需要一双新的眼睛。
打开 "debug" 模式(br.set_debug_http(True))帮助我检查了 mechanize 为提交登录表单而发送的底层请求,并将其与使用浏览器登录时实际发送的请求进行了比较。
这表明 __EVENTTARGET 参数被发送为空,但它不应该是空的。
这是帮助我解决问题的修正后的代码部分:
# Work on the first form of the page and lift the read-only flag from its
# controls (presumably so that __EVENTTARGET can be assigned -- confirm
# against the mechanize form documentation).
br.select_form(nr=0)
br.form.set_all_readonly(False)

# Credentials first, then the postback target that was being sent empty.
for field_name, field_value in (
    ('ctl00$ContentPlaceHolder1$LoginControl$m_userName', 'username'),
    ('ctl00$ContentPlaceHolder1$LoginControl$m_password', 'password'),
    ('__EVENTTARGET', 'ctl00$ContentPlaceHolder1$LoginControl$LoginButton'),
):
    br.form[field_name] = field_value

# Login: submit through the browser, then re-run the logged-in check
response = br.submit()
html2 = response.read()
logged_on = are_we_logged_on(html2)
作为旁注,请确保您没有违反在 "ICE" 注册时 "数字签署" 的协议:
Scraping:
The scraping of this website for the purpose of extracting data automatically from this website is strictly prohibited by ICE and it should be noted that this process could result in a drain on ICE's system resources. ICE (or its affiliates, agents or contractors) may monitor usage of this website for scraping purposes and may take all necessary actions to ensure that access to this website is removed from entities carrying out or reasonably believed to be carrying out web scraping activities.
我会使用 Selenium,因为它功能齐全而且功能更强大。您实际上也可以看到结果:
from selenium import webdriver
from selenium.webdriver.common.by import By

# Drive a Chrome window through the login page so the result can be watched.
chrome = webdriver.Chrome()
chrome.get('http://data.theice.com/ViewData/EndOfDay/LdnOptions.aspx?p=AER')

# The two text inputs are located by their name attribute ("$"-separated).
user = chrome.find_element(By.NAME, 'ctl00$ContentPlaceHolder1$LoginControl$m_userName')
pswd = chrome.find_element(By.NAME, 'ctl00$ContentPlaceHolder1$LoginControl$m_password')
# The "_"-separated identifier matches the page's id pattern (names use "$"),
# so the login button is looked up by id rather than by name.
form = chrome.find_element(By.ID, 'ctl00_ContentPlaceHolder1_LoginControl_LoginButton')

# your_username_string / your_password_string are placeholders for real credentials.
user.send_keys(your_username_string)
pswd.send_keys(your_password_string)
form.click()  # hit the login button