I am scraping an HTML table and it shows me the error 'AttributeError: 'NoneType' object has no attribute 'select''. Please help me solve it.
import requests
from bs4 import BeautifulSoup
import pandas as pd
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}
r = requests.get("https://capitalonebank2.bluematrix.com/sellside/Disclosures.action")
soup = BeautifulSoup(r.content, "lxml")
table = soup.find('table',attrs={'style':"border"})
all_data = []
for row in table.select("tr:has(td)"):
    tds = [td.get_text(strip=True) for td in row.select("td")]
    all_data.append(tds)
df = pd.DataFrame(all_data, columns=header)
print(df)
The website you are trying to scrape appears to be blocking requests sent by the requests library, which is why soup.find(...) returns None and the subsequent .select() call raises the AttributeError.
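As a quick check (a minimal sketch, reusing the URL from the question), you can verify that find() matched nothing before ever calling .select():

import requests
from bs4 import BeautifulSoup

r = requests.get("https://capitalonebank2.bluematrix.com/sellside/Disclosures.action")
print(r.status_code)  # a blocked request usually does not return the normal page
soup = BeautifulSoup(r.content, "lxml")
table = soup.find("table", attrs={"style": "border"})
print(table)  # None -- so table.select(...) can only raise the AttributeError

To work around the blocking, I used the Selenium library to automate a real browser. The code below collects the titles listed in the table: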
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
browser = webdriver.Chrome()
browser.get("https://capitalonebank2.bluematrix.com/sellside/Disclosures.action")
soup = BeautifulSoup(browser.page_source, "lxml")
all_data = [i.text.strip() for i in soup.select("option")]
df = pd.DataFrame(all_data, columns=["Titles"])
print(df)
Output:
Titles
0 Agree Realty Corporation (ADC)
1 American Campus Communities, Inc. (ACC)
2 Antero Midstream Corporation (AM)
3 Antero Resources Corporation (AR)
4 Apache Corp. (APA)
.. ...
126 W. P. Carey Inc. (WPC)
127 Washington Real Estate Investment Trust (WRE)
128 Welltower Inc. (WELL)
129 Western Midstream Partners, LP (WES)
130 Whiting Petroleum Corporation (WLL)
If you haven't used Selenium before, don't forget to install chromedriver.exe and add it to your PATH environment variable. You can also pass the driver's location to the constructor manually.
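For example (a minimal sketch assuming Selenium 4; the path below is a placeholder for wherever your chromedriver binary lives):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# replace the placeholder path with the actual location of your chromedriver
service = Service("C:/tools/chromedriver.exe")
browser = webdriver.Chrome(service=service)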
Updated code to extract the additional information:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

browser = webdriver.Chrome()
browser.get("https://capitalonebank2.bluematrix.com/sellside/Disclosures.action")

for title in browser.find_elements(By.CSS_SELECTOR, "option"):
    # selecting an option loads that company's disclosure page
    title.click()
    time.sleep(1)
    # the ratings table sits inside the second iframe on the page
    browser.switch_to.frame(browser.find_elements(By.CSS_SELECTOR, "iframe")[1])
    table = browser.find_element(By.CSS_SELECTOR, "table table")
    soup = BeautifulSoup(table.get_attribute("innerHTML"), "lxml")
    ratings = {"BUY": [], "HOLD": [], "SELL": []}
    lists_ = []
    # the last three data rows hold the BUY/HOLD/SELL distribution
    for row in soup.select("tr")[-4:-1]:
        info_list = row.select("td")
        count = info_list[1].text
        percent = info_list[2].text
        IBServ_count = info_list[4].text
        IBServ_percent = info_list[5].text
        lists_.append([count, percent, IBServ_count, IBServ_percent])
    ratings["BUY"] = lists_[0]
    ratings["HOLD"] = lists_[1]
    ratings["SELL"] = lists_[2]
    print(ratings)
    # return to the top-level document before clicking the next option
    browser.switch_to.default_content()
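If you prefer the same tabular view as the first snippet, a small follow-up sketch (assuming the ratings dict built in the loop above, one per company) turns it into a pandas DataFrame:

import pandas as pd

# `ratings` maps each rating to [count, percent, IBServ count, IBServ percent]
df = pd.DataFrame.from_dict(
    ratings,
    orient="index",
    columns=["Count", "Percent", "IBServ Count", "IBServ Percent"],
)
print(df)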