我无法使用常用的网络抓取工具从网站上抓取 table
I cannot scrape a table from a website with usual web scraping tools
我正在尝试从 Python 的网站上抓取 table,但由于某种原因,我所有已知的方法都失败了。有一个 table 在
https://www.nbc4i.com/news/state-news/535-new-cases-of-covid-19-reported-in-ohio-schools-in-past-week/
有 45 页。我尝试使用以下方法来抓取它:requests、requests-html(呈现它)、BeautifulSoup 和 selenium。这是我的代码之一,我不会在这里复制我尝试过的所有代码,方法是相似的,只是使用不同的 Python 库:
from requests_html import HTMLSession
from bs4 import BeautifulSoup

# Fetch the article page and render its JavaScript before parsing.
sess = HTMLSession()
resp = sess.get('https://www.nbc4i.com/news/state-news/535-new-cases-of-covid-19-reported-in-ohio-schools-in-past-week/')
resp.html.render(timeout=120)

# Parse the rendered document (same result with resp.text / 'html.parser').
soup = BeautifulSoup(resp.content, 'lxml')
table = soup.find_all(id='table')
我的 table 变量在这里是一个空列表,但它不应该是空的。我也尝试用 selenium 查找 table 中的任何其他 Web 元素,并尝试通过 class、xpath 定位,但都找不到 table 或它的任何部分。我之前用同样的方法抓取过一些类似的网站,从未遇到过问题。
有什么想法吗?
table 内容在一个 iframe 中,需要先切换到该 iframe 才能定位其中的元素。参见 Selenium 的 switch_to.frame API 文档。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

url = 'https://www.nbc4i.com/news/state-news/535-new-cases-of-covid-19-reported-in-ohio-schools-in-past-week/'

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)
try:
    driver.get(url)
    driver.implicitly_wait(5)
    # The table lives inside an iframe; switch into it before locating
    # any of the table's elements.
    frame = driver.find_element(By.XPATH,
                                '//div[@class="ns-block-custom-html"]/div/iframe')
    driver.switch_to.frame(frame)
    while True:
        # Collect one list of cell texts per visible row.
        grid = driver.find_element(By.ID, "table")
        for row in grid.find_elements(By.CLASS_NAME, "body-row"):
            cells = [cell.text for cell in row.find_elements(By.CLASS_NAME, "td")]
            # add code to append each of row of data to CSV file, database, etc.
            print(cells)
        # Stop once the "next" button is disabled (last page reached);
        # otherwise advance to the next page of rows.
        next_btn = driver.find_element(By.CLASS_NAME, 'next')
        if 'disabled' in next_btn.get_attribute('class'):
            break
        next_btn.click()
finally:
    driver.quit()
输出:
['1', 'Delaware', 'Olentangy Local', 'Public District', '38', '31', '7']
...
['446', 'Muskingum', 'West Muskingum Local', 'Public District', '1', '1', '0']
您会看到结果 table 在 iframe 中。您可以直接从 iframe 的源中提取信息:
这里是将结果保存到 .csv 文件的代码:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
def get_rows(driver):
    """Scrape the table rows visible on the current page of the embed.

    Waits up to 20 s for at least one body row to be present, then reads
    the text of every cell of every row.

    Args:
        driver: a selenium WebDriver already navigated to the Flourish
            visualisation embed page.

    Returns:
        dict: column name -> list of cell texts, one entry per row.
    """
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='tr body-row']")))
    rows = driver.find_elements(By.XPATH, "//div[@class='tr body-row']")
    columns = ['Rank', 'County', 'School/District', 'Type',
               'Total cases', 'Student cases', 'Staff cases']
    table_info = {name: [] for name in columns}
    for row in rows:
        cells = row.find_elements(By.CLASS_NAME, 'td')
        # Pair each cell with its column positionally. The original
        # `for col, index in enumerate(table_info)` swapped the meaning of
        # its own names (col was the position, index was the key) and
        # raised IndexError on any row with fewer than 7 cells; zip stops
        # cleanly at the shorter of the two.
        for name, cell in zip(columns, cells):
            table_info[name].append(cell.text)
    return table_info
# Path to chrome driver -- raw string so '\c' is not parsed as an escape
# sequence (the original "D:\chromedriver\..." relied on an invalid escape).
# NOTE(review): the positional executable_path argument was removed in
# Selenium 4.10; newer installs need webdriver.Chrome(service=Service(path)).
driver = webdriver.Chrome(r"D:\chromedriver\chromedriver.exe")
driver.get("https://flo.uri.sh/visualisation/3894531/embed?auto=1")

# First page, then click "next" 44 times for the remaining pages (45 total).
df = pd.DataFrame.from_dict(get_rows(driver))
for _ in range(44):
    # NOTE(review): get_rows() only waits for *presence* of a body row,
    # which the previous page's rows already satisfy -- if rendering after
    # the click is slow, a page may be read twice; consider waiting for
    # staleness of an old row before re-reading.
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(
        (By.XPATH, '//button[@class="pagination-btn next"]'))).click()
    df = pd.concat([df, pd.DataFrame.from_dict(get_rows(driver))])

print(df)
df.to_csv('COVID-19_cases_reported_in_Ohio_schools.csv', index=False)
我正在尝试从 Python 的网站上抓取 table,但由于某种原因,我所有已知的方法都失败了。有一个 table 在 https://www.nbc4i.com/news/state-news/535-new-cases-of-covid-19-reported-in-ohio-schools-in-past-week/ 有 45 页。我尝试使用以下方法来抓取它:requests、requests-html(呈现它)、BeautifulSoup 和 selenium。这是我的代码之一,我不会在这里复制我尝试过的所有代码,方法是相似的,只是使用不同的 Python 库:
from requests_html import HTMLSession
from bs4 import BeautifulSoup

# Fetch the article page and render its JavaScript before parsing.
sess = HTMLSession()
resp = sess.get('https://www.nbc4i.com/news/state-news/535-new-cases-of-covid-19-reported-in-ohio-schools-in-past-week/')
resp.html.render(timeout=120)

# Parse the rendered document (same result with resp.text / 'html.parser').
soup = BeautifulSoup(resp.content, 'lxml')
table = soup.find_all(id='table')
我的 table 变量在这里是一个空列表,但它不应该是空的。我也尝试用 selenium 查找 table 中的任何其他 Web 元素,并尝试通过 class、xpath 定位,但都找不到 table 或它的任何部分。我之前用同样的方法抓取过一些类似的网站,从未遇到过问题。 有什么想法吗?
table 内容在一个 iframe 中,需要先切换到该 iframe 才能定位其中的元素。参见 Selenium 的 switch_to.frame API 文档。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

url = 'https://www.nbc4i.com/news/state-news/535-new-cases-of-covid-19-reported-in-ohio-schools-in-past-week/'

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)
try:
    driver.get(url)
    driver.implicitly_wait(5)
    # The table lives inside an iframe; switch into it before locating
    # any of the table's elements.
    frame = driver.find_element(By.XPATH,
                                '//div[@class="ns-block-custom-html"]/div/iframe')
    driver.switch_to.frame(frame)
    while True:
        # Collect one list of cell texts per visible row.
        grid = driver.find_element(By.ID, "table")
        for row in grid.find_elements(By.CLASS_NAME, "body-row"):
            cells = [cell.text for cell in row.find_elements(By.CLASS_NAME, "td")]
            # add code to append each of row of data to CSV file, database, etc.
            print(cells)
        # Stop once the "next" button is disabled (last page reached);
        # otherwise advance to the next page of rows.
        next_btn = driver.find_element(By.CLASS_NAME, 'next')
        if 'disabled' in next_btn.get_attribute('class'):
            break
        next_btn.click()
finally:
    driver.quit()
输出:
['1', 'Delaware', 'Olentangy Local', 'Public District', '38', '31', '7']
...
['446', 'Muskingum', 'West Muskingum Local', 'Public District', '1', '1', '0']
您会看到结果 table 在 iframe 中。您可以直接从 iframe 的源中提取信息:
这里是将结果保存到 .csv 文件的代码:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
def get_rows(driver):
    """Scrape the table rows visible on the current page of the embed.

    Waits up to 20 s for at least one body row to be present, then reads
    the text of every cell of every row.

    Args:
        driver: a selenium WebDriver already navigated to the Flourish
            visualisation embed page.

    Returns:
        dict: column name -> list of cell texts, one entry per row.
    """
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='tr body-row']")))
    rows = driver.find_elements(By.XPATH, "//div[@class='tr body-row']")
    columns = ['Rank', 'County', 'School/District', 'Type',
               'Total cases', 'Student cases', 'Staff cases']
    table_info = {name: [] for name in columns}
    for row in rows:
        cells = row.find_elements(By.CLASS_NAME, 'td')
        # Pair each cell with its column positionally. The original
        # `for col, index in enumerate(table_info)` swapped the meaning of
        # its own names (col was the position, index was the key) and
        # raised IndexError on any row with fewer than 7 cells; zip stops
        # cleanly at the shorter of the two.
        for name, cell in zip(columns, cells):
            table_info[name].append(cell.text)
    return table_info
# Path to chrome driver -- raw string so '\c' is not parsed as an escape
# sequence (the original "D:\chromedriver\..." relied on an invalid escape).
# NOTE(review): the positional executable_path argument was removed in
# Selenium 4.10; newer installs need webdriver.Chrome(service=Service(path)).
driver = webdriver.Chrome(r"D:\chromedriver\chromedriver.exe")
driver.get("https://flo.uri.sh/visualisation/3894531/embed?auto=1")

# First page, then click "next" 44 times for the remaining pages (45 total).
df = pd.DataFrame.from_dict(get_rows(driver))
for _ in range(44):
    # NOTE(review): get_rows() only waits for *presence* of a body row,
    # which the previous page's rows already satisfy -- if rendering after
    # the click is slow, a page may be read twice; consider waiting for
    # staleness of an old row before re-reading.
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(
        (By.XPATH, '//button[@class="pagination-btn next"]'))).click()
    df = pd.concat([df, pd.DataFrame.from_dict(get_rows(driver))])

print(df)
df.to_csv('COVID-19_cases_reported_in_Ohio_schools.csv', index=False)