Web-scraping code using selenium and beautifulsoup not working properly
I wrote Python code to web-scrape the Sydney Morning Herald. The code first clicks all the "Show more" buttons and then scrapes every article. The Selenium part works fine, but I think there is a problem with the scraping part: after fetching the required fields (date, title and content) for a few articles (5-6), it only returns the date and title, with no content.
import time
import csv
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

base = 'https://www.smh.com.au'

browser = webdriver.Safari(executable_path='/usr/bin/safaridriver')
wait = WebDriverWait(browser, 10)

browser.get('https://www.smh.com.au/search?text=cybersecurity')

while True:
    try:
        time.sleep(2)
        show_more = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, '_3we9i')))
        show_more.click()
    except Exception as e:
        print(e)
        break

soup = BeautifulSoup(browser.page_source, 'lxml')
anchors = soup.find_all('a', {'tabindex': '-1'})

for anchor in anchors:
    browser.get(base + anchor['href'])
    sub_soup = BeautifulSoup(browser.page_source, 'html.parser')

    dateTag = sub_soup.find('time', {'class': '_2_zR-'})
    titleTag = sub_soup.find('h1', {'itemprop': 'headline'})
    contentTag = sub_soup.find_all('div', {'class': '_1665V undefined'})

    date = None
    title = None
    content = None

    if isinstance(dateTag, Tag):
        date = dateTag.get_text().strip()
    if isinstance(titleTag, Tag):
        title = titleTag.get_text().strip()
    if isinstance(contentTag, list):
        content = []
        for c in contentTag:
            content.append(c.get_text().strip())
        content = ' '.join(content)

    print(f'{date}\n {title}\n {content}\n')
    time.sleep(3)

browser.close()
Why does this code stop returning the content part after a few articles? I don't understand.
Thanks.
It is because the message "You've reached your monthly free access limit" is displayed on the page after you have opened a few articles. Once that paywall banner replaces the article body, sub_soup.find_all('div', {'class': '_1665V undefined'}) returns an empty list, so content ends up as an empty string while the date and title are still found.
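A quick way to confirm (and handle) this is to check for the paywall banner before parsing each article page. This is only a minimal sketch of the existing article loop, assuming the banner sentence appears verbatim in the page source once the limit is hit:

for anchor in anchors:
    browser.get(base + anchor['href'])
    page_source = browser.page_source

    # Assumption: the paywall banner contains this exact sentence once the
    # monthly free-article limit has been reached.
    if "You've reached your monthly free access limit" in page_source:
        print('Paywall reached at', base + anchor['href'])
        break  # every subsequent article will most likely be blocked as well

    sub_soup = BeautifulSoup(page_source, 'html.parser')
    # ... extract date, title and content as before ...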