AttributeError: 'NoneType' object has no attribute 'find_all' (BeautifulSoup) (ChromeDriver)
I'm trying to run some code, but in the loop over the pages I get the error AttributeError: 'NoneType' object has no attribute 'find_all'. I suspect the site's JS is detecting that I'm using ChromeDriver and blocking the request to the page.
Any suggestions on what to do?
cards = []
pages = 5

for i in range(pages):
    url = 'https://rj.olx.com.br/rio-de-janeiro-e-regiao/imoveis/aluguel?o=' + str(i + 1)
    driver.get(url)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    anuncios = soup.find('div', {'class' : 'section_OLXad-list'}).find_all('li', class_="item")

    for anuncio in anuncios:
        card = {}
        card['value'] = get_text(anuncio.find('p', {'class': "OLXad-list-price"}))
        card['location'] = get_text(anuncio.find('p', class_="detail-region"))
        card['metragem'] = get_text(anuncio.find('p', class_="detail-specific"))
        card['link'] = get_link(anuncio.find('a', href=True))

        if len(card['value']):
            cards.append(card)

dataset = pd.DataFrame(cards)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
/tmp/ipykernel_11539/2840841130.py in <module>
      7 
      8     soup = BeautifulSoup(driver.page_source, 'html.parser')
----> 9     anuncios = soup.find('div', {'class' : 'section_OLXad-list'}).find_all('li', class_="item")
     10 
     11     for anuncio in anuncios:

AttributeError: 'NoneType' object has no attribute 'find_all'
Full code:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

def get_text(bs_tag):
    if bs_tag:
        return bs_tag.get_text().strip().replace('\n', '').replace('\t', '')
    else:
        return ''

def get_link(bs_tag):
    if bs_tag:
        return bs_tag['href']
    else:
        return ''

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(chromedriver, options=options)
driver.implicitly_wait(5)

cards = []
pages = 5

for i in range(pages):
    url = 'https://rj.olx.com.br/rio-de-janeiro-e-regiao/imoveis/aluguel?o=' + str(i + 1)
    driver.get(url)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    anuncios = soup.find('div', {'class' : 'section_OLXad-list'}).find_all('li', class_="item")

    for anuncio in anuncios:
        card = {}
        card['value'] = get_text(anuncio.find('p', {'class': "OLXad-list-price"}))
        card['location'] = get_text(anuncio.find('p', class_="detail-region"))
        card['metragem'] = get_text(anuncio.find('p', class_="detail-specific"))
        card['link'] = get_link(anuncio.find('a', href=True))

        if len(card['value']):
            cards.append(card)

dataset = pd.DataFrame(cards)
anuncios = soup.find('div', {'class' : 'section_OLXad-list'}).find_all('li', class_="item")

The soup.find() call didn't match anything, so it returned None. You are therefore effectively writing this:

anuncios = None.find_all('li', class_="item")

which of course cannot work.
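If you do want to stay with Selenium, a minimal None-safe version of that line (just a sketch reusing the original selectors, which may no longer match the site's current markup) would look like this:

container = soup.find('div', {'class': 'section_OLXad-list'})
if container is None:
    # Selector matched nothing: either the markup changed, or the page
    # that came back was not the listing page (e.g. a block/captcha page).
    anuncios = []
else:
    anuncios = container.find_all('li', class_='item')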
You don't have to use Selenium for this at all; all the data is stored in a script tag and can easily be scraped like this:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
final = []

for page in range(1, 5):  # pages 1-4; widen the range for more pages
    url = f'https://rj.olx.com.br/rio-de-janeiro-e-regiao/imoveis/aluguel?o={page}'
    landing_page = requests.get(url, headers=headers)
    print(f'Scraping page {page}')
    soup = BeautifulSoup(landing_page.text, 'html.parser')

    # The listing data sits as an HTML-escaped JSON blob in the
    # data-json attribute of the <script id="initial-data"> tag.
    dirty = soup.find('script', {'id': 'initial-data'})['data-json']
    clean = json.loads(dirty.replace('&quot;', '"'))
    data = clean['listingProps']['adList']

    for listing in data:
        try:
            listing.pop('images')  # clean up csv
            listing.pop('properties')
        except KeyError:
            continue  # some listings don't have images/properties
        final.append(listing)

df = pd.DataFrame(final)
df.to_csv('output.csv', index=False)
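One caveat: the initial-data script tag and the listingProps / adList keys are internals of the OLX page and can change without notice. A small defensive check (a sketch, reusing the names from the snippet above) gives a clearer failure than a raw TypeError or KeyError:

tag = soup.find('script', {'id': 'initial-data'})
if tag is None or 'data-json' not in tag.attrs:
    # Layout changed or the request was blocked; fail loudly.
    raise RuntimeError(f'initial-data blob not found on {url}')
dirty = tag['data-json']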