AttributeError: 'NoneType' object has no attribute 'find_all' (BeautifulSoup) (ChromeDriver)

I'm trying to run some code, but in the loop over the pages I get the error message AttributeError: 'NoneType' object has no attribute 'find_all'. I suspect the site's JavaScript detects that I'm using ChromeDriver and is blocking the request to the page.

Any suggestions on what to do?

cards = []
pages = 5

for i in range(pages):
    url = 'https://rj.olx.com.br/rio-de-janeiro-e-regiao/imoveis/aluguel?o=' + str(i+1)
    driver.get(url)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    anuncios = soup.find('div', {'class' : 'section_OLXad-list'}).find_all('li', class_="item")

    for anuncio in anuncios:
        card = {}

        card['value'] = get_text(anuncio.find('p', {'class':"OLXad-list-price"}))
        card['location'] = get_text(anuncio.find('p', class_="detail-region"))
        card['metragem'] = get_text(anuncio.find('p', class_="detail-specific"))
        card['link'] = get_link(anuncio.find('a', href=True))

        if len(card['value']):
            cards.append(card)

dataset = pd.DataFrame(cards)

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
/tmp/ipykernel_11539/2840841130.py in <module>
      7 
      8     soup = BeautifulSoup(driver.page_source, 'html.parser')
----> 9     anuncios = soup.find('div', {'class' : 'section_OLXad-list'}).find_all('li', class_="item")
     10 
     11     for anuncio in anuncios:

AttributeError: 'NoneType' object has no attribute 'find_all'

Full code

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

def get_text(bs_tag):
    if bs_tag:
        return bs_tag.get_text().strip().replace('\n', '').replace('\t', '')
    else:
        return ''

def get_link(bs_tag):
    if bs_tag:
        return bs_tag['href']
    else:
        return ''

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(chromedriver, options=options)  # chromedriver: path to the ChromeDriver executable (defined elsewhere)

driver.implicitly_wait(5)

cards = []
pages = 5

for i in range(pages):
    url = 'https://rj.olx.com.br/rio-de-janeiro-e-regiao/imoveis/aluguel?o=' + str(i+1)
    driver.get(url)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    anuncios = soup.find('div', {'class' : 'section_OLXad-list'}).find_all('li', class_="item")

    for anuncio in anuncios:
        card = {}

        card['value'] = get_text(anuncio.find('p', {'class':"OLXad-list-price"}))
        card['location'] = get_text(anuncio.find('p', class_="detail-region"))
        card['metragem'] = get_text(anuncio.find('p', class_="detail-specific"))
        card['link'] = get_link(anuncio.find('a', href=True))

        if len(card['value']):
            cards.append(card)

dataset = pd.DataFrame(cards)

anuncios = soup.find('div', {'class' : 'section_OLXad-list'}).find_all('li', class_="item")

The soup.find() call found nothing, so it returned None.

So you are effectively writing this:

anuncios = None.find_all('li', class_="item")

which of course won't work.
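If you do want to stick with Selenium, guard against the None before chaining calls, and give the JavaScript time to render the list. A minimal sketch, assuming the same driver and selectors as above (WebDriverWait and expected_conditions are standard Selenium helpers; the wait raises TimeoutException if the container never appears):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# wait up to 10 seconds for the listing container to be present in the DOM
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.section_OLXad-list'))
)

soup = BeautifulSoup(driver.page_source, 'html.parser')
container = soup.find('div', {'class': 'section_OLXad-list'})
if container is None:
    anuncios = []  # selector no longer matches, or the page was blocked: skip this page
else:
    anuncios = container.find_all('li', class_='item')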

That said, you don't need Selenium for this at all: all the data is embedded in a script tag and can easily be scraped like this:

import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

# send a real browser User-Agent so the request isn't served a blocked/empty page
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}

final = []
for page in range(1, 6):  # pages 1 through 5, matching the original loop

    url = f'https://rj.olx.com.br/rio-de-janeiro-e-regiao/imoveis/aluguel?o={page}'

    landing_page = requests.get(url, headers=headers)
    print(f'Scraping page {page}')
    soup = BeautifulSoup(landing_page.text, 'html.parser')

    # the page ships its data as JSON in the "data-json" attribute of a script tag
    dirty = soup.find('script', {'id': 'initial-data'})['data-json']
    clean = json.loads(dirty.replace('&quot;', '"'))

    data = clean['listingProps']['adList']
    for listing in data:

        # drop the bulky nested fields so the CSV stays flat; pop with a
        # default so listings missing images/properties are still kept
        listing.pop('images', None)
        listing.pop('properties', None)

        final.append(listing)

df = pd.DataFrame(final)
df.to_csv('output.csv', index=False)
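This approach is also much faster than driving a browser, and since ChromeDriver never runs, there is nothing for the site's JavaScript to detect. If you want to see what other fields the embedded JSON exposes before deciding what to keep, a quick sketch (reusing clean and json from the last iteration of the loop above):

# explore the embedded JSON to find other useful fields
print(clean.keys())
print(clean['listingProps'].keys())
print(json.dumps(clean['listingProps']['adList'][0], indent=2)[:500])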