Scrape additional pages of listing data from realtor.com

I have the code below. It uses Beautiful Soup to pull home listing data from realtor.com by zip code. The code below pulls the first 47 listings but nothing from the following pages. For example, for the test case below with zip code 94016, there are around 2,000 listings. At the bottom of the page there is a 'next' button I can click, and I figure I could add some code to click it with Selenium. Does anyone know a cleverer way to pull all of the listings?

Code:

from bs4 import BeautifulSoup
import requests

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) ' \
                                  'AppleWebKit/537.36 (KHTML, like Gecko) ' \
                                  'Chrome/88.0.4324.150 Safari/537.36',
'Accept-Encoding': 'identity'
}
#'Accept-Encoding': 'identity'

url = 'https://www.realtor.com/realestateandhomes-search/94016'

response=requests.get(url,headers=headers)

#print(response.content)

soup=BeautifulSoup(response.content,'lxml')


price_list = []

for item in soup.select('.component_property-card'):
    try:
        print('**********')
        #print(item)
        print(item.select('[data-label=pc-price]')[0].get_text())
        print(item.select('img')[0]['data-src'])
        print(item.select('.summary-wrap')[0].get_text())
        print(item.select('.address')[0].get_text())
        print(item.select('.property-meta')[0].get_text())
        print(item.select('.special-feature-list')[0].get_text())

        price_list.append(item.select('[data-label=pc-price]')[0].get_text())

    except Exception as e:
        #raise e
        print('')
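Since the same per-card extraction will be reused on every results page, it may help to pull it out into a function that returns one dict per listing. This is only a sketch built on the selectors above; the field names are illustrative, not anything realtor.com defines:

def parse_cards(soup):
    """Return a list of dicts, one per property card on the page (same selectors as above)."""
    listings = []
    for item in soup.select('.component_property-card'):
        try:
            listings.append({
                'price': item.select('[data-label=pc-price]')[0].get_text(),
                'image': item.select('img')[0].get('data-src'),
                'summary': item.select('.summary-wrap')[0].get_text(),
                'address': item.select('.address')[0].get_text(),
                'meta': item.select('.property-meta')[0].get_text(),
            })
        except IndexError:
            # skip ad/placeholder cards that are missing one of the fields
            continue
    return listings

all_listings = parse_cards(soup)
print(len(all_listings))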

Update:

# first run
soup_list=[]

from bs4 import BeautifulSoup
import requests
import time
import numpy as np

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) ' \
                                  'AppleWebKit/537.36 (KHTML, like Gecko) ' \
                                  'Chrome/88.0.4324.150 Safari/537.36',
'Accept-Encoding': 'identity'
}
#'Accept-Encoding': 'identity'

# url = 'https://www.realtor.com/realestateandhomes-search/San-Francisco_CA'

url = 'https://www.realtor.com/realestateandhomes-search/94016'

response=requests.get(url,headers=headers)

#print(response.content)

soup=BeautifulSoup(response.content,'lxml')

i=2

print(url)

print('length: '+str(len(soup.select('.component_property-card'))))


print(str(i))




while len(soup.select('.component_property-card')) != 0:

    try:
        # wait between requests so we do not hammer the site
        time.sleep(np.random.randint(low=60, high=70, size=1)[0])

        url = 'https://www.realtor.com/realestateandhomes-search/94016' + '/pg-' + str(i)

        print(url)

        response = requests.get(url, headers=headers)

        #print(response.content)

        soup = BeautifulSoup(response.content, 'lxml')

        print('length: ' + str(len(soup.select('.component_property-card'))))

        i = i + 1

        print(str(i))

        soup_list.append(soup)

    except Exception:
        # if the request or the parsing fails, wait and retry the same page
        continue
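The loop above stores one soup per page in soup_list but never extracts anything from it; once it finishes, the stored pages can be parsed in a second pass. A minimal sketch reusing the selectors from the first code block:

price_list = []

# soup_list holds one BeautifulSoup object per results page
for page_soup in soup_list:
    for item in page_soup.select('.component_property-card'):
        prices = item.select('[data-label=pc-price]')
        if prices:
            price_list.append(prices[0].get_text())

print('total prices collected: ' + str(len(price_list)))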

Update:

Code:

import os

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from time import sleep


driver = webdriver.Chrome(executable_path=os.path.abspath("chromedriver"))


headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}

url = 'https://www.realtor.com/realestateandhomes-search/94534'

# keep simple and download from https://chromedriver.chromium.org/downloads (match version of Chrome installed)
# put file in same folder as the script
# driver = webdriver.Chrome()
driver.get(url)
sleep(3)

# dictionary to store page title as key and html scraped as value
pages = {}

soup = BeautifulSoup(driver.page_source, "html.parser")

pages['Page 1'] = soup

for i in range(0, 4):
    driver.find_element_by_xpath('//*[@id="srp-body"]/section[1]/div[2]/div/a[8]').click()
    sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    
    
    # DO SOMETHING SMARTER HERE TO FILTER WHAT IS REQUIRED, or just get the whole page...
    # I doubt you want the whole page but for a demo it shows the idea
    
    # the page title has a '|' and the first part before it is 'Page X' so we could use that
    # split the title, remove whitepace and use the first part as the key for the dictionary
    pages[soup.find('title').text.split('|')[0].strip()] = soup
    
driver.close()

print(len(pages), '\n')
print(pages.keys(), '\n\n')
for k, v in pages.items():
    # just as a check print the first (i.e., find rather than find_all) house address for each page
    print(v.find('div', class_ =\
                 'jsx-303111361 address ellipsis srp-page-address srp-address-redesign').text, '\n')

Error:

---------------------------------------------------------------------------
NoSuchElementException                    Traceback (most recent call last)
<ipython-input-3-b50566b8c5f4> in <module>()
     45 
     46 for i in range(0, 4):
---> 47     driver.find_element_by_xpath('//*[@id="srp-body"]/section[1]/div[2]/div/a[8]').click()
     48     sleep(5)
     49     soup = BeautifulSoup(driver.page_source, "html.parser")

~/anaconda/envs/py36/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py in find_element_by_xpath(self, xpath)
    391             element = driver.find_element_by_xpath('//div/td[1]')
    392         """
--> 393         return self.find_element(by=By.XPATH, value=xpath)
    394 
    395     def find_elements_by_xpath(self, xpath):

~/anaconda/envs/py36/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py in find_element(self, by, value)
    964         return self.execute(Command.FIND_ELEMENT, {
    965             'using': by,
--> 966             'value': value})['value']
    967 
    968     def find_elements(self, by=By.ID, value=None):

~/anaconda/envs/py36/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py in execute(self, driver_command, params)
    318         response = self.command_executor.execute(driver_command, params)
    319         if response:
--> 320             self.error_handler.check_response(response)
    321             response['value'] = self._unwrap_value(
    322                 response.get('value', None))

~/anaconda/envs/py36/lib/python3.6/site-packages/selenium/webdriver/remote/errorhandler.py in check_response(self, response)
    240                 alert_text = value['alert'].get('text')
    241             raise exception_class(message, screen, stacktrace, alert_text)
--> 242         raise exception_class(message, screen, stacktrace)
    243 
    244     def _value_or_default(self, obj, key, default):

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="srp-body"]/section[1]/div[2]/div/a[8]"}
  (Session info: chrome=92.0.4515.159)
  (Driver info: chromedriver=2.42.591059 (a3d9684d10d61aa0c45f6723b327283be1ebaad8),platform=Mac OS X 10.13.6 x86_64)

I can't test this because I get a 403 from the site when I hit it. However, if the code above works for you, then this should work. It loops over the pages.

Obviously uncomment the parsing code when you're ready; as written it just pings each page:

from bs4 import BeautifulSoup
import requests
from time import sleep

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) ' \
                                  'AppleWebKit/537.36 (KHTML, like Gecko) ' \
                                  'Chrome/88.0.4324.150 Safari/537.36',
'Accept-Encoding': 'identity'
}

main_url = 'https://www.realtor.com/realestateandhomes-search/94016/'

pages = ['pg-2', 'pg-3', 'pg-4']

# loop pages
for page in pages:
    
    url = main_url + page
    
    print(url)

    response=requests.get(url,headers=headers)
    
    print(response.reason)

#     soup=BeautifulSoup(response.content,'lxml')

#     for item in soup.select('.component_property-card'):
#         try:
#             print('**********')
#             #print(item)
#             print(item.select('[data-label=pc-price]')[0].get_text())
#             print(item.select('img')[0]['data-src'])
#             print(item.select('.summary-wrap')[0].get_text())
#             print(item.select('.address')[0].get_text())
#             print(item.select('.property-meta')[0].get_text())
#             print(item.select('.special-feature-list')[0].get_text())

#             price_list.append(item.select('[data-label=pc-price]')[0].get_text())



#         except Exception as e:
#             #raise e
#             print('')
    sleep(3)

For me:

https://www.realtor.com/realestateandhomes-search/94016/pg-2
Forbidden
https://www.realtor.com/realestateandhomes-search/94016/pg-3
Forbidden
https://www.realtor.com/realestateandhomes-search/94016/pg-4
Forbidden

It may return fine for you.

Note that I can use the site through a normal browser, so the URLs are valid...
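If plain requests also returns 403 for you, one thing worth trying (no guarantee it gets past realtor.com's blocking; the extra header values here are just generic browser defaults, not anything site-specific) is reusing a requests.Session with a fuller set of browser-style headers:

import requests

# a session keeps cookies between requests, which some sites expect
session = requests.Session()

# typical browser-style headers; values are generic examples
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/92.0.4515.131 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'identity',
})

for page in ['', 'pg-2', 'pg-3']:
    url = 'https://www.realtor.com/realestateandhomes-search/94016/' + page
    response = session.get(url)
    print(url, response.status_code, response.reason)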


Added:

Using Selenium we can loop the pages by clicking the 'next' button (originally via the xpath '//*[@id="srp-body"]/section[1]/div[2]/div/a[8]'; the code below uses a CSS selector instead, since that xpath was not found above)...

...and scrape the HTML into soup objects.

Try:

from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}

url = 'https://www.realtor.com/realestateandhomes-search/94016'

# keep simple and download from https://chromedriver.chromium.org/downloads (match version of Chrome installed)
# put file in same folder as the script
driver = webdriver.Chrome()
driver.get(url)
sleep(3)

# dictionary to store page title as key and html scraped as value
pages = {}

soup = BeautifulSoup(driver.page_source, "html.parser")

pages['Page 1'] = soup

for i in range(0, 4):
    # driver.find_element_by_xpath('//*[@id="srp-body"]/section[1]/div[2]/div/a[8]').click()
    driver.find_element_by_css_selector("a[aria-label^='Go to next page']").click()
    sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    
    
    # DO SOMETHING SMARTER HERE TO FILTER WHAT IS REQUIRED, or just get the whole page...
    # I doubt you want the whole page but for a demo it shows the idea
    
    # the page title has a '|' and the first part before it is 'Page X' so we could use that
    # split the title, remove whitepace and use the first part as the key for the dictionary
    pages[soup.find('title').text.split('|')[0].strip()] = soup
    
driver.close()

print(len(pages), '\n')
print(pages.keys(), '\n\n')
for k, v in pages.items():
    # just as a check print the first (i.e., find rather than find_all) house address for each page
    print(v.find('div', class_ =\
                 'jsx-303111361 address ellipsis srp-page-address srp-address-redesign').text, '\n')

Output:

5 

dict_keys(['Page 1', 'Page 2', 'Page 3', 'Page 4', 'Page 5']) 


316 Hazelwood Ave, San Francisco, CA 94127 

201 Sansome St Unit 801, San Francisco, CA 94104 

360 Guerrero St Apt 219, San Francisco, CA 94103 

87 Maynard St, San Francisco, CA 94112 

1023 Broadway, San Francisco, CA 94133
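One possible hardening of the loop above (not part of the original answer): the fixed sleep(5) can be replaced with Selenium's explicit waits, and find_elements (plural) returns an empty list instead of raising NoSuchElementException, which gives a clean stopping condition rather than a hard-coded range(0, 4). A minimal sketch using the same Selenium 3 API and the same CSS selector as above:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get('https://www.realtor.com/realestateandhomes-search/94016')

pages = {}
wait = WebDriverWait(driver, 15)
page_num = 1

while True:
    # wait until at least one property card has rendered on the current page
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.component_property-card')))
    pages['Page ' + str(page_num)] = BeautifulSoup(driver.page_source, 'html.parser')

    # find_elements returns an empty list when there is no next-page button
    next_buttons = driver.find_elements_by_css_selector("a[aria-label^='Go to next page']")
    if not next_buttons:
        break

    # keep a handle on a current card so we can wait for the next page to replace it
    old_card = driver.find_element_by_css_selector('.component_property-card')
    next_buttons[0].click()
    wait.until(EC.staleness_of(old_card))
    page_num += 1

driver.close()
print(len(pages), list(pages.keys()))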