Scrape additional pages of listing data from realtor.com
I have the code below. It uses Beautiful Soup to pull house listing data from realtor.com by zip code. The code pulls the first 47 or so listings but nothing from the following pages. For example, for the test case below for zip code 94016, there are around 2,000 listings. There is a 'Next' button at the bottom of the page, and I figure I could add some code to click it with Selenium. Does anyone know a smarter way to pull all of the listings?
Code:
from bs4 import BeautifulSoup
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/88.0.4324.150 Safari/537.36',
           'Accept-Encoding': 'identity'}

url = 'https://www.realtor.com/realestateandhomes-search/94016'
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'lxml')

price_list = []
for item in soup.select('.component_property-card'):
    try:
        print('**********')
        print(item.select('[data-label=pc-price]')[0].get_text())
        print(item.select('img')[0]['data-src'])
        print(item.select('.summary-wrap')[0].get_text())
        print(item.select('.address')[0].get_text())
        print(item.select('.property-meta')[0].get_text())
        print(item.select('.special-feature-list')[0].get_text())
        price_list.append(item.select('[data-label=pc-price]')[0].get_text())
    except Exception as e:
        # some cards are missing one of the fields; skip them
        print('')
Update:
# first run
import time

import numpy as np
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/88.0.4324.150 Safari/537.36',
           'Accept-Encoding': 'identity'}

# url = 'https://www.realtor.com/realestateandhomes-search/San-Francisco_CA'
url = 'https://www.realtor.com/realestateandhomes-search/94016'
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'lxml')

soup_list = []
i = 2
print(url)
print('length: ' + str(len(soup.select('.component_property-card'))))
print(str(i))

while len(soup.select('.component_property-card')) != 0:
    try:
        # wait between pulling data
        time.sleep(np.random.randint(low=60, high=70, size=1)[0])
        url = 'https://www.realtor.com/realestateandhomes-search/94016' + '/pg-' + str(i)
        print(url)
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')
        print('length: ' + str(len(soup.select('.component_property-card'))))
        i = i + 1
        print(str(i))
        soup_list.append(soup)
    except Exception:
        # on failure, wait and request the same page again
        time.sleep(np.random.randint(low=60, high=70, size=1)[0])
        url = 'https://www.realtor.com/realestateandhomes-search/94016' + '/pg-' + str(i)
        print(url)
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')
        print('length: ' + str(len(soup.select('.component_property-card'))))
        i = i + 1
        print(str(i))
        soup_list.append(soup)
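For reference, the same /pg-N loop can be written more compactly as the untested sketch below. It assumes the /pg-N URL pattern and the '.component_property-card' selector keep working, and simply stops once a page returns no cards instead of repeating the request logic in try/except:
import random
import time

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/88.0.4324.150 Safari/537.36',
           'Accept-Encoding': 'identity'}
base_url = 'https://www.realtor.com/realestateandhomes-search/94016'

soup_list = []
page = 1
while True:
    # page 1 has no /pg- suffix; later pages are /pg-2, /pg-3, ...
    url = base_url if page == 1 else f'{base_url}/pg-{page}'
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'lxml')
    cards = soup.select('.component_property-card')
    print(url, response.status_code, 'cards:', len(cards))
    if not cards:          # empty (or blocked) page -> stop
        break
    soup_list.append(soup)
    page += 1
    time.sleep(random.randint(60, 70))   # polite pause between requests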
Update:
Code:
import os

from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}

url = 'https://www.realtor.com/realestateandhomes-search/94534'

# keep simple and download from https://chromedriver.chromium.org/downloads (match version of Chrome installed)
# put file in same folder as the script
# driver = webdriver.Chrome()
driver = webdriver.Chrome(executable_path=os.path.abspath("chromedriver"))
driver.get(url)
sleep(3)

# dictionary to store page title as key and html scraped as value
pages = {}
soup = BeautifulSoup(driver.page_source, "html.parser")
pages['Page 1'] = soup

for i in range(0, 4):
    driver.find_element_by_xpath('//*[@id="srp-body"]/section[1]/div[2]/div/a[8]').click()
    sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    # DO SOMETHING SMARTER HERE TO FILTER WHAT IS REQUIRED, or just get the whole page...
    # I doubt you want the whole page but for a demo it shows the idea
    # the page title has a '|' and the first part before it is 'Page X' so we could use that
    # split the title, remove whitespace and use the first part as the key for the dictionary
    pages[soup.find('title').text.split('|')[0].strip()] = soup

driver.close()

print(len(pages), '\n')
print(pages.keys(), '\n\n')

for k, v in pages.items():
    # just as a check print the first (i.e., find rather than find_all) house address for each page
    print(v.find('div', class_='jsx-303111361 address ellipsis srp-page-address srp-address-redesign').text, '\n')
Error:
---------------------------------------------------------------------------
NoSuchElementException Traceback (most recent call last)
<ipython-input-3-b50566b8c5f4> in <module>()
45
46 for i in range(0, 4):
---> 47 driver.find_element_by_xpath('//*[@id="srp-body"]/section[1]/div[2]/div/a[8]').click()
48 sleep(5)
49 soup = BeautifulSoup(driver.page_source, "html.parser")
~/anaconda/envs/py36/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py in find_element_by_xpath(self, xpath)
391 element = driver.find_element_by_xpath('//div/td[1]')
392 """
--> 393 return self.find_element(by=By.XPATH, value=xpath)
394
395 def find_elements_by_xpath(self, xpath):
~/anaconda/envs/py36/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py in find_element(self, by, value)
964 return self.execute(Command.FIND_ELEMENT, {
965 'using': by,
--> 966 'value': value})['value']
967
968 def find_elements(self, by=By.ID, value=None):
~/anaconda/envs/py36/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py in execute(self, driver_command, params)
318 response = self.command_executor.execute(driver_command, params)
319 if response:
--> 320 self.error_handler.check_response(response)
321 response['value'] = self._unwrap_value(
322 response.get('value', None))
~/anaconda/envs/py36/lib/python3.6/site-packages/selenium/webdriver/remote/errorhandler.py in check_response(self, response)
240 alert_text = value['alert'].get('text')
241 raise exception_class(message, screen, stacktrace, alert_text)
--> 242 raise exception_class(message, screen, stacktrace)
243
244 def _value_or_default(self, obj, key, default):
NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="srp-body"]/section[1]/div[2]/div/a[8]"}
(Session info: chrome=92.0.4515.159)
(Driver info: chromedriver=2.42.591059 (a3d9684d10d61aa0c45f6723b327283be1ebaad8),platform=Mac OS X 10.13.6 x86_64)
I can't test this because I get a 403 from the site when I ping it. However, if the code above works for you, this should work too: it loops over the pages. Obviously uncomment the parsing once you're ready; as it stands it just pings each page:
from bs4 import BeautifulSoup
import requests
from time import sleep

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/88.0.4324.150 Safari/537.36',
           'Accept-Encoding': 'identity'}

main_url = 'https://www.realtor.com/realestateandhomes-search/94016/'
pages = ['pg-2', 'pg-3', 'pg-4']

# loop pages
for page in pages:
    url = main_url + page
    print(url)
    response = requests.get(url, headers=headers)
    print(response.reason)
    # soup = BeautifulSoup(response.content, 'lxml')
    # for item in soup.select('.component_property-card'):
    #     try:
    #         print('**********')
    #         print(item.select('[data-label=pc-price]')[0].get_text())
    #         print(item.select('img')[0]['data-src'])
    #         print(item.select('.summary-wrap')[0].get_text())
    #         print(item.select('.address')[0].get_text())
    #         print(item.select('.property-meta')[0].get_text())
    #         print(item.select('.special-feature-list')[0].get_text())
    #         price_list.append(item.select('[data-label=pc-price]')[0].get_text())
    #     except Exception:
    #         print('')
    sleep(3)
For me it prints:
https://www.realtor.com/realestateandhomes-search/94016/pg-2
Forbidden
https://www.realtor.com/realestateandhomes-search/94016/pg-3
Forbidden
https://www.realtor.com/realestateandhomes-search/94016/pg-4
Forbidden
It may return fine for you.
Note that I can use the site through a normal browser, so the URLs are valid...
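If plain requests keeps returning 403 for you as well, one thing that sometimes helps (no guarantee; the site may simply block non-browser clients) is reusing a requests.Session and sending a fuller set of browser-like headers. A minimal sketch:
import requests

session = requests.Session()
session.headers.update({
    # assumed, generic browser-like headers; adjust to taste
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/92.0.4515.131 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
    'Referer': 'https://www.realtor.com/',
})

response = session.get('https://www.realtor.com/realestateandhomes-search/94016/pg-2')
print(response.status_code, response.reason)   # still 403? fall back to the Selenium approach below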
Added:
With Selenium, we can loop the pages using the XPath '//*[@id="srp-body"]/section[1]/div[2]/div/a[8]'...
...and scrape the HTML into soup objects.
Try:
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}

url = 'https://www.realtor.com/realestateandhomes-search/94016'

# keep simple and download from https://chromedriver.chromium.org/downloads (match version of Chrome installed)
# put file in same folder as the script
driver = webdriver.Chrome()
driver.get(url)
sleep(3)

# dictionary to store page title as key and html scraped as value
pages = {}
soup = BeautifulSoup(driver.page_source, "html.parser")
pages['Page 1'] = soup

for i in range(0, 4):
    # driver.find_element_by_xpath('//*[@id="srp-body"]/section[1]/div[2]/div/a[8]').click()
    driver.find_element_by_css_selector("a[aria-label^='Go to next page']").click()
    sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    # DO SOMETHING SMARTER HERE TO FILTER WHAT IS REQUIRED, or just get the whole page...
    # I doubt you want the whole page but for a demo it shows the idea
    # the page title has a '|' and the first part before it is 'Page X' so we could use that
    # split the title, remove whitespace and use the first part as the key for the dictionary
    pages[soup.find('title').text.split('|')[0].strip()] = soup

driver.close()

print(len(pages), '\n')
print(pages.keys(), '\n\n')

for k, v in pages.items():
    # just as a check print the first (i.e., find rather than find_all) house address for each page
    print(v.find('div', class_='jsx-303111361 address ellipsis srp-page-address srp-address-redesign').text, '\n')
Output:
5
dict_keys(['Page 1', 'Page 2', 'Page 3', 'Page 4', 'Page 5'])
316 Hazelwood Ave, San Francisco, CA 94127
201 Sansome St Unit 801, San Francisco, CA 94104
360 Guerrero St Apt 219, San Francisco, CA 94103
87 Maynard St, San Francisco, CA 94112
1023 Broadway, San Francisco, CA 94133
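To get from the collected pages back to the fields in the original question, one option (a minimal sketch, assuming the '.component_property-card' selectors from the question still match the Selenium-rendered HTML) is to run the same card-parsing loop over every soup stored in pages:
# Sketch: apply the question's card-parsing loop to every page collected above.
price_list = []
for page_name, page_soup in pages.items():
    for item in page_soup.select('.component_property-card'):
        price = item.select('[data-label=pc-price]')
        address = item.select('.address')
        if price and address:   # skip cards missing either field
            price_list.append(price[0].get_text())
            print(page_name, price[0].get_text(), address[0].get_text())

print('total listings parsed:', len(price_list))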