Create a web scraper with a loop to search links
I am trying to create a web scraper that returns restaurant names and addresses from a website. In the current version it returns only the names (as a test), and it prints the growing list on every iteration:

[{'name': 'Copernicus Restaurant | Copernicus Hotel'}]
[{'name': 'Copernicus Restaurant | Copernicus Hotel'}, {'name': 'Farina Restaurant'}]
[{'name': 'Copernicus Restaurant | Copernicus Hotel'}, {'name': 'Farina Restaurant'}, {'name': 'Cyrano de Bergerac'}]

Could someone help me correct this code so that it collects the link to each restaurant and then extracts the restaurant name and address from those links? Any help would be greatly appreciated.

My code:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

driver_service = Service(executable_path="C:/webdrivers/chromedriver.exe")
driver = webdriver.Chrome(service=driver_service)

baseurl = 'https://restaurantguru.com/'
driver.get('https://restaurantguru.com/restaurant-Poland-t1')

soup = BeautifulSoup(driver.page_source, 'lxml')
productlist = soup.find_all('div', class_='wrapper_info')
#print(productlist)

productlinks = []
for item in productlist:
    for link in item.find_all('a', href=True):
        productlinks.append(link['href'])
#print(productlinks)

restlist = []
for link in productlinks:
    driver.get(link)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    name = soup.find('h1', class_='notranslate').text.strip()
    # address = soup.find('div', class_='address')
    # try:
    #     website = soup.find('a', href=True)
    # except:
    #     website = 'NULL'
    rest = {
        'name': name,
        # 'website': website,
        # 'address': address
    }
    restlist.append(rest)
    print(restlist)

driver.quit()
My edited code, which results in an error:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time
import csv

#driver_service = Service(executable_path="C:/webdrivers/chromedriver.exe")
#driver = webdriver.Chrome(service=driver_service)
driver = webdriver.Chrome()

baseurl = 'https://restaurantguru.com/'
driver.get('https://restaurantguru.com/restaurant-Gostynin-t1')

soup = BeautifulSoup(driver.page_source, 'lxml')
productlist = soup.find_all('div', class_='wrapper_info')
#print(productlist)

productlinks = []
for item in productlist:
    for link in item.find_all('a', href=True):
        productlinks.append(link['href'])
#print(len(productlinks))

restlist = []
for link in productlinks:
    print('[DEBUG] link:', link)
    driver.get(link)

    print('[DEBUG] soup ...')
    soup = BeautifulSoup(driver.page_source, 'lxml')

    print('[DEBUG] name ...')
    name = soup.find('h1', class_='notranslate').text.strip()
    print(name)
    name = driver.find_element(By.XPATH, '//h1[@class="notranslate"]').text.strip()
    print(name)

    print('[DEBUG] address ...')
    address = soup.find('div', class_='address').find('div', class_=False).text.strip()
    print(address)
    address = driver.find_element(By.XPATH, '//div[@class="address"]/div[2]').text.strip()
    print(address)

    print('[DEBUG] website ...')
    try:
        website = soup.find('div', class_='website').find('a').text  #get('href')
        print(website)
        website = driver.find_element(By.XPATH, '//div[@class="website"]//a').text  #get('href')
        print(website)
    except Exception as ex:
        website = ''

    rest = {
        'name': name,
        'website': website,
        'address': address,
    }
    restlist.append(rest)
    print(restlist)

#df = pd.DataFrame(restlist)
#df.to_csv('C:/webdrivers/restauracje.csv')
#print(df.head(10))

driver.quit()
There are many a tags with an href, so you have to use a more specific selector to get the website. The website sits inside <div class="website">, so you could do

website = soup.find('div', class_='website').find('a').get('href')

but the real link to the restaurant's site is in the text, not in the href:

website = soup.find('div', class_='website').find('a').text
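Here is a minimal sketch of that difference on a simplified, hypothetical snippet modeled on the markup described above (the redirect href is an assumption for illustration):

from bs4 import BeautifulSoup

# Hypothetical markup: the href points at an internal redirect,
# while the visible text holds the real URL.
html = '''
<div class="website">
    <a href="/redirect?id=123">https://www.farina.com.pl/</a>
</div>
'''
soup = BeautifulSoup(html, 'lxml')

link = soup.find('div', class_='website').find('a')
print(link.get('href'))  # /redirect?id=123
print(link.text)         # https://www.farina.com.pl/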
As for the address, I also had to add an extra .find('div', class_=False) (and .text.strip()) to get it:

address = soup.find('div', class_='address').find('div', class_=False).text.strip()
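A similar minimal sketch, on hypothetical markup shaped like the address block, shows why class_=False helps: it matches only tags that have no class attribute, skipping the styled siblings:

from bs4 import BeautifulSoup

# Hypothetical markup: only the div holding the street text has no class.
html = '''
<div class="address">
    <div class="title">Address</div>
    <div> Kanonicza 16, Kraków, Lesser Poland Voivodeship, Poland </div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')

address = soup.find('div', class_='address').find('div', class_=False).text.strip()
print(address)  # Kanonicza 16, Kraków, Lesser Poland Voivodeship, Poland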
Selenium has its own methods for finding elements in the HTML, and maybe it will run faster:

name = driver.find_element_by_xpath('//h1[@class="notranslate"]').text.strip()
address = driver.find_element_by_xpath('//div[@class="address"]/div[2]').text.strip()
website = driver.find_element_by_xpath('//div[@class="website"]//a').text  #get('href')
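Note that recent Selenium 4.x releases removed the find_element_by_xpath shortcuts; on a current Selenium the equivalent calls use find_element with By.XPATH, as your edited code already does:

from selenium.webdriver.common.by import By

name = driver.find_element(By.XPATH, '//h1[@class="notranslate"]').text.strip()
address = driver.find_element(By.XPATH, '//div[@class="address"]/div[2]').text.strip()
website = driver.find_element(By.XPATH, '//div[@class="website"]//a').text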
Tested with Firefox on Linux. In the code I kept both methods, soup.find and driver.find_element_by_xpath:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import csv  # needed for csv.DictWriter at the end

#driver_service = Service(executable_path="C:/webdrivers/chromedriver.exe")
#driver = webdriver.Chrome(service=driver_service)

try:
    driver = webdriver.Firefox()
    driver.get('https://restaurantguru.com/restaurant-Poland-t1')

    soup = BeautifulSoup(driver.page_source, 'lxml')
    productlist = soup.find_all('div', class_='wrapper_info')
    #print(productlist)

    print('[DEBUG] productlist ...')
    productlinks = []
    for item in productlist:
        for link in item.find_all('a', href=True):
            productlinks.append(link['href'])
    print('len(productlinks):', len(productlinks))

    restlist = []
    for link in productlinks:
        print('[DEBUG] link:', link)
        driver.get(link)

        print('[DEBUG] soup ...')
        soup = BeautifulSoup(driver.page_source, 'lxml')

        print('[DEBUG] name ...')
        name = soup.find('h1', class_='notranslate').text.strip()
        print(name)
        name = driver.find_element_by_xpath('//h1[@class="notranslate"]').text.strip()
        print(name)

        print('[DEBUG] address ...')
        address = soup.find('div', class_='address').find('div', class_=False).text.strip()
        print(address)
        address = driver.find_element_by_xpath('//div[@class="address"]/div[2]').text.strip()
        print(address)

        print('[DEBUG] website ...')
        try:
            website = soup.find('div', class_='website').find('a').text  #get('href')
            print(website)
            website = driver.find_element_by_xpath('//div[@class="website"]//a').text  #get('href')
            print(website)
        except Exception as ex:
            print('[DEBUG] Exception:', ex)
            website = ''
            print(website)

        rest = {
            'name': name,
            'website': website,
            'address': address,
        }
        print('[DEBUG] rest ...')
        print(rest)
        print('-----')
        restlist.append(rest)

    # --- after `for`-loop ---
    print(restlist)

except KeyboardInterrupt:
    print("KeyboardInterrupt")
finally:
    driver.quit()

# open the file only once, after scraping
with open('output.csv', 'w') as f:
    csv_writer = csv.DictWriter(f, fieldnames=['name', 'website', 'address'])
    csv_writer.writeheader()
    csv_writer.writerows(restlist)
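One caveat if you run this on Windows: the csv module documentation recommends opening the file with newline='' so that blank lines do not appear between rows, and an explicit encoding keeps the Polish characters intact. A minimal variant of the writing step:

import csv

# newline='' is recommended by the csv docs (avoids blank lines on Windows);
# utf-8 preserves the Polish characters in names and addresses.
with open('output.csv', 'w', newline='', encoding='utf-8') as f:
    csv_writer = csv.DictWriter(f, fieldnames=['name', 'website', 'address'])
    csv_writer.writeheader()
    csv_writer.writerows(restlist)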
Result (from print(restlist)):
[
{'name': 'Copernicus Restaurant | Copernicus Hotel', 'website': 'https://www.likusrestauracje.pl/', 'address': 'Kanonicza 16, Kraków, Lesser Poland Voivodeship, Poland'},
{'name': 'Farina Restaurant', 'website': 'https://www.farina.com.pl/', 'address': 'Świętego Marka 16, Kraków, Lesser Poland Voivodeship, Poland'},
{'name': 'Cyrano de Bergerac', 'website': 'http://cyranodebergerac.com.pl', 'address': 'Sławkowska 26, Kraków, Lesser Poland Voivodeship, Poland'},
{'name': 'Amarylis Restaurant', 'website': 'https://www.queenhotel.pl/', 'address': 'Józefa Dietla 60, Kraków, Lesser Poland Voivodeship, Poland'},
{'name': 'Projekt Nano', 'website': '', 'address': 'Podmurna 17 A, Torun, Kuyavian-Pomeranian Voivodeship, Poland'},
{'name': 'Raffles Europejski Warsaw', 'website': 'https://www.raffles.com/warsaw', 'address': 'Nowy Świat-Uniwersytet'},
{'name': 'Caffe Horst', 'website': 'http://www.caffehorst.pl/', 'address': 'Świętochłowicka 6, Bytom, Silesian Voivodeship, Poland'},
{'name': 'Proza', 'website': '', 'address': 'Jana Karola Chodkiewicza 7, Rzeszow, Podkarpackie Voivodeship, Poland'},
{'name': 'Il Posto di Luca Santarossa', 'website': 'http://www.ilposto.pl', 'address': 'Jana Sawy 5/lokal 10, Lublin, Lublin Voivodeship, Poland'},
{'name': 'Balkan Bistro Prespa', 'website': '', 'address': 'Władysława Syrokomli 8, Warsaw, Masovian Voivodeship, Poland'},
{'name': 'Mr Coffee', 'website': '', 'address': 'Tumska 4, Klodzko, Lower Silesian Voivodeship, Poland'},
{'name': 'Bottiglieria 1881 Restaurant', 'website': 'https://www.1881.com.pl/', 'address': 'Bocheńska 5, Kraków, Lesser Poland Voivodeship, Poland'},
{'name': 'Albertina Restaurant & Wine', 'website': 'https://www.albertinarestaurant.pl/', 'address': 'Dominikańska 3, Kraków, Lesser Poland Voivodeship, Poland'},
{'name': 'Pub & Restauracja „W Sercu Łodzi”', 'website': '', 'address': 'al. Marszałka Józefa Piłsudskiego 138, Łódź, Łódź Voivodeship, Poland'},
{'name': '#Alternatywnie', 'website': 'http://www.altcoffee.pl/', 'address': 'aleja Wojska Polskiego 35/u3, Szczecin, West Pomeranian Voivodeship, Poland'},
{'name': 'Aqua e Vino', 'website': 'http://www.aquaevino.pl', 'address': 'Wiślna 5/10, Kraków, Lesser Poland Voivodeship, Poland'},
{'name': 'Pili Pili Gdańsk', 'website': 'http://www.pilipilicafe.com/', 'address': 'Szafarnia 11/U14, Gdańsk, Pomeranian Voivodeship, Poland'},
{'name': 'Kawiarnia Coffeinna', 'website': '', 'address': '1 Maja 26, Jastrzębie-Zdrój, Silesian Voivodeship, Poland'},
{'name': 'Mleczarnia', 'website': 'http://www.mle.pl', 'address': 'Rabina, Beera Meiselsa 20, Kraków, Lesser Poland Voivodeship, Poland'},
{'name': 'La Squadra Ristorante', 'website': 'http://lasquadra.pl/restauracja/', 'address': 'Bocheńskiego 109, Katowice, Silesian Voivodeship, Poland'}
]