Python web scraping "IndexError: list index out of range"
Python web scraping "IndexError: list index out of range"
from selenium import webdriver
import csv
import requests
from bs4 import BeautifulSoup
# Scrape restaurant listings from visitnh.gov and write them to find.csv.

# NOTE: this Selenium driver is started but never used below; the page is
# fetched with requests instead. The raw string keeps the backslashes of the
# Windows path from being parsed as escape sequences (a plain "\U..." is a
# syntax error in Python 3).
driver = webdriver.Chrome(executable_path=r"C:\Users\dylan\Documents\chromedriver.exe")
data_list = []
site = requests.get('https://www.visitnh.gov/things-to-do/food-and-drink/restaurants')
# Compare the status code by value; `is` tests object identity.
if site.status_code == 200:
    content = BeautifulSoup(site.content, 'html.parser')
    Resultswrapper = content.find_all('div', attrs={'results-wrapper'})
    for Results in Resultswrapper:
        print("Random phrase: ")
        #print(Results.select(class_='results-wrapper').prettify())
        # select() returns a list, so [0] raises IndexError when the
        # selector matches nothing inside this wrapper.
        BusinessName = Results.select('.item-title.ng-binding')[0].get_text()
        # NOTE(review): Address and PhoneNumber use the same selector and
        # index, so both receive the same text -- confirm the intended
        # selectors.
        Address = Results.select('.ng-binding')[0].get_text()
        PhoneNumber = Results.select('.ng-binding')[0].get_text()
        new_data = {"BusinessName": BusinessName, "Address": Address, "PhoneNumber": PhoneNumber}
        data_list.append(new_data)
        # Format the list/dict into the message; "str" + list is a TypeError.
        print(f"data: {data_list}")
        print(f"new data: {new_data}")
# newline='' lets the csv module control line endings itself.
with open('find.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=["BusinessName", "Address", "PhoneNumber"], delimiter=';')
    writer.writeheader()
    for row in data_list:
        writer.writerow(row)
我在尝试制作一个简单的网络抓取工具时遇到索引超出范围错误,我正在尝试使用 selector 进行循环。我用这个 URL: https://www.visitnh.gov/things-to-do/food-and-drink/restaurants
Traceback (most recent call last):
File "c:/Users/dylan/Documents/Webscrape/web-s.py", line 19, in <module>
BusinessName = Results.select('p.item-title.ng-binding')[0].get_text()
IndexError: list index out of range
我尝试把结果包装器(results-wrapper)换成 HTML 中的其他元素,但还是不行。我也试过修改 select 中的选择器文本,但没有用。有什么想法吗?任何帮助我都将不胜感激。
查看该网站,您可以看到您尝试 select 的元素在文档首次准备就绪时不存在。本网站的餐厅列表是稍后插入到页面中的,这就是为什么您的脚本无法立即找到它的原因。
也许可以在 select 之前使用内置的 time.sleep 函数稍等片刻。
数据是通过 JavaScript 从外部 URL 加载的。你可以使用 requests/json 模块来模拟这个请求:
import json
import requests
# Request the listing data from the service endpoint and print each result's
# name, address and phone number.
url = 'https://www.visitnh.gov/BusinessListingService.asmx/GetUsers'
search_params = {"searchParams": {"ResultsPerPage": 15, "BusinessType": "attraction", "SubcategoryID": 25}}

# A timeout keeps the script from hanging on an unresponsive server, and
# raise_for_status() reports an HTTP error before any decoding is attempted.
response = requests.post(url, json=search_params, timeout=30)
response.raise_for_status()

# The value under 'd' in the JSON response is itself a JSON string, so it is
# decoded a second time with json.loads.
data = json.loads(response.json()['d'])

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

for result in data['Results']:
    print(result['BusinessName'])
    print(result['Address'])
    print(result['Phone'])
    print('-' * 80)
打印:
NazBar & Grill
1086 Weirs Blvd, Laconia, NH 03246
(603) 366-4341
--------------------------------------------------------------------------------
Giant of Siam
5 E Hollis Street, Nashua, NH 03060
(603) 595-2222
--------------------------------------------------------------------------------
Hobbs Tavern & Brewing Company
2415 NH Route 16, Ossipee, NH 03890
(603) 539-2000
--------------------------------------------------------------------------------
...and so on. (Total 164 items.)
from selenium import webdriver
import csv
import requests
from bs4 import BeautifulSoup
# Scrape restaurant listings from visitnh.gov and write them to find.csv.

# NOTE: this Selenium driver is started but never used below; the page is
# fetched with requests instead. The raw string keeps the backslashes of the
# Windows path from being parsed as escape sequences (a plain "\U..." is a
# syntax error in Python 3).
driver = webdriver.Chrome(executable_path=r"C:\Users\dylan\Documents\chromedriver.exe")
data_list = []
site = requests.get('https://www.visitnh.gov/things-to-do/food-and-drink/restaurants')
# Compare the status code by value; `is` tests object identity.
if site.status_code == 200:
    content = BeautifulSoup(site.content, 'html.parser')
    Resultswrapper = content.find_all('div', attrs={'results-wrapper'})
    for Results in Resultswrapper:
        print("Random phrase: ")
        #print(Results.select(class_='results-wrapper').prettify())
        # select() returns a list, so [0] raises IndexError when the
        # selector matches nothing inside this wrapper.
        BusinessName = Results.select('.item-title.ng-binding')[0].get_text()
        # NOTE(review): Address and PhoneNumber use the same selector and
        # index, so both receive the same text -- confirm the intended
        # selectors.
        Address = Results.select('.ng-binding')[0].get_text()
        PhoneNumber = Results.select('.ng-binding')[0].get_text()
        new_data = {"BusinessName": BusinessName, "Address": Address, "PhoneNumber": PhoneNumber}
        data_list.append(new_data)
        # Format the list/dict into the message; "str" + list is a TypeError.
        print(f"data: {data_list}")
        print(f"new data: {new_data}")
# newline='' lets the csv module control line endings itself.
with open('find.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=["BusinessName", "Address", "PhoneNumber"], delimiter=';')
    writer.writeheader()
    for row in data_list:
        writer.writerow(row)
我在尝试制作一个简单的网络抓取工具时遇到索引超出范围错误,我正在尝试使用 selector 进行循环。我用这个 URL: https://www.visitnh.gov/things-to-do/food-and-drink/restaurants
Traceback (most recent call last):
File "c:/Users/dylan/Documents/Webscrape/web-s.py", line 19, in <module>
BusinessName = Results.select('p.item-title.ng-binding')[0].get_text()
IndexError: list index out of range
我尝试把结果包装器(results-wrapper)换成 HTML 中的其他元素,但还是不行。我也试过修改 select 中的选择器文本,但没有用。有什么想法吗?任何帮助我都将不胜感激。
查看该网站,您可以看到您尝试 select 的元素在文档首次准备就绪时不存在。本网站的餐厅列表是稍后插入到页面中的,这就是为什么您的脚本无法立即找到它的原因。
也许可以在 select 之前使用内置的 time.sleep 函数稍等片刻。
数据是通过 JavaScript 从外部 URL 加载的。你可以使用 requests/json 模块来模拟这个请求:
import json
import requests
# Request the listing data from the service endpoint and print each result's
# name, address and phone number.
url = 'https://www.visitnh.gov/BusinessListingService.asmx/GetUsers'
search_params = {"searchParams": {"ResultsPerPage": 15, "BusinessType": "attraction", "SubcategoryID": 25}}

# A timeout keeps the script from hanging on an unresponsive server, and
# raise_for_status() reports an HTTP error before any decoding is attempted.
response = requests.post(url, json=search_params, timeout=30)
response.raise_for_status()

# The value under 'd' in the JSON response is itself a JSON string, so it is
# decoded a second time with json.loads.
data = json.loads(response.json()['d'])

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

for result in data['Results']:
    print(result['BusinessName'])
    print(result['Address'])
    print(result['Phone'])
    print('-' * 80)
打印:
NazBar & Grill
1086 Weirs Blvd, Laconia, NH 03246
(603) 366-4341
--------------------------------------------------------------------------------
Giant of Siam
5 E Hollis Street, Nashua, NH 03060
(603) 595-2222
--------------------------------------------------------------------------------
Hobbs Tavern & Brewing Company
2415 NH Route 16, Ossipee, NH 03890
(603) 539-2000
--------------------------------------------------------------------------------
...and so on. (Total 164 items.)