How can I input a list of words in a loop that can be added to the url to get the results
I take the url as input: url = "https://www.amazon.in/s?k=headphones&page=1"
This works fine but stops at page 19.
Instead of breaking at page 19, I want the loop to take the next input as "https://www.amazon.in/s?k=" +
- "speakers&page=1"
- "earbuds&page=1"
and so on, and keep running.
from bs4 import BeautifulSoup as soup
import pandas as pd
import urllib.request

data = []

def getdata(url):
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    req = urllib.request.Request(url, headers=header)
    amazon_html = urllib.request.urlopen(req).read()
    a_soup = soup(amazon_html, 'html.parser')
    for e in a_soup.select('div[data-component-type="s-search-result"]'):
        try:
            title = e.find('h2').text
        except AttributeError:
            # no <h2> in this result block
            title = None
        data.append({
            'title': title
        })
    return a_soup
def getnextpage(a_soup):
    page = a_soup.find('a', attrs={"class": 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})
    page = page['href']
    url = 'http://www.amazon.in' + str(page)
    return url
while True:
    geturl = getdata(url)
    url = getnextpage(geturl)
    if not url:
        break
    print(url)

output = pd.DataFrame(data)
output
This code returns the right results, but instead of my giving it a new url each time, I want it to take a list of items that can be appended to the end of the url, fetching the results one keyword at a time so they can be added to the DataFrame.
Note: the search results stop at page 19.
Make a list of your keywords, iterate over it, and include the while loop in each iteration.
keywords = ['speakers', 'earbuds']

for k in keywords:
    url = 'https://www.amazon.in/s?k=' + k
    while True:
        geturl = getdata(url)
        url = getnextpage(geturl)
        if not url:
            break
        print(url)
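One caveat: plain string concatenation only works for single-word keywords. If a keyword contains spaces, it should be URL-encoded first, for example with urllib.parse.quote_plus from the standard library. A minimal sketch (the multi-word keyword here is a hypothetical example, not from the question):

from urllib.parse import quote_plus

keywords = ['speakers', 'bluetooth speakers']  # 'bluetooth speakers' is a hypothetical multi-word keyword

for k in keywords:
    # quote_plus turns spaces into '+' and escapes unsafe characters
    url = 'https://www.amazon.in/s?k=' + quote_plus(k)
    print(url)  # e.g. https://www.amazon.in/s?k=bluetooth+speakers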
Note that Amazon does not like this kind of automated access to its pages and can recognize the access pattern quickly. To lower the request frequency a little, you should at least include some delay with time.sleep(). Of course, it would be even better to use the official API.
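For instance, a small randomized pause between page requests could look like the sketch below (the 2-5 second range is an arbitrary choice, not a value Amazon documents):

import time
import random

for k in keywords:
    url = 'https://www.amazon.in/s?k=' + k
    while True:
        geturl = getdata(url)
        url = getnextpage(geturl)
        if not url:
            break
        # pause 2-5 seconds between requests to space out the access pattern
        time.sleep(random.uniform(2, 5))
        print(url)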
Example
from bs4 import BeautifulSoup as soup
import pandas as pd
import urllib.request

data = []

def getdata(url):
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    req = urllib.request.Request(url, headers=header)
    amazon_html = urllib.request.urlopen(req).read()
    a_soup = soup(amazon_html, 'html.parser')
    for e in a_soup.select('div[data-component-type="s-search-result"]'):
        try:
            title = e.find('h2').text
        except AttributeError:
            # no <h2> in this result block
            title = None
        data.append({
            'title': title,
            'url': 'http://www.amazon.in' + e.h2.a['href']
        })
    return a_soup

def getnextpage(a_soup):
    try:
        # the "next" pagination link is absent on the last results page
        page = a_soup.find('a', attrs={"class": 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})['href']
        url = 'http://www.amazon.in' + str(page)
    except TypeError:
        # find() returned None, so there is no next page
        url = None
    return url
keywords = ['speakers', 'earbuds']

for k in keywords:
    url = 'https://www.amazon.in/s?k=' + k
    while True:
        geturl = getdata(url)
        url = getnextpage(geturl)
        if not url:
            break
        print(url)
Output (print)
http://www.amazon.in/s?k=speakers&page=2&qid=1649420352&ref=sr_pg_1
...
http://www.amazon.in/s?k=speakers&page=20&qid=1649420373&ref=sr_pg_19
http://www.amazon.in/s?k=earbuds&page=2&qid=1649420375&ref=sr_pg_1
...
http://www.amazon.in/s?k=earbuds&page=20&qid=1649420394&ref=sr_pg_19
Output (pd.DataFrame(data))
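To build the DataFrame from the collected rows and keep the results, one option is a sketch like this (the filename results.csv is an arbitrary choice):

output = pd.DataFrame(data)
print(output.head())                        # inspect the first few scraped rows
output.to_csv('results.csv', index=False)   # persist the results to disk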