How to keep iterating through next pages in Python using BeautifulSoup
The following code does the basic parsing of the first page. It gets all the articles, but it also picks up the link to the next page.
Looking at the structure of this site, the next-page link looks like https://slow-communication.jp/news/?pg=2.
import re
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

main_url = 'https://slow-communication.jp'
req = Request(main_url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
soup = BeautifulSoup(webpage, "lxml")

for link in soup.findAll('a'):
    _link = str(link.get('href'))
    if '/news/' in _link:
        artice_id = _link.split("/news/")[-1]
        if len(artice_id) > 0:
            print(_link)
With this code I get
https://slow-communication.jp/news/3589/
https://slow-communication.jp/news/3575/
https://slow-communication.jp/news/3546/
https://slow-communication.jp/news/?pg=2
But what I want to do is keep each article link and then move on to the next page.
So I would keep
https://slow-communication.jp/news/3589/
https://slow-communication.jp/news/3575/
https://slow-communication.jp/news/3546/
then go to https://slow-communication.jp/news/?pg=2 and keep doing the same thing until the site has no next page.
How can I do that?
You can paginate with a for loop and the range function, using the format method to insert the page number into the URL. You can increase or decrease the page range as needed.
import re
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

main_url = 'https://slow-communication.jp/news/?pg={page}'

for page in range(1, 11):
    req = Request(main_url.format(page=page), headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, "lxml")
    for link in soup.findAll('a'):
        _link = str(link.get('href'))
        if '/news/' in _link:
            artice_id = _link.split("/news/")[-1]
            if len(artice_id) > 0:
                print(_link)
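Note that range(1, 11) hard-codes an upper bound of 10 pages. If you would rather stop automatically when the site runs out of pages, a sketch like the following could work; it assumes that a page number past the end either returns an empty listing or an HTTP error, which you should verify for this site:

import itertools
from urllib.error import HTTPError
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

main_url = 'https://slow-communication.jp/news/?pg={page}'

for page in itertools.count(1):  # 1, 2, 3, ... until we run out of pages
    req = Request(main_url.format(page=page), headers={'User-Agent': 'Mozilla/5.0'})
    try:
        webpage = urlopen(req).read()
    except HTTPError:
        break  # a page number past the end answered with an error
    soup = BeautifulSoup(webpage, "lxml")
    # keep only links to individual articles such as /news/3589/
    article_links = [a['href'] for a in soup.select('a[href*="/news/"]')
                     if a['href'].rstrip('/').split('/')[-1].isdigit()]
    if not article_links:
        break  # an empty listing page also means we are done
    for link in article_links:
        print(link)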
You can set how many pages to scrape. If there is no next page, it returns all the news articles it has found so far.
import requests
from bs4 import BeautifulSoup

LINK = "https://slow-communication.jp/news/"


def get_news(link, pages=1, news=[]):
    if pages == 0:
        return news
    res = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'})
    if res.status_code == 200:
        print("getting posts from", link)
        posts, link = extract_news_and_link(res.text)
        news.extend(posts)
        if link:
            return get_news(link, pages - 1, news)
        return news
    else:
        print("error getting news")
        return news


def extract_news_and_link(html):
    soup = BeautifulSoup(html, "html.parser")
    news = [post.get("href") for post in soup.select(".post-arc")]
    # the next-page link may be missing on the last page, so guard against None
    next_link = soup.select_one("main > a")
    if next_link:
        return news, next_link.get("href")
    return news, None


def main():
    news = get_news(LINK, 10)
    print("Posts:")
    for post in news:
        print(post)


if __name__ == "__main__":
    main()
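A small caveat about get_news: the mutable default argument news=[] is created once and then reused, so calling get_news a second time in the same process would keep appending to the list from the first call. The snippet below is a standalone illustration of that pitfall and the usual fix; the function names are made up purely for the demo:

def collect(item, acc=[]):          # pitfall: the default list is created only once
    acc.append(item)
    return acc

print(collect("a"))                 # ['a']
print(collect("b"))                 # ['a', 'b']  <- still the same list

def collect_safe(item, acc=None):   # fix: default to None, build a fresh list per call
    if acc is None:
        acc = []
    acc.append(item)
    return acc

print(collect_safe("a"))            # ['a']
print(collect_safe("b"))            # ['b']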
You can use a while loop to move to each next page and break when no next page is available:
while True:
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, "lxml")

    ### perform some action

    if soup.select_one('a[href*="?pg="]'):
        url = soup.select_one('a[href*="?pg="]')['href']
        print(url)
    else:
        break
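One thing to double-check with the selector a[href*="?pg="]: select_one returns the first matching link, and on page 2 and later a listing could expose both a "previous" and a "next" pagination link, in which case the loop might bounce back to an earlier page. Whether that actually happens on this site I cannot tell from the answer alone; a defensive variant that remembers visited pages could look like this:

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

url = 'https://slow-communication.jp'
seen = set()                       # pages already fetched

while url and url not in seen:
    seen.add(url)
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(urlopen(req).read(), "lxml")

    ### perform some action

    # take the last pagination link; the assumption here is that the "next"
    # link appears after the "previous" link in the markup
    pagination = soup.select('a[href*="?pg="]')
    url = pagination[-1]['href'] if pagination else None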
You can also collect some data along the way and store it in a structured way in a global list:
for a in soup.select('a.post-arc'):
    data.append({
        'title': a.h2.text,
        'url': a['href']
    })
Example
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import pandas as pd

main_url = 'https://slow-communication.jp'
url = main_url
data = []

while True:
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, "lxml")

    for a in soup.select('a.post-arc'):
        data.append({
            'title': a.h2.text,
            'url': a['href']
        })

    if soup.select_one('a[href*="?pg="]'):
        url = soup.select_one('a[href*="?pg="]')['href']
        print(url)
    else:
        break

pd.DataFrame(data)
Output

|   | title | url |
|---|---|---|
| 0 | 都立高校から「ブラック校則」が なくなる | https://slow-communication.jp/news/3589/ |
| 1 | 北京パラリンピックが おわった | https://slow-communication.jp/news/3575/ |
| 2 | 「優生保護法で手術された人に 国は おわびのお金を払え」という判決が出た | https://slow-communication.jp/news/3546/ |
| 3 | ロシアが ウクライナを 攻撃している | https://slow-communication.jp/news/3535/ |
| 4 | 東京都が「同性パートナーシップ制度」を作る | https://slow-communication.jp/news/3517/ |
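If you want to keep what you scraped, the DataFrame from the example above can be written to disk; the file name here is only a placeholder:

import pandas as pd

# `data` is the list of dicts built in the while loop above
df = pd.DataFrame(data)
df.to_csv('news_links.csv', index=False)
print(df.head())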