Python HTML Parser Pagination
I'm new to Python and have gotten this far with an HTML parser, but I still can't work out how to make pagination work for the reviews at the bottom of the page on this site.
The URL is in the PasteBin code; I've left it out of this thread for privacy reasons.
Any help is greatly appreciated.
# Reviews Scrape
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'EXAMPLE.COM'
# opening up connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# HTML Parsing
page_soup = soup(page_html, "html.parser")
# Grabs each review
reviews = page_soup.findAll("div",{"class":"jdgm-rev jdgm-divider-top"})
filename = "compreviews.csv"
f = open(filename, "w")
headers = "Score, Title, Content\n"
f.write(headers)
# HTML lookup location per website; strips spacing
for container in reviews:
    # score = container.div.div.span["data-score"]
    score = container.findAll("span", {"data-score": True})
    user_score = score[0].text.strip()
    title_review = container.findAll("b", {"class": "jdgm-rev__title"})
    user_title = title_review[0].text.strip()
    content_review = container.findAll("div", {"class": "jdgm-rev__body"})
    user_content = content_review[0].text.strip()
    print("user_score: " + score[0]['data-score'])
    print("user_title: " + user_title)
    print("user_content: " + user_content)
    f.write(score[0]['data-score'] + "," + user_title + "," + user_content + "\n")
f.close()
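One note on the snippet above: building CSV rows by string concatenation produces malformed output as soon as a title or review body itself contains a comma. A minimal sketch of the same write loop using the standard-library csv module, which quotes fields automatically (it reuses the reviews list from the code above):
import csv
with open("compreviews.csv", "w", newline="", encoding="utf-8") as out:
    writer = csv.writer(out)
    writer.writerow(["Score", "Title", "Content"])
    for container in reviews:
        score = container.find("span", {"data-score": True})["data-score"]
        title = container.find("b", {"class": "jdgm-rev__title"}).text.strip()
        body = container.find("div", {"class": "jdgm-rev__body"}).text.strip()
        writer.writerow([score, title, body])  # csv module handles quoting/escaping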
The page does an XHR GET request with a query string to fetch results. That query string carries parameters for reviews per page and page number. You can make an initial request (the maximum reviews per page appears to be 31), extract the html from the returned json, and grab the page count from it; then write a loop that runs over all the result pages. Example construct below:
import requests
from bs4 import BeautifulSoup as bs
start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'
with requests.Session() as s:
    r = s.get(start_url).json()
    soup = bs(r['html'], 'lxml')
    print([i.text for i in soup.select('.jdgm-rev__author')])
    print([i.text for i in soup.select('.jdgm-rev__title')])
    total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])
    for page in range(2, total_pages + 1):
        r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
        soup = bs(r['html'], 'lxml')
        print([i.text for i in soup.select('.jdgm-rev__author')])
        print([i.text for i in soup.select('.jdgm-rev__title')]) #etc
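As a variant, requests can also assemble the query string from a params dict instead of interpolating the page number into the URL by hand, which keeps the loop tidier and handles URL encoding. A sketch under the same assumptions (the base URL and product_id are placeholders as above, and the parameters shown are assumed to be ordinary query-string parameters):
import requests
from bs4 import BeautifulSoup as bs
base_url = 'https://urlpart'  # placeholder endpoint, as in the answer above
params = {'page': 1, 'per_page': 31, 'product_id': 'someid'}
with requests.Session() as s:
    r = s.get(base_url, params=params).json()
    soup = bs(r['html'], 'lxml')
    total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])
    for page in range(2, total_pages + 1):
        params['page'] = page  # only the page number changes between requests
        r = s.get(base_url, params=params).json()
        soup = bs(r['html'], 'lxml')
        print([i.text for i in soup.select('.jdgm-rev__author')])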
Example writing a dataframe to csv:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'
authors = []
titles = []
with requests.Session() as s:
    r = s.get(start_url).json()
    soup = bs(r['html'], 'lxml')
    authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
    titles.extend([i.text for i in soup.select('.jdgm-rev__title')])
    total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])
    for page in range(2, total_pages + 1):
        r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
        soup = bs(r['html'], 'lxml')
        authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
        titles.extend([i.text for i in soup.select('.jdgm-rev__title')]) #etc
headers = ['Author', 'Title']
df = pd.DataFrame(zip(authors, titles), columns=headers)
df.to_csv(r'C:\Users\User\Desktop\data.csv', sep=',', encoding='utf-8', index=False)
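The same pattern extends to any other field in the review markup; for example, the numeric rating sits in a data-score attribute (per the question's HTML), and collecting each page's rows as dicts keeps the columns aligned without parallel lists. A sketch, continuing from the soup and pandas import above:
rows = []
for rev in soup.select('.jdgm-rev'):
    rows.append({
        'Author': rev.select_one('.jdgm-rev__author').text.strip(),
        'Title': rev.select_one('.jdgm-rev__title').text.strip(),
        'Score': rev.select_one('span[data-score]')['data-score'],
    })
df = pd.DataFrame(rows)  # columns inferred from the dict keys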