How to use Scrapy to crawl data on the second level of a page
I want to use a Scrapy spider to fetch the data (question title + content and answers) from all posts on the following site:
The problem is that I don't know how to make it follow the post links first and then crawl the data of all 15 posts/site.
import scrapy

class ArticleSpider(scrapy.Spider):
    name = "post"
    start_urls = ['https://forums.att.com/t5/Data-Messaging-Features-Internet/Throttling-for-unlimited-data/m-p/4805201#M73235']

    def parse(self, response):
        SET_SELECTOR = 'body'
        for post in response.css(SET_SELECTOR):
            # Selectors for title, content and answer
            TITLE_SELECTOR = '.lia-message-subject h5 ::text'
            CONTENT_SELECTOR = '.lia-message-body-content'
            ANSWER_SELECTOR = '.lia-message-body-content'
            yield {
                # [0].extract() = extract_first()
                'Qtitle': post.css(TITLE_SELECTOR)[0].extract(),
                'Qcontent': post.css(CONTENT_SELECTOR)[0].extract(),
                'Answer': post.css(ANSWER_SELECTOR)[1].extract(),
            }

        # Running through all 173 pages
        NEXT_PAGE_SELECTOR = '.lia-paging-page-next a ::attr(href)'
        next_page = response.css(NEXT_PAGE_SELECTOR).extract_first()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse
            )
I hope you can help me. Thanks in advance!
You need to add a method that scrapes the post content. You could rewrite your spider like this (I used XPath selectors):
# -*- coding: utf-8 -*-
import scrapy

class ArticleSpider(scrapy.Spider):
    name = "post"
    start_urls = ['https://forums.att.com/t5/custom/page/page-id/latest-activity/category-id/Customer_Care/page/1?page-type=latest-solutions-topics']

    def parse(self, response):
        # Follow each post link found on the overview page.
        for post_link in response.xpath('//h2//a/@href').extract():
            link = response.urljoin(post_link)
            yield scrapy.Request(link, callback=self.parse_post)

        # Check if the main page has a link to a next page; if True, keep parsing.
        next_page = response.xpath('(//a[@rel="next"])[1]/@href').extract_first()
        if next_page:
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_post(self, response):
        # Scrape title and content from the post.
        for post in response.xpath('//div[contains(@class, "lia-quilt-forum-message")]'):
            item = dict()
            item['title'] = post.xpath('.//h5/text()').extract_first()
            item['content'] = post.xpath('.//div[@class="lia-message-body-content"]//text()').extract()
            yield item

        # If the post page has a link to a next page, keep parsing.
        next_page = response.xpath('(//a[@rel="next"])[1]/@href').extract_first()
        if next_page:
            yield scrapy.Request(next_page, callback=self.parse_post)
This code parses all the post links from the main page and calls the parse_post method to scrape the content of each post. Both the parse and the parse_post methods check whether there is a next-page link and, if True, keep crawling.
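A usage note beyond the original answer: because content is extracted with //text(), each item carries a list of raw text nodes rather than one clean string. Here is a minimal sketch of a drop-in replacement for the parse_post method above (assuming the same class names on the forum pages) that joins and strips those nodes:

def parse_post(self, response):
    # Same selectors as in the answer above, but the list of text nodes
    # is collapsed into one whitespace-normalized string per post.
    for post in response.xpath('//div[contains(@class, "lia-quilt-forum-message")]'):
        texts = post.xpath('.//div[@class="lia-message-body-content"]//text()').extract()
        yield {
            'title': post.xpath('.//h5/text()').extract_first(),
            'content': ' '.join(t.strip() for t in texts if t.strip()),
        }

    # Pagination handling stays exactly as in the original answer.
    next_page = response.xpath('(//a[@rel="next"])[1]/@href').extract_first()
    if next_page:
        yield scrapy.Request(next_page, callback=self.parse_post)

Either way, assuming you save the spider as post_spider.py (a file name chosen here for illustration), running scrapy runspider post_spider.py -o posts.jl executes it without a full Scrapy project and writes one JSON object per scraped item to posts.jl.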