Python Scrapy 爬虫没有跟随正确的链接
Python Scrapy Spider Not Following Correct Link
我正在尝试从这个帖子中抓取数据。但是,我在抓取评论时遇到了问题。评论的分页由 URL 末尾的“page=1”参数决定。我注意到如果使用“page=0”,它会将所有评论加载到同一页上,这非常好。然而,无论如何设置,我的 Scrapy 脚本都只会抓取第一页的评论。即使我将链接改为“page=2”,它仍然只会抓取第一页的评论。我不明白为什么会出现这个问题。
import scrapy
from scrapy.crawler import CrawlerProcess
class IdeaSpider(scrapy.Spider):
    """Scrape one games2gether idea page: the post, its status and its comments.

    Every extracted value is appended, in page order, to the module-level
    ``temp_list``; nothing is yielded to the Scrapy item pipeline.

    NOTE(review): the comment list is reportedly rendered client-side by
    JavaScript, in which case the ``page`` query parameter has no effect on
    the HTML that plain Scrapy receives -- confirm against the raw response
    (a JS-rendering backend such as Splash would then be required).
    """

    name = "IdeaSpider"

    # XPath of the text nodes of the comment at a 1-based position. A named
    # placeholder is used instead of str.replace("1", ...) so that no other
    # "1" that may later appear in the path can be rewritten by accident.
    _COMMENT_BODY_XPATH = (
        '//div[@class = "post-list-comments"]/g2g-comments-item[{position}]'
        '/article[@class = "post-list-item clearfix two-columns ng-star-inserted"]'
        '/div/div//div[@class ="post-list-item-message-content post-content '
        'ng-star-inserted"]//text()'
    )

    def start_requests(self):
        """Yield the single request for the idea page, handled by ``parse_idea``."""
        yield scrapy.Request(
            url="https://www.games2gether.com/amplitude-studios/endless-space-2/ideas/"
                "1850-force-infinite-actions-to-the-bottom-of-the-queue?page=0",
            callback=self.parse_idea,
        )

    @staticmethod
    def _merge_text_nodes(nodes):
        """Collapse a list of extracted text nodes into one value.

        Returns ``None`` when ``nodes`` is empty, the node itself when there is
        exactly one, and otherwise every node prefixed with a space and
        concatenated (the leading space is kept so existing output is
        unchanged).
        """
        if not nodes:
            return None
        if len(nodes) == 1:
            return nodes[0]
        return "".join(" " + node for node in nodes)

    def _comment_text_nodes(self, response, position):
        """Return the extracted text nodes of the comment at 1-based ``position``."""
        xpath = self._COMMENT_BODY_XPATH.format(position=position)
        return response.xpath(xpath).extract()

    # parses title, post, status, author, date
    def parse_idea(self, response):
        """Append the post fields and all comment authors/bodies to ``temp_list``.

        Order of appended values: post author, categories, date, title, body,
        status, developer name, developer comment, then for each comment its
        author followed by its body (when one is found). Fields whose XPath
        matches nothing are recorded as ``None`` (status: ``"no status"``)
        instead of raising ``IndexError``.
        """
        temp_list.append(
            response.xpath('//span[@class = "username-content"]/text()').extract_first()
        )

        categories = response.xpath(
            '//a[@class = "list-tags-item ng-star-inserted"]/text()'
        ).extract()
        if len(categories) > 1:
            # Each category keeps its trailing ", " to match the existing output.
            temp_list.append("".join(category + ", " for category in categories))
        else:
            # Zero matches used to raise IndexError; record None instead.
            temp_list.append(categories[0] if categories else None)

        temp_list.append(
            response.xpath('//div[@class = "time-date"]/text()').extract_first()
        )
        # extract_first() yields None for a missing title rather than crashing.
        temp_list.append(
            response.xpath('//h1[@class = "title"]/text()').extract_first()
        )

        post_body_nodes = response.xpath(
            '//article[@class = "post-list-item clearfix ng-star-inserted"]'
            '//div[@class = "post-list-item-message-content post-content '
            'ng-star-inserted"]//text()'
        ).extract()
        temp_list.append(self._merge_text_nodes(post_body_nodes))

        temp_list.append(
            response.xpath('//p[@class = "status-title"][1]/text()').extract_first(
                default="no status"
            )
        )

        temp_list.append(
            response.xpath(
                '//div[@class = "ideas-details-status-comment user-role u-bdcolor-2 dev"]'
                '//p[@class = "username user-role-username"]/text()'
            ).extract_first()
        )
        temp_list.append(
            response.xpath(
                '//div[@class = "message post-content ng-star-inserted"]/p/text()'
            ).extract_first()
        )

        comment_authors = response.xpath(
            '//article[@class = "post-list-item clearfix two-columns '
            'ng-star-inserted"]//span[@class = "username-content"]/text()'
        )
        position = 1  # 1-based index of the next g2g-comments-item to read
        for author in comment_authors:
            temp_list.append(author.extract())

            body_nodes = self._comment_text_nodes(response, position)
            if not body_nodes:
                # An empty slot is skipped once and the following one is tried.
                position += 1
                body_nodes = self._comment_text_nodes(response, position)
            if body_nodes:
                # Single-node bodies found on the retry are recorded too, so
                # authors and bodies stay aligned.
                temp_list.append(self._merge_text_nodes(body_nodes))
                position += 1
# Shared sink that IdeaSpider.parse_idea appends every scraped value to.
temp_list = []
# Not referenced anywhere in the visible code.
all_post_data = []

# Run the spider in-process. Per Scrapy's CrawlerProcess documentation,
# start() blocks until crawling finishes, so the print below runs afterwards.
process = CrawlerProcess()
process.crawl(IdeaSpider)
process.start()
print(temp_list)
这是因为评论内容是通过 JavaScript 加载的,而 Scrapy 不会渲染 JavaScript。你可以使用 Splash 来渲染页面。
我正在尝试从这个帖子中抓取数据。但是,我在抓取评论时遇到了问题。评论的分页由 URL 末尾的“page=1”参数决定。我注意到如果使用“page=0”,它会将所有评论加载到同一页上,这非常好。然而,无论如何设置,我的 Scrapy 脚本都只会抓取第一页的评论。即使我将链接改为“page=2”,它仍然只会抓取第一页的评论。我不明白为什么会出现这个问题。
import scrapy
from scrapy.crawler import CrawlerProcess
class IdeaSpider(scrapy.Spider):
    """Scrape one games2gether idea page: the post, its status and its comments.

    Every extracted value is appended, in page order, to the module-level
    ``temp_list``; nothing is yielded to the Scrapy item pipeline.

    NOTE(review): the comment list is reportedly rendered client-side by
    JavaScript, in which case the ``page`` query parameter has no effect on
    the HTML that plain Scrapy receives -- confirm against the raw response
    (a JS-rendering backend such as Splash would then be required).
    """

    name = "IdeaSpider"

    # XPath of the text nodes of the comment at a 1-based position. A named
    # placeholder is used instead of str.replace("1", ...) so that no other
    # "1" that may later appear in the path can be rewritten by accident.
    _COMMENT_BODY_XPATH = (
        '//div[@class = "post-list-comments"]/g2g-comments-item[{position}]'
        '/article[@class = "post-list-item clearfix two-columns ng-star-inserted"]'
        '/div/div//div[@class ="post-list-item-message-content post-content '
        'ng-star-inserted"]//text()'
    )

    def start_requests(self):
        """Yield the single request for the idea page, handled by ``parse_idea``."""
        yield scrapy.Request(
            url="https://www.games2gether.com/amplitude-studios/endless-space-2/ideas/"
                "1850-force-infinite-actions-to-the-bottom-of-the-queue?page=0",
            callback=self.parse_idea,
        )

    @staticmethod
    def _merge_text_nodes(nodes):
        """Collapse a list of extracted text nodes into one value.

        Returns ``None`` when ``nodes`` is empty, the node itself when there is
        exactly one, and otherwise every node prefixed with a space and
        concatenated (the leading space is kept so existing output is
        unchanged).
        """
        if not nodes:
            return None
        if len(nodes) == 1:
            return nodes[0]
        return "".join(" " + node for node in nodes)

    def _comment_text_nodes(self, response, position):
        """Return the extracted text nodes of the comment at 1-based ``position``."""
        xpath = self._COMMENT_BODY_XPATH.format(position=position)
        return response.xpath(xpath).extract()

    # parses title, post, status, author, date
    def parse_idea(self, response):
        """Append the post fields and all comment authors/bodies to ``temp_list``.

        Order of appended values: post author, categories, date, title, body,
        status, developer name, developer comment, then for each comment its
        author followed by its body (when one is found). Fields whose XPath
        matches nothing are recorded as ``None`` (status: ``"no status"``)
        instead of raising ``IndexError``.
        """
        temp_list.append(
            response.xpath('//span[@class = "username-content"]/text()').extract_first()
        )

        categories = response.xpath(
            '//a[@class = "list-tags-item ng-star-inserted"]/text()'
        ).extract()
        if len(categories) > 1:
            # Each category keeps its trailing ", " to match the existing output.
            temp_list.append("".join(category + ", " for category in categories))
        else:
            # Zero matches used to raise IndexError; record None instead.
            temp_list.append(categories[0] if categories else None)

        temp_list.append(
            response.xpath('//div[@class = "time-date"]/text()').extract_first()
        )
        # extract_first() yields None for a missing title rather than crashing.
        temp_list.append(
            response.xpath('//h1[@class = "title"]/text()').extract_first()
        )

        post_body_nodes = response.xpath(
            '//article[@class = "post-list-item clearfix ng-star-inserted"]'
            '//div[@class = "post-list-item-message-content post-content '
            'ng-star-inserted"]//text()'
        ).extract()
        temp_list.append(self._merge_text_nodes(post_body_nodes))

        temp_list.append(
            response.xpath('//p[@class = "status-title"][1]/text()').extract_first(
                default="no status"
            )
        )

        temp_list.append(
            response.xpath(
                '//div[@class = "ideas-details-status-comment user-role u-bdcolor-2 dev"]'
                '//p[@class = "username user-role-username"]/text()'
            ).extract_first()
        )
        temp_list.append(
            response.xpath(
                '//div[@class = "message post-content ng-star-inserted"]/p/text()'
            ).extract_first()
        )

        comment_authors = response.xpath(
            '//article[@class = "post-list-item clearfix two-columns '
            'ng-star-inserted"]//span[@class = "username-content"]/text()'
        )
        position = 1  # 1-based index of the next g2g-comments-item to read
        for author in comment_authors:
            temp_list.append(author.extract())

            body_nodes = self._comment_text_nodes(response, position)
            if not body_nodes:
                # An empty slot is skipped once and the following one is tried.
                position += 1
                body_nodes = self._comment_text_nodes(response, position)
            if body_nodes:
                # Single-node bodies found on the retry are recorded too, so
                # authors and bodies stay aligned.
                temp_list.append(self._merge_text_nodes(body_nodes))
                position += 1
# Shared sink that IdeaSpider.parse_idea appends every scraped value to.
temp_list = []
# Not referenced anywhere in the visible code.
all_post_data = []

# Run the spider in-process. Per Scrapy's CrawlerProcess documentation,
# start() blocks until crawling finishes, so the print below runs afterwards.
process = CrawlerProcess()
process.crawl(IdeaSpider)
process.start()
print(temp_list)
这是因为评论内容是通过 JavaScript 加载的,而 Scrapy 不会渲染 JavaScript。你可以使用 Splash 来渲染页面。