Go to next page on showthread.php with scrapy
I'm new to scrapy. For about four days now I've been stuck on moving to the next page while scraping showthread.php (a vBulletin-based forum).
My target: http://forum.femaledaily.com/showthread.php?359-Hair-Smoothing
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from femaledaily.items import FemaledailyItem

class Femaledaily(scrapy.Spider):
    name = "femaledaily"
    allowed_domains = ["femaledaily.com"]
    start_urls = [
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page2",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page3",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page4",
    ]

    def parse(self, response):
        for thd in response.css("tbody > tr"):
            print "==========NEW THREAD======"
            url = thd.xpath('.//div[@class="threadlist-title"]/a/@href').extract()
            url[0] = "http://forum.femaledaily.com/" + url[0]
            print url[0]
            yield scrapy.Request(url[0], callback=self.parse_thread)

    def parse_thread(self, response):
        for page in response.xpath('//ol[@id="posts"]/li'):
            item = FemaledailyItem()
            item['thread_title'] = response.selector.xpath('//span[@class="threadtitle"]/a/text()').extract()
            # item['thread_starter'] = response.selector.xpath('//div[@class="username_container"]/a/text()').extract_first()
            post_creator = page.xpath('.//div[@class="username_container"]/a/text()').extract()
            if not post_creator:
                item['post_creator'] = page.xpath('.//div[@class="username_container"]/a/span/text()').extract()
            else:
                item['post_creator'] = post_creator
            item['post_content'] = ""
            cot = page.xpath(".//blockquote[@class='postcontent restore ']/text()").extract()
            for ct in cot:
                item['post_content'] += ct.replace('\t', '').replace('\n', '')
            yield item
I'm able to get the first 10 posts of each thread, but I don't know how to move to the next page. Any ideas?
A slight change to your code so that it paginates properly:
import scrapy
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from femaledaily.items import FemaledailyItem

class Femaledaily(scrapy.Spider):
    name = "femaledaily"
    allowed_domains = ["femaledaily.com"]
    BASE_URL = "http://forum.femaledaily.com/"
    start_urls = [
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page2",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page3",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page4",
    ]

    def parse(self, response):
        for thd in response.css("tbody > tr"):
            print "==========NEW THREAD======"
            url = thd.xpath('.//div[@class="threadlist-title"]/a/@href').extract()
            url = "http://forum.femaledaily.com/" + url[0]
            yield scrapy.Request(url, callback=self.parse_thread)

        # pagination: follow the forum listing's "next" arrow
        next_page = response.xpath('//li[@class="prev_next"]/a[@rel="next"]/@href').extract()
        if next_page:
            yield Request(self.BASE_URL + next_page[0], callback=self.parse)
        else:
            return

    def parse_thread(self, response):
        for page in response.xpath('//ol[@id="posts"]/li'):
            item = FemaledailyItem()
            item['thread_title'] = response.selector.xpath('//span[@class="threadtitle"]/a/text()').extract()
            # item['thread_starter'] = response.selector.xpath('//div[@class="username_container"]/a/text()').extract_first()
            post_creator = page.xpath('.//div[@class="username_container"]/a/text()').extract()
            if not post_creator:
                item['post_creator'] = page.xpath('.//div[@class="username_container"]/a/span/text()').extract()
            else:
                item['post_creator'] = post_creator
            item['post_content'] = ""
            cot = page.xpath(".//blockquote[@class='postcontent restore ']/text()").extract()
            for ct in cot:
                item['post_content'] += ct.replace('\t', '').replace('\n', '')
            yield item

        # pagination: follow the thread's "next" arrow until it disappears
        next_page = response.xpath('//li[@class="prev_next"]/a[@rel="next"]/@href').extract()
        if next_page:
            yield Request(self.BASE_URL + next_page[0], callback=self.parse_thread)
        else:
            return
Here the link to the next page (i.e. the single forward-arrow link) is extracted first, a request is made to that next-page URL, and the callback is set to the same function the request came from. When the last page is reached, the next-page link is no longer present and the crawl stops.
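As a side note, here is a minimal sketch of the same pagination step using response.urljoin, which resolves the (possibly relative) href against the current page URL instead of prepending a hard-coded BASE_URL. The spider name and start URL below are placeholders, and the XPath is the same "forward arrow" selector used above, assumed to match the forum's markup:

import scrapy

class PaginationSketch(scrapy.Spider):
    # Hypothetical spider used only to illustrate the pagination step.
    name = "pagination_sketch"
    start_urls = ["http://forum.femaledaily.com/showthread.php?359-Hair-Smoothing"]

    def parse(self, response):
        # ... extract items from the current page here ...

        # Same "next arrow" selector as above (assumed to match the forum's markup).
        next_page = response.xpath('//li[@class="prev_next"]/a[@rel="next"]/@href').extract_first()
        if next_page:
            # urljoin resolves relative and absolute hrefs against the page URL.
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

On Scrapy 1.4 and later, response.follow(next_page, callback=self.parse) performs the join for you.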