Interpreting callbacks and cb_kwargs with scrapy
I am about to reach a personal milestone with scrapy. The goal is to properly understand callback and cb_kwargs. I have read the documentation countless times, but I learn best by seeing code, practising, and having things explained.
I have a sample spider whose purpose is to scrape the book titles and prices, then follow each book's page and extract a single piece of information from it. I am also trying to figure out how to correctly get the data from the following pages, which I know depends on understanding how the callbacks work.
When I run my script it only yields results for the first page. How do I get the additional pages?
Here is my scraper:
class BooksItem(scrapy.Item):
    items = Field(output_processor = TakeFirst())
    price = Field(output_processor = TakeFirst())
    availability = Field(output_processor = TakeFirst())


class BookSpider(scrapy.Spider):
    name = "books"
    start_urls = ['https://books.toscrape.com']

    def start_request(self):
        for url in self.start_url:
            yield scrapy.Request(
                url,
                callback = self.parse)

    def parse(self, response):
        data = response.xpath('//div[@class = "col-sm-8 col-md-9"]')

        for books in data:
            loader = ItemLoader(BooksItem(), selector = books)
            loader.add_xpath('items','.//article[@class="product_pod"]/h3/a//text()')
            loader.add_xpath('price','.//p[@class="price_color"]//text()')

            for url in [books.xpath('.//a//@href').get()]:
                yield scrapy.Request(
                    response.urljoin(url),
                    callback = self.parse_book,
                    cb_kwargs = {'loader':loader})

            for next_page in [response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()]:
                if next_page is not None:
                    yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response, loader):
        book_quote = response.xpath('//p[@class="instock availability"]//text()').get()
        loader.add_value('availability', book_quote)
        yield loader.load_item()
I think the problem is in the part where I try to crawl the next pages. I also tried an alternative approach along these lines:
def start_request(self):
    for url in self.start_url:
        yield scrapy.Request(
            url,
            callback = self.parse,
            cb_kwargs = {'page_count':0}
        )

def parse(self, response, next_page):
    if page_count > 3:
        return
    ...
    ...
    page_count += 1
    for next_page in [response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()]:
        yield response.follow(next_page, callback=self.parse, cb_kwargs = {'page_count': page_count})
However, with this approach I get the following error:
TypeError: parse() missing 1 required positional argument: 'page_cntr'
It should be start_requests, and self.start_urls (inside the function).
get() returns the first result; what you want is getall(), which returns a list.
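For example (an illustrative snippet you could try in scrapy shell on the listing page, reusing the title-link selector from your spider):

response.xpath('//article[@class="product_pod"]/h3/a/@href').get()      # first book's href only, a single string
response.xpath('//article[@class="product_pod"]/h3/a/@href').getall()   # every book's href on the page, as a list of strings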
The "next_page" part doesn't need a for loop; that's not an error, just unnecessary.
In the line for url in books.xpath you get each url twice (each book's href appears on both the image link and the title link). Again not an error, but still...
Here, data = response.xpath('//div[@class = "col-sm-8 col-md-9"]') does not select the books one by one; it selects the whole books container. You can check that with len(data.getall()) == 1.
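A quick way to see the difference (again illustrative, e.g. in scrapy shell; the count of 20 just assumes the usual listing page size on books.toscrape.com):

data = response.xpath('//div[@class = "col-sm-8 col-md-9"]')
len(data)     # 1  -> a single selector for the whole container
books = response.xpath('//div[@class = "col-sm-8 col-md-9"]//li')
len(books)    # 20 -> one selector per book on the listing page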
book_quote = response.xpath('//p[@class="instock availability"]//text()').get() will return \n; look at the page source to see why (hint: the 'i' tag).
Compare your code with this and see what I changed:
import scrapy
from scrapy import Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst


class BooksItem(scrapy.Item):
    items = Field(output_processor=TakeFirst())
    price = Field(output_processor=TakeFirst())
    availability = Field(output_processor=TakeFirst())


class BookSpider(scrapy.Spider):
    name = "books"
    start_urls = ['https://books.toscrape.com']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse)

    def parse(self, response):
        data = response.xpath('//div[@class = "col-sm-8 col-md-9"]//li')

        for books in data:
            loader = ItemLoader(BooksItem(), selector=books)
            loader.add_xpath('items', './/article[@class="product_pod"]/h3/a//text()')
            loader.add_xpath('price', './/p[@class="price_color"]//text()')

            for url in books.xpath('.//h3/a//@href').getall():
                yield scrapy.Request(
                    response.urljoin(url),
                    callback=self.parse_book,
                    cb_kwargs={'loader': loader})

        next_page = response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response, loader):
        # option 1:
        book_quote = response.xpath('//p[@class="instock availability"]/i/following-sibling::text()').get().strip()
        # option 2:
        # book_quote = ''.join(response.xpath('//div[contains(@class, "product_main")]//p[@class="instock availability"]//text()').getall()).strip()
        loader.add_value('availability', book_quote)
        yield loader.load_item()
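Regarding the page_count attempt: the keys you put in cb_kwargs are passed to the callback as keyword arguments, so the callback's signature must accept parameters with exactly those names, which is why your second version raises the TypeError. A minimal sketch of how the counter could be threaded through (it assumes the corrected spider above; the page_count name and the limit of 3 are taken from your attempt):

def start_requests(self):
    for url in self.start_urls:
        yield scrapy.Request(
            url,
            callback=self.parse,
            cb_kwargs={'page_count': 1})

def parse(self, response, page_count):
    # the parameter name must match the key used in cb_kwargs
    ...  # same book scraping as in parse() above

    next_page = response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()
    if next_page and page_count < 3:
        yield response.follow(
            next_page,
            callback=self.parse,
            cb_kwargs={'page_count': page_count + 1})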