There is a problem in my pipelines file: it is not fetching the book name. Instead it saves a single random image named None.jpg for each crawl.
The items.py file. As far as I know, the image_urls and images fields are not causing any problem.
import scrapy
from scrapy.loader.processors import TakeFirst

class BooksToScrapeItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
    book_name = scrapy.Field(
        output_processor=TakeFirst()
    )
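(Side note, depending on your Scrapy version: in newer releases the loader processors live in the standalone itemloaders package and scrapy.loader.processors is deprecated, so the import may instead need to be:)

    from itemloaders.processors import TakeFirst  # newer Scrapy installs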
The pipelines.py file. I think there must be a problem in the get_media_requests method, because it is not fetching book_name from the items file.
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request

class BooksToScrapeImagePipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        return [Request(x, meta={'bookname': item.get('book_name')})
                for x in item.get(self.images_urls_field, [])]  # I think the problem is in this line

    def file_path(self, request, response=None, info=None):
        return 'full/%s.jpg' % request.meta['bookname']
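A quick way to confirm this suspicion (a hedged debugging sketch, not part of the original code) is to log what the pipeline actually receives for each item:

    def get_media_requests(self, item, info):
        print('book_name in pipeline:', item.get('book_name'))  # prints None when the xpath matched nothing
        return [Request(x, meta={'bookname': item.get('book_name')})
                for x in item.get(self.images_urls_field, [])]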
The spider file I use for scraping. It worked when I didn't have a custom pipelines file.
import scrapy
from scrapy.loader import ItemLoader
from books_to_scrape.items import BooksToScrapeItem

class ImgscrapeSpider(scrapy.Spider):
    name = 'imgscrape'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com']

    def parse(self, response):
        for article in response.xpath("//article[@class='product_pod']"):
            loader = ItemLoader(item=BooksToScrapeItem(), selector=article)
            relative_url = article.xpath(".//div/a/img[@class='thumbnail']/@src").extract_first()
            abs_url = response.urljoin(relative_url)
            loader.add_value('image_urls', abs_url)
            loader.add_xpath('book_name', ".//article[@class='product_pod']/h3/a/text()")
            yield loader.load_item()
Your problem is in the relative xpath

    loader.add_xpath('book_name', ".//article[@class='product_pod']/h3/a/text()")

The loader uses xpath("//article[@class='product_pod']") as its selector

    for article in response.xpath("//article[@class='product_pod']"):
        loader = ItemLoader(item=BooksToScrapeItem(), selector=article)

so all relative xpaths are already relative to "//article[@class='product_pod']", and they must not repeat "//article[@class='product_pod']".

With the relative xpath ".//article[@class='product_pod']/h3/a/text()" it finds no title, so book_name is empty for all items, all items use None as the title - and the pipeline uses the same name None.jpg for all images, each download overwriting the previous one.

It has to be

    loader.add_xpath('book_name', ".//h3/a/text()")  # title with `...`

BTW: text() doesn't give the full title but one shortened with .... To get the full title you have to read the attribute title=

    loader.add_xpath('book_name', ".//h3/a/@title")  # full title
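To see the difference directly, here is a minimal check you can run in scrapy shell http://books.toscrape.com (the titles shown are examples from the first book on the page):

    article = response.xpath("//article[@class='product_pod']")[0]

    article.xpath(".//article[@class='product_pod']/h3/a/text()").extract_first()
    # -> None (there is no nested <article> inside the selected one)

    article.xpath(".//h3/a/text()").extract_first()
    # -> e.g. 'A Light in the ...' (title truncated by the page)

    article.xpath(".//h3/a/@title").extract_first()
    # -> e.g. 'A Light in the Attic' (full title)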
I created a version with all the code in one file that runs without creating a project.
Anyone can copy it into a single file and run it to test it.
import scrapy
from scrapy.loader.processors import TakeFirst

class BooksToScrapeItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
    book_name = scrapy.Field(
        output_processor=TakeFirst()
    )

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline

class BooksToScrapeImagePipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        return [Request(x, meta={'bookname': item.get('book_name')})
                for x in item.get(self.images_urls_field, [])]

    def file_path(self, request, response=None, info=None):
        return 'full/%s.jpg' % request.meta['bookname']

from scrapy.loader import ItemLoader

class ImgscrapeSpider(scrapy.Spider):
    name = 'imgscrape'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com']

    def parse(self, response):
        for article in response.xpath("//article[@class='product_pod']"):
            loader = ItemLoader(item=BooksToScrapeItem(), selector=article)
            relative_url = article.xpath(".//div/a/img[@class='thumbnail']/@src").extract_first()
            abs_url = response.urljoin(relative_url)
            loader.add_value('image_urls', abs_url)
            #loader.add_xpath('book_name', ".//article[@class='product_pod']/h3/a/text()")  # wrong relative xpath
            #loader.add_xpath('book_name', ".//h3/a/text()")  # only partial title
            loader.add_xpath('book_name', ".//h3/a/@title")  # full title
            yield loader.load_item()

# -----------------------------------------------------------------------------

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save results in a CSV, JSON or XML file
    'FEED_FORMAT': 'csv',      # csv, json, xml
    'FEED_URI': 'output.csv',
    # download images to `IMAGES_STORE/full` (standard folder) and convert them to JPG (even if they already are JPG);
    # this needs items with `image_urls` yielded in `parse()`, and both ITEM_PIPELINES and IMAGES_STORE set
    'ITEM_PIPELINES': {'__main__.BooksToScrapeImagePipeline': 1},  # use the pipeline defined in the current file (hence '__main__')
    'IMAGES_STORE': '.',  # this folder has to exist before downloading
})
c.crawl(ImgscrapeSpider)
c.start()
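One possible refinement, as a hedged sketch rather than part of the fix above: book titles can contain characters that are awkward in file names (quotes, colons, slashes), and on Scrapy 2.4+ file_path also receives the item directly, so the meta trick is not needed there. SafeNameImagePipeline is just an illustrative name:

    import re
    from scrapy.pipelines.images import ImagesPipeline

    class SafeNameImagePipeline(ImagesPipeline):

        # Scrapy >= 2.4 passes the item into file_path as a keyword argument
        def file_path(self, request, response=None, info=None, *, item=None):
            # replace anything that is not a letter, digit, space, dash or underscore
            safe_name = re.sub(r'[^\w\- ]', '_', item.get('book_name') or 'unknown')
            return 'full/%s.jpg' % safe_name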