使用 Scrapy 输出数据时遇到问题

Trouble outputting data with Scrapy

我正在尝试从这个网站中提取文章的相关信息。我是 Scrapy 新手,虽然所有文章的 URL 都能正确打印出来,但我不明白为什么最终没有得到任何数据输出。我找不出自己遗漏了什么或需要修改什么。如能提供任何帮助,我将不胜感激!

谢谢!!

到目前为止我有以下代码:

这是我的爬虫(spider):

import scrapy
from scrapy.http import Request

class ArticlesSpider(scrapy.Spider):
    """Spider that walks the paginated article index and scrapes each article.

    ``start_requests`` queues one request per index page, ``parse`` requests
    the article links found on each index page, and ``parse_article`` emits
    one item per article page.
    """

    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 200  # number of index pages requested by start_requests

    def start_requests(self):
        """Yield one request per page number in ``range(self.max_pages)``."""
        page_url = 'http://artofmanliness.com/articles/page/%d/'
        for page_no in range(self.max_pages):
            yield scrapy.Request(page_url % page_no, callback=self.parse)

    def parse(self, response):
        """For each listed article, print its links and request the first one."""
        # Original author's note: AOM lists all articles across about 189 pages.
        listing = response.xpath('//article[contains(@class, "aom-article-simple")]')
        for entry in listing:
            hrefs = entry.xpath('.//a/@href').extract()
            print(hrefs)

            # Entries without any link are skipped.
            if not hrefs:
                continue
            yield Request(url=hrefs[0], callback=self.parse_article)

    def parse_article(self, response):
        """Yield a dict holding the title, category, date and URL of an article."""
        # NOTE(review): all three expressions select on @id -- confirm that the
        # article markup really exposes these values through id attributes.
        title_xpath = '//*[@id="post-title entry-title"]/header/h1//text()'
        category_xpath = '//*[@id="in-category"]/header/p[1]//text()'
        date_xpath = '//*[@id="single-date"]/header/p[2]/span[2]//text()'

        item = {
            'Title': response.xpath(title_xpath).extract(),
            'Category': response.xpath(category_xpath).extract(),
            'Date': response.xpath(date_xpath).extract(),
            'URL': response.url,
        }
        yield item

这是 settings.py:

# --- Project identity and spider module locations ---
BOT_NAME = "aom"
SPIDER_MODULES = ["aom.spiders"]
NEWSPIDER_MODULE = "aom.spiders"

# --- Request behaviour ---
# User-agent string for requests; the value is a desktop Chrome identifier.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 6.1; WOW64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/34.0.1847.131 Safari/537.36"
)

# robots.txt obedience is switched off.
ROBOTSTXT_OBEY = False
# NOTE(review): presumably lets non-2xx responses reach the spider callbacks
# -- confirm against the Scrapy settings reference.
HTTPERROR_ALLOW_ALL = True

我检查了页面的 HTML,其中没有与下面这个 title 的 XPath 相匹配的元素:

'//*[@id="post-title entry-title"]/header/h1//text()'

但是

'//h1[@class="post-title entry-title"]/text()'

甚至更简单

'//h1[@itemprop="headline"]/text()'

可能你对其他元素也有同样的问题


编辑:

同样,页面中没有与下面这个 category 的 XPath 相匹配的元素:

'//*[@id="in-category"]/header/p[1]//text()'

但是

'//p[@class="in-category"]//a/text()'

页面中也没有与下面这个 date 的 XPath 相匹配的元素:

'//*[@id="single-date"]/header/p[2]/span[2]//text()'

但是

'//p[@class="single-date"]//span[2]/text()'

甚至更简单

'//span[@itemprop="datePublished"]/text()'

下面是使用 CrawlerProcess() 的最小可运行代码。

任何人都可以把全部代码粘贴到一个文件 script.py 中,然后用 python script.py 直接运行,而无需创建项目。

我把 max_pages 设为 2,只用少量文章做了测试。

import scrapy
from scrapy.http import Request

class ArticlesSpider(scrapy.Spider):
    """Spider that collects title, category and date for listed articles.

    Index pages are requested first; every article linked from an index page
    is then requested and turned into one dict item.
    """

    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 2  # the original author's comment notes 200 as the other value

    def start_requests(self):
        """Yield one index-page request per number in ``range(self.max_pages)``."""
        index_template = 'http://artofmanliness.com/articles/page/%d/'
        for number in range(self.max_pages):
            yield scrapy.Request(index_template % number, callback=self.parse)

    def parse(self, response):
        """Print each listed article's links and request the first link."""
        # Original author's note: AOM lists all articles across about 189 pages.
        articles = response.xpath('//article[contains(@class, "aom-article-simple")]')
        for node in articles:
            links = node.xpath('.//a/@href').getall()
            print('article url:', links)

            # Nothing to follow when the article element carries no link.
            if not links:
                continue
            yield Request(url=links[0], callback=self.parse_article)

    def parse_article(self, response):
        """Yield one dict with the article's title, category, date and URL."""
        # Alternative title selector kept from the original author:
        #   //h1[@class="post-title entry-title"]/text()
        headline = response.xpath('//h1[@itemprop="headline"]/text()')

        categories = response.xpath('//p[@class="in-category"]//a/text()')

        # Alternative date selector kept from the original author:
        #   //p[@class="single-date"]//span[2]/text()
        published = response.xpath('//span[@itemprop="datePublished"]/text()')

        # Each field holds the list of all matching text nodes.
        yield {
            'Title': headline.getall(),
            'Category': categories.getall(),
            'Date': published.getall(),
            'URL': response.url,
        }

        
from scrapy.crawler import CrawlerProcess

# Settings handed to the crawler process when running as a standalone script.
crawler_settings = {
    'USER_AGENT': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
    # Feed export target; the format can be CSV, JSON or XML.
    # The original author notes the FEEDS setting as new in Scrapy 2.1.
    'FEEDS': {'output.csv': {'format': 'csv'}},
}

c = CrawlerProcess(crawler_settings)
c.crawl(ArticlesSpider)  # register the spider class defined in this file
c.start()