Trouble outputting data with Scrapy
I'm trying to extract information about articles from this site. I'm new to Scrapy, and although I get all the correct URLs printed, I'm a bit confused about why I'm not getting any item output. I can't figure out what I'm missing or what needs to change. Any help would be greatly appreciated!
Thanks!!
Here is the code I have so far.
This is my spider:
import scrapy
from scrapy.http import Request


class ArticlesSpider(scrapy.Spider):
    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 200

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print(url)
            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        title = response.xpath('//*[@id="post-title entry-title"]/header/h1//text()').extract()
        category = response.xpath('//*[@id="in-category"]/header/p[1]//text()').extract()
        date = response.xpath('//*[@id="single-date"]/header/p[2]/span[2]//text()').extract()
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }
And here is settings.py:
BOT_NAME = 'aom'
SPIDER_MODULES = ['aom.spiders']
NEWSPIDER_MODULE = 'aom.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
HTTPERROR_ALLOW_ALL = True
I checked the HTML: there is no title at
'//*[@id="post-title entry-title"]/header/h1//text()'
but there is one at
'//h1[@class="post-title entry-title"]/text()'
or even simpler
'//h1[@itemprop="headline"]/text()'
You probably have the same problem with the other elements.
EDIT:
There is no category at
'//*[@id="in-category"]/header/p[1]//text()'
but there is one at
'//p[@class="in-category"]//a/text()'
There is no date at
'//*[@id="single-date"]/header/p[2]/span[2]//text()'
but there is one at
'//p[@class="single-date"]//span[2]/text()'
or even simpler
'//span[@itemprop="datePublished"]/text()'
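A quick way to verify such selectors before re-running the whole spider is scrapy shell (a minimal sketch, assuming the listing page still uses the markup shown above):

scrapy shell 'https://artofmanliness.com/articles/'
>>> url = response.xpath('//article[contains(@class, "aom-article-simple")]//a/@href').get()
>>> fetch(url)  # follow the first article link; `response` now points at the article page
>>> response.xpath('//h1[@itemprop="headline"]/text()').get()
>>> response.xpath('//span[@itemprop="datePublished"]/text()').get()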
Below is a minimal working example using CrawlerProcess(). You can paste all of the code into a single file, script.py, and run it as python script.py without creating a project. I used max_pages = 2 to test only a few articles.
import scrapy
from scrapy.http import Request


class ArticlesSpider(scrapy.Spider):
    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 2  # 200

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print('article url:', url)
            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        #title = response.xpath('//h1[@class="post-title entry-title"]/text()').extract()
        title = response.xpath('//h1[@itemprop="headline"]/text()').extract()
        category = response.xpath('//p[@class="in-category"]//a/text()').extract()
        #date = response.xpath('//p[@class="single-date"]//span[2]/text()').extract()
        date = response.xpath('//span[@itemprop="datePublished"]/text()').extract()
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }


from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(ArticlesSpider)
c.start()
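After the run finishes, output.csv should contain one row per scraped article; the file name and format come from the FEEDS setting above. A quick sketch for inspecting it with Python's standard csv module, assuming the column names match the keys yielded by parse_article:

import csv

# read the feed export produced by the spider above
with open('output.csv', newline='', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        # Title/Category/Date were yielded as lists; the CSV exporter joins multivalued fields with commas
        print(row['Title'], row['Date'], row['URL'])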