Scrapy 正在爬取,但是没有输出
Scrapy is crawling, but no output
我正在抓取一些页面,过程中没有报错,但爬虫没有生成任何输出。
函数 parse_article 单独使用时工作正常(我单独测试过),但与 parse 函数一起使用时,就不再产生任何输出。有任何想法吗?
我在命令行中运行爬虫:scrapy crawl all_articles_from_one_page -o test_file.csv
import scrapy
from scrapping_538.items import Scrapping538Item
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose
import datetime
import socket
class BasicSpider(scrapy.Spider):
    """Spider that requests each article linked from the features listing page.

    ``parse`` handles the listing page in ``start_urls`` and schedules one
    request per article block; ``parse_article`` turns each article response
    into a ``Scrapping538Item``.
    """

    name = 'all_articles_from_one_page'
    # NOTE(review): 'web' does not match the fivethirtyeight.com host used in
    # start_urls; with OffsiteMiddleware enabled, requests to domains not
    # listed here are filtered, so the article requests yielded by parse()
    # would be dropped -- confirm this value.
    allowed_domains = ['web']
    start_urls = ('http://fivethirtyeight.com/features/',)

    def parse(self, response):
        """Yield a request for the first ``h2/a`` link of every article block.

        Raises IndexError when an article block contains no such link.
        """
        posts_xpath = '//*[@id="primary"]//div[contains(@id, "post")]'
        for post in response.xpath(posts_xpath):
            print('\n**********************************************')
            hrefs = post.xpath('.//h2/a/@href').extract()
            target = hrefs[0]
            print('------article link: ' + str(target))
            yield scrapy.Request(target, callback=self.parse_article)

    def parse_article(self, response):
        """Populate and return a ``Scrapping538Item`` from an article response."""
        loader = ItemLoader(item=Scrapping538Item(), response=response)

        # Fields taken from the page itself: (item field, CSS selector).
        page_fields = (
            ('title', 'h1.article-title::text'),
            ('date', 'time.datetime::text'),
            ('author', '.author::text'),
            ('filed_under', '.term::text'),
            ('article_text', '.entry-content *::text'),
        )
        for field_name, selector in page_fields:
            loader.add_css(field_name, selector)

        # Bookkeeping fields describing where and when the item was scraped.
        loader.add_value('url', response.url)
        loader.add_value('project', self.settings.get('BOT_NAME'))
        loader.add_value('spider', self.name)
        loader.add_value('server', socket.gethostname())
        loader.add_value('date_import', datetime.datetime.now())

        item = loader.load_item()
        return item
将您的 allowed_domains 更改为:
allowed_domains = ['fivethirtyeight.com']
Scrapy 会过滤掉所有指向未在该属性中列出的域的请求,因此请把 fivethirtyeight.com 加入其中。
(https://docs.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.allowed_domains)
allowed_domains
An optional list of strings containing domains that this spider is allowed to crawl. Requests for URLs not belonging to the domain names specified in this list (or their subdomains) won’t be followed if OffsiteMiddleware is enabled.
Let’s say your target url is https://www.example.com/1.html, then add 'example.com' to the list.
我正在抓取一些页面,过程中没有报错,但爬虫没有生成任何输出。
函数 parse_article 单独使用时工作正常(我单独测试过),但与 parse 函数一起使用时,就不再产生任何输出。有任何想法吗?
我在命令行中运行爬虫:scrapy crawl all_articles_from_one_page -o test_file.csv
import scrapy
from scrapping_538.items import Scrapping538Item
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose
import datetime
import socket
class BasicSpider(scrapy.Spider):
    """Spider that requests each article linked from the features listing page.

    ``parse`` handles the listing page in ``start_urls`` and schedules one
    request per article block; ``parse_article`` turns each article response
    into a ``Scrapping538Item``.
    """

    name = 'all_articles_from_one_page'
    # NOTE(review): 'web' does not match the fivethirtyeight.com host used in
    # start_urls; with OffsiteMiddleware enabled, requests to domains not
    # listed here are filtered, so the article requests yielded by parse()
    # would be dropped -- confirm this value.
    allowed_domains = ['web']
    start_urls = ('http://fivethirtyeight.com/features/',)

    def parse(self, response):
        """Yield a request for the first ``h2/a`` link of every article block.

        Raises IndexError when an article block contains no such link.
        """
        posts_xpath = '//*[@id="primary"]//div[contains(@id, "post")]'
        for post in response.xpath(posts_xpath):
            print('\n**********************************************')
            hrefs = post.xpath('.//h2/a/@href').extract()
            target = hrefs[0]
            print('------article link: ' + str(target))
            yield scrapy.Request(target, callback=self.parse_article)

    def parse_article(self, response):
        """Populate and return a ``Scrapping538Item`` from an article response."""
        loader = ItemLoader(item=Scrapping538Item(), response=response)

        # Fields taken from the page itself: (item field, CSS selector).
        page_fields = (
            ('title', 'h1.article-title::text'),
            ('date', 'time.datetime::text'),
            ('author', '.author::text'),
            ('filed_under', '.term::text'),
            ('article_text', '.entry-content *::text'),
        )
        for field_name, selector in page_fields:
            loader.add_css(field_name, selector)

        # Bookkeeping fields describing where and when the item was scraped.
        loader.add_value('url', response.url)
        loader.add_value('project', self.settings.get('BOT_NAME'))
        loader.add_value('spider', self.name)
        loader.add_value('server', socket.gethostname())
        loader.add_value('date_import', datetime.datetime.now())

        item = loader.load_item()
        return item
将您的 allowed_domains 更改为:
allowed_domains = ['fivethirtyeight.com']
Scrapy 会过滤掉所有指向未在该属性中列出的域的请求,因此请把 fivethirtyeight.com 加入其中。
(https://docs.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.allowed_domains)
allowed_domains
An optional list of strings containing domains that this spider is allowed to crawl. Requests for URLs not belonging to the domain names specified in this list (or their subdomains) won’t be followed if OffsiteMiddleware is enabled.
Let’s say your target url is https://www.example.com/1.html, then add 'example.com' to the list.