Python Scrapy - yield not working but print() does
I am trying to crawl websites and count the number of occurrences of keywords on each page, modifying code from this article.
Using print() at least outputs results when I run the crawler like this:
scrapy crawl webcrawler > output.csv
However, output.csv is not formatted correctly. I should be using yield (or return), but in that case the exported CSV/JSON comes out blank.
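(Side note: redirecting stdout only captures log and print() output, which is why that output.csv comes out malformed; Scrapy's feed exports write the items a callback yields, e.g. scrapy crawl webcrawler -o output.csv. A minimal sketch, assuming Scrapy 2.1 or later, of enabling the same feed from inside the spider:)

    # inside the spider class, alongside DEPTH_LIMIT
    custom_settings = {
        'DEPTH_LIMIT': 1,
        # write every yielded item to output.csv in CSV format (Scrapy >= 2.1)
        'FEEDS': {'output.csv': {'format': 'csv'}},
    }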
Here is my spider code:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.item import Item, Field
import re

import pandas as pd  # needed for pd.read_csv below

from ..items import keyword

df = pd.read_csv(r'python\scrapy-1\web_scraping_websites.csv', index_col=0)


class WebsiteSpider(CrawlSpider):
    name = "webcrawler"
    custom_settings = {
        'DEPTH_LIMIT': 1,
    }
    allowed_domains = df.domain.head(5).to_list()
    start_urls = df.url.head(5).to_list()
    rules = [Rule(LinkExtractor(), follow=False, callback="parse")]

    def parse(self, response):
        terms = ['canidae', 'felidae', 'cat', 'cattle', 'dog', 'donkey', 'goat',
                 'guinea pig', 'horse', 'pig', 'rabbit']
        response_body = response.body.decode('utf-8')
        url = response.url
        domain = url.split('/')[2]
        results = []
        for x in terms:
            t = re.findall(x, response_body, re.IGNORECASE)
            result2 = keyword()
            result2["url"] = url
            result2["domain"] = domain
            result2["term"] = x
            result2["matches"] = len(t)
            results.append(result2)
        print(results)   # print sort of works
        # yield results  # Does not work...

    def _requests_to_follow(self, response):
        # only follow links from responses that have an encoding (i.e. text responses)
        if getattr(response, "encoding", None) is not None:
            return CrawlSpider._requests_to_follow(self, response)
        else:
            return []
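One likely reason the commented-out yield produces nothing is that a Scrapy callback is expected to yield individual item (or Request) objects, not a whole Python list. A minimal sketch of that change (not from the original post), keeping the same keyword item:

        # instead of collecting into `results` and yielding the list,
        # yield one item per term so feed exports can pick them up
        for x in terms:
            t = re.findall(x, response_body, re.IGNORECASE)
            result2 = keyword()
            result2["url"] = url
            result2["domain"] = domain
            result2["term"] = x
            result2["matches"] = len(t)
            yield result2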
Solved this by rewriting the parse method more carefully. The blog post provided the basic idea: loop over the response body once per keyword you need. But instead of a for loop, a list comprehension that builds the list of match counts plays nicely with yield:
def parse(self, response):
    # yield the response url, domain, status
    # and the keyword match counts
    response_body = response.body.decode('utf-8')
    url = response.url
    domain = url.split('/')[2]
    item = SfscrapeItem()
    item['url'] = url
    item['domain'] = domain
    item['status'] = response.status
    item['matches'] = [str(len(re.findall(keyword, response_body, re.IGNORECASE)))
                       for keyword in keywords]
    yield item
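The snippet above leans on an item class and a keyword list defined elsewhere in the project. A minimal sketch of what those definitions might look like (the names SfscrapeItem and keywords come from the snippet; the exact fields and terms are assumptions):

    # items.py -- hypothetical definition matching the fields used above
    import scrapy

    class SfscrapeItem(scrapy.Item):
        url = scrapy.Field()
        domain = scrapy.Field()
        status = scrapy.Field()
        matches = scrapy.Field()

    # assumed to mirror the terms list from the question
    keywords = ['canidae', 'felidae', 'cat', 'cattle', 'dog', 'donkey', 'goat',
                'guinea pig', 'horse', 'pig', 'rabbit']

With items yielded this way, running scrapy crawl webcrawler -o output.csv should produce a populated CSV rather than a blank one.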