Python,Scrapy Pipeline csv out问题,for循环出错
Python, Scrapy Pipeline csv out problem, error in for loop
我正在 Google 使用 scrapy 进行搜索抓取。这是代码,它可以很好地获取搜索结果。
GoogleBot.py:
class GoogleBotsSpider(scrapy.Spider):
name = 'GoogleScrapyBot'
allowed_domains = ['google.com']
start_urls = ['https://www.google.com/search?q=apple&hl=en&rlz=&start=0']
def parse(self, response):
item = {}
all_page = response.xpath('//*[@id="main"]')
for page in all_page:
title = page.xpath('//*[@id="main"]/div/div/div/a/h3/div/text()').extract()
link = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
for title, link in zip(title, link):
print(title)
print(link.lstrip("/url?q="))
我的下一步是在 Scrapy 上使用“管道”为结果保存一个 csv 文件。
这是我到目前为止编写的代码。
setting.py:
ITEM_PIPELINES = {'GoogleScrapy.pipelines.GooglePipeline': 300,}
pipelines.py:
from scrapy.exporters import CsvItemExporter
class GooglePipeline(object):
def __init__(self):
self.file = open("GoogleSearchResult.csv", 'wb')
self.exporter = CsvItemExporter(self.file, encoding='utf-8')
self.exporter.start_exporting()
def close_spider(self, spider):
self.exporter.finish_exporting()
self.file.close()
def process_item(self, item, spider):
self.exporter.export_item(item)
return item
这是修改了我的蜘蛛代码。
GoogleBot.py:
def parse(self, response):
item = {}
all_page = response.xpath('//*[@id="main"]')
for page in all_page:
item['title'] = page.xpath('//*[@id="main"]/div/div/div/a/h3/div/text()').extract()
item['link'] = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
for title, link in zip(title, link):
print(title)
print(link.lstrip("/url?q="))
yield item
错误在:
for title, link in zip(title, link):
print(title)
print(link.lstrip("/url?q="))
我收到这个错误:
for title, link in zip(title, link):
UnboundLocalError: local variable 'title' referenced before assignment
根据您的评论,这是工作输出。
import scrapy
class GoogleBotsSpider(scrapy.Spider):
name = 'GoogleScrapyBot'
allowed_domains = ['google.com']
start_urls = ['https://www.google.com/search?q=apple&hl=en&rlz=&start=0']
def parse(self, response):
all_page = response.xpath('//*[@id="main"]')
for page in all_page:
titles = page.xpath('//*[@id="main"]/div/div/div/a/h3/div//text()').extract()
for title in titles:
links = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
for link in links:
item={
'Title': title,
'Link':link
}
yield item
我正在 Google 使用 scrapy 进行搜索抓取。这是代码,它可以很好地获取搜索结果。
GoogleBot.py:
class GoogleBotsSpider(scrapy.Spider):
name = 'GoogleScrapyBot'
allowed_domains = ['google.com']
start_urls = ['https://www.google.com/search?q=apple&hl=en&rlz=&start=0']
def parse(self, response):
item = {}
all_page = response.xpath('//*[@id="main"]')
for page in all_page:
title = page.xpath('//*[@id="main"]/div/div/div/a/h3/div/text()').extract()
link = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
for title, link in zip(title, link):
print(title)
print(link.lstrip("/url?q="))
我的下一步是在 Scrapy 上使用“管道”为结果保存一个 csv 文件。 这是我到目前为止编写的代码。
setting.py:
ITEM_PIPELINES = {'GoogleScrapy.pipelines.GooglePipeline': 300,}
pipelines.py:
from scrapy.exporters import CsvItemExporter
class GooglePipeline(object):
def __init__(self):
self.file = open("GoogleSearchResult.csv", 'wb')
self.exporter = CsvItemExporter(self.file, encoding='utf-8')
self.exporter.start_exporting()
def close_spider(self, spider):
self.exporter.finish_exporting()
self.file.close()
def process_item(self, item, spider):
self.exporter.export_item(item)
return item
这是修改了我的蜘蛛代码。
GoogleBot.py:
def parse(self, response):
item = {}
all_page = response.xpath('//*[@id="main"]')
for page in all_page:
item['title'] = page.xpath('//*[@id="main"]/div/div/div/a/h3/div/text()').extract()
item['link'] = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
for title, link in zip(title, link):
print(title)
print(link.lstrip("/url?q="))
yield item
错误在:
for title, link in zip(title, link):
print(title)
print(link.lstrip("/url?q="))
我收到这个错误:
for title, link in zip(title, link): UnboundLocalError: local variable 'title' referenced before assignment
根据您的评论,这是工作输出。
import scrapy
class GoogleBotsSpider(scrapy.Spider):
name = 'GoogleScrapyBot'
allowed_domains = ['google.com']
start_urls = ['https://www.google.com/search?q=apple&hl=en&rlz=&start=0']
def parse(self, response):
all_page = response.xpath('//*[@id="main"]')
for page in all_page:
titles = page.xpath('//*[@id="main"]/div/div/div/a/h3/div//text()').extract()
for title in titles:
links = page.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
for link in links:
item={
'Title': title,
'Link':link
}
yield item