How to output data crawled from multiple webpages into a CSV file using Python with Scrapy
I have the code below, which crawls all the available pages of a site. The crawling itself works on the valid pages, because when I use print statements I can see the data from the `items` list, but when I try to dump the scraped data to a `.csv` file (using this command at the command prompt: `scrapy crawl craig -o test.csv -t csv`) the file comes out empty.

Please help me get the data output into a `csv` file.
```python
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from test.items import CraigslistSampleItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

URL = "http://example.com/subpage/%d"


class MySpider(BaseSpider):
    name = "craig"
    allowed_domains = ["xyz.com"]
    # for u in URL:
    start_urls = [URL % 1]

    def __init__(self):
        self.page_number = 1

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//div[@class='thumb']")
        if not titles:
            raise CloseSpider('No more pages')
        items = []
        for titles in titles:
            item = CraigslistSampleItem()
            item["title"] = titles.select("a/@title").extract()
            item["url"] = titles.select("a/@href").extract()
            items.append(item)
        yield items
        self.page_number += 1
        yield Request(URL % self.page_number)
```
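The CSV comes out empty most likely because `parse()` yields the whole `items` list in one go. Scrapy expects a spider callback to yield items (or requests) one at a time, so a yielded list never reaches the feed exporter and nothing is written to `test.csv`. Yielding each item individually fixes that. The reworked spider below does this, and it also replaces the manual page counter with `start_requests()`, which simply generates one request per page (limited to the first 10 pages here as an example):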
```python
from scrapy.spider import BaseSpider
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from test.items import CraigslistSampleItem

URL = "http://example.com/subpage/%d"


class MySpider(BaseSpider):
    name = "craig"
    allowed_domains = ["xyz.com"]

    def start_requests(self):
        # Generate one request per page instead of tracking a page counter.
        for i in range(10):
            yield Request(URL % i, callback=self.parse)

    def parse(self, response):
        titles = response.xpath("//div[@class='thumb']")
        if not titles:
            raise CloseSpider('No more pages')
        for title in titles:
            item = CraigslistSampleItem()
            item["title"] = title.xpath("./a/@title").extract()
            item["url"] = title.xpath("./a/@href").extract()
            # Yield each item on its own so the feed exporter can write it.
            yield item
```
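For completeness, here is a minimal sketch of what `test/items.py` is assumed to look like for the field names used above (adjust it if your actual item class defines the fields differently):

```python
# test/items.py -- assumed item definition matching the fields used above
from scrapy.item import Item, Field


class CraigslistSampleItem(Item):
    title = Field()
    url = Field()
```

Run the spider with the same feed-export command as before, `scrapy crawl craig -o test.csv -t csv`, and the individually yielded items should now show up as rows in the CSV file.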