How to add a new column to Scrapy output from a CSV?
I parse a website and it works fine, but I need to add a new column with an ID to the output. The IDs are stored in a CSV file together with the URLs:
https://www.ceneo.pl/48523541, 1362
https://www.ceneo.pl/46374217, 2457
My spider code:
import scrapy
from ceneo.items import CeneoItem
import csv

class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        start_urls = []
        f = open('urls.csv', 'r')
        for i in f:
            u = i.split(',')
            start_urls.append(u[0])
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        all_prices = response.xpath('(//td[@class="cell-price"]/a/span/span/span[@class="value"]/text())[position() <= 10]').extract()
        all_sellers = response.xpath('(//tr/td/div/ul/li/a[@class="js_product-offer-link"]/text())[position()<=10]').extract()
        f = open('urls.csv', 'r')
        id = []
        for i in f:
            u = i.split(',')
            id.append(u[1])
        x = len(all_prices)
        i = 0
        while (i < x):
            all_sellers[i] = all_sellers[i].replace('Opinie o ', '')
            i += 1
        for urlid, price, seller in zip(id, all_prices, all_sellers):
            yield {'urlid': urlid.strip(), 'price': price.strip(), 'seller': seller.strip()}
In the results I get wrong data, because (the zip function?) the IDs are used alternately:
urlid,price,seller
1362,109,eMAG
1457,116,electro.pl
1362,597,apollo.pl
1457,597,allegro.pl
It should output:
urlid,price,seller
1362,109,eMAG
1362,116,electro.pl
1457,597,apollo.pl
1457,597,allegro.pl
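
To see why this happens: parse() runs once per downloaded page, but it re-reads the whole urls.csv every time, so zip() pairs the complete ID list element by element against the prices scraped from a single page. A minimal illustration in plain Python, with made-up values:

# all IDs from urls.csv, re-read on every parse() call
ids = ['1362', '2457']
# prices scraped from a SINGLE page - they all belong to one product
all_prices = ['109', '116']

# zip() pairs the lists element by element, so the second price
# gets the second ID even though both prices came from one page
print(list(zip(ids, all_prices)))
# [('1362', '109'), ('2457', '116')]  <- IDs alternate incorrectly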
You can get the ID in start_requests and assign it to the request using meta={'id': id_}; later, in parse, you can read it back with response.meta['id']. This way you will have the correct ID in parse.

I used a string data instead of a file to create a working example.
#!/usr/bin/env python3

import scrapy

data = '''https://www.ceneo.pl/48523541, 1362
https://www.ceneo.pl/46374217, 2457'''

class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        #f = open('urls.csv', 'r')
        f = data.split('\n')
        for row in f:
            url, id_ = row.split(',')
            url = url.strip()
            id_ = id_.strip()
            #print(url, id_)
            # use meta to assign value
            yield scrapy.Request(url=url, callback=self.parse, meta={'id': id_})

    def parse(self, response):
        # use meta to receive value
        id_ = response.meta["id"]
        all_prices = response.xpath('(//td[@class="cell-price"]/a/span/span/span[@class="value"]/text())[position() <= 10]').extract()
        all_sellers = response.xpath('(//tr/td/div/ul/li/a[@class="js_product-offer-link"]/text())[position()<=10]').extract()
        all_sellers = [item.replace('Opinie o ', '') for item in all_sellers]
        for price, seller in zip(all_prices, all_sellers):
            yield {'urlid': id_, 'price': price.strip(), 'seller': seller.strip()}

# --- it runs without project and saves in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'output.csv',
})
c.crawl(QuotesSpider)
c.start()
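
Note that FEED_FORMAT and FEED_URI were deprecated in Scrapy 2.1 in favor of the FEEDS setting; if you run a newer Scrapy, an equivalent configuration (a sketch, same output file assumed) would be:

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # FEEDS replaces FEED_FORMAT/FEED_URI in Scrapy 2.1+
    'FEEDS': {'output.csv': {'format': 'csv'}},
})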
BTW: there is a standard function id(), so I use the variable name id_ instead of id.
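
Another side note: since Scrapy 1.7 the documented way to pass values to a callback is cb_kwargs rather than meta (which Scrapy also uses for its own internals). A minimal sketch of the same idea, assuming the same urls.csv layout as above:

import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        with open('urls.csv') as f:
            for row in f:
                url, id_ = (part.strip() for part in row.split(','))
                # cb_kwargs delivers id_ to parse() as a keyword argument
                yield scrapy.Request(url=url, callback=self.parse,
                                     cb_kwargs={'id_': id_})

    def parse(self, response, id_):
        # id_ arrives as a regular argument, no response.meta lookup needed
        yield {'urlid': id_}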