如何将抓取的数据存储在多个 Postgresql 表中?
How to store scraped data in multiple Postregsql tables?
我有一个蜘蛛抓取一个网站,但我想将结果存储在我的 Postgresql 数据库中的两个不同 table 中。
“比赛”中有 1 人
“参与者”中有 2 人
如果我只填写一个 table,它工作正常,但我如何让 scrapy 管道一次性填写两个 table?
我试图在我的 pipelines.py 中制作两个 类,但没有成功。我想我只是想念某事。这里
嗯,这是我的代码
import logging
import psycopg2
from scrapy.loader import ItemLoader
class RacesPipeline(object):
def open_spider(self, spider):
hostname = 'localhost'
username = 'postgres'
password = '****!'
database = 'horseracing'
port = "***"
self.connection = psycopg2.connect(host=hostname, user=username, password=password,
dbname=database, port=port)
self.cur = self.connection.cursor()
def close_spider(self, spider):
self.cur.close()
self.connection.close()
def process_item(self, item, spider):
self.cur.execute("insert into races(track, date, racename, racetype, distancefinal, minalter, maxalter, raceclass, classrating, going, finalhurdle, anzahlstarter, winningtimecombined, pricemoney1, pricemoney2, pricemoney3, pricemoney4, pricemoney5, pricemoney6, pricemoney7, pricemoney8) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
(
item['track'][0],
item['date'],
item['racename'][0],
item['racetype'],
item['distancefinal'],
item['minalter'],
item['maxalter'],
item['raceclass'],
item['classrating'],
item['going'][0],
item['finalhurlde'],
item['anzahlstarter'],
item['winningtimecombined'],
item['pricemoney1'],
item['pricemoney2'],
item['pricemoney3'],
item['pricemoney4'],
item['pricemoney5'],
item['pricemoney6'],
item['pricemoney7'],
item['pricemoney8']
))
self.connection.commit()
return item
class HorsesPipeline(object):
def open_spider(self, spider):
hostname = 'localhost'
username = 'postgres'
password = '********'
database = 'horseracing'
port = "****"
self.connection = psycopg2.connect(host=hostname, user=username, password=password, dbname=database, port=port)
self.cur = self.connection.cursor()
def close_spider(self, spider):
self.cur.close()
self.connection.close()
def process_item(self, item, spider):
self.cur.execute("insert into participants(pos, draw, dwinner, dnext, startnumber, pferde, horsecountry, odd, jockey, trainer, weightkg, alter, headgear, officalrating, rp, ts, rprc) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
(
item['pos'][0],
item['draw'],
item['dwinner'],
item['dnext'],
item['startnumber'],
item['pferde'],
item['horsecountry'],
item['odd'],
item['jockey'],
item['trainer'],
item['weightkg'],
item['alter'],
item['headgear'],
item['officalrating'],
item['rp'],
item['ts'],
item['rprc']
))
self.connection.commit()
return item
以及管道设置:
ITEM_PIPELINES = {
'results.pipelines.RacesPipeline': 100,
'results.pipelines.HorsesPipeline':200,
}
如果我运行代码,我得到错误
第 33 行,在 process_item
项目['track'][0],
键错误:'track'
但是当我不尝试将两个 table 插入串在一起,而是单独测试它们时,它们 运行 就好了。并且,它插入第一个 table 就好了,即使上面的错误提示其他
我知道我只是想念某事。把它们加在一起,但我想不通
我看到你有两种不同的物品类型。您需要检查 process_item
中的项目实例并使用不同的 insert
:
from your_spider.items import RaceItem, ParticipantItem # use actual names here
if isinstance(item, RaceItem):
# insert into race
单个 process_item
将适用于两个不同的表,无需创建第二个 class
。
我有一个蜘蛛抓取一个网站,但我想将结果存储在我的 Postgresql 数据库中的两个不同 table 中。
“比赛”中有 1 人 “参与者”中有 2 人
如果我只填写一个 table,它工作正常,但我如何让 scrapy 管道一次性填写两个 table?
我试图在我的 pipelines.py 中制作两个 类,但没有成功。我想我只是想念某事。这里
嗯,这是我的代码
import logging
import psycopg2
from scrapy.loader import ItemLoader
class RacesPipeline(object):
def open_spider(self, spider):
hostname = 'localhost'
username = 'postgres'
password = '****!'
database = 'horseracing'
port = "***"
self.connection = psycopg2.connect(host=hostname, user=username, password=password,
dbname=database, port=port)
self.cur = self.connection.cursor()
def close_spider(self, spider):
self.cur.close()
self.connection.close()
def process_item(self, item, spider):
self.cur.execute("insert into races(track, date, racename, racetype, distancefinal, minalter, maxalter, raceclass, classrating, going, finalhurdle, anzahlstarter, winningtimecombined, pricemoney1, pricemoney2, pricemoney3, pricemoney4, pricemoney5, pricemoney6, pricemoney7, pricemoney8) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
(
item['track'][0],
item['date'],
item['racename'][0],
item['racetype'],
item['distancefinal'],
item['minalter'],
item['maxalter'],
item['raceclass'],
item['classrating'],
item['going'][0],
item['finalhurlde'],
item['anzahlstarter'],
item['winningtimecombined'],
item['pricemoney1'],
item['pricemoney2'],
item['pricemoney3'],
item['pricemoney4'],
item['pricemoney5'],
item['pricemoney6'],
item['pricemoney7'],
item['pricemoney8']
))
self.connection.commit()
return item
class HorsesPipeline(object):
def open_spider(self, spider):
hostname = 'localhost'
username = 'postgres'
password = '********'
database = 'horseracing'
port = "****"
self.connection = psycopg2.connect(host=hostname, user=username, password=password, dbname=database, port=port)
self.cur = self.connection.cursor()
def close_spider(self, spider):
self.cur.close()
self.connection.close()
def process_item(self, item, spider):
self.cur.execute("insert into participants(pos, draw, dwinner, dnext, startnumber, pferde, horsecountry, odd, jockey, trainer, weightkg, alter, headgear, officalrating, rp, ts, rprc) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
(
item['pos'][0],
item['draw'],
item['dwinner'],
item['dnext'],
item['startnumber'],
item['pferde'],
item['horsecountry'],
item['odd'],
item['jockey'],
item['trainer'],
item['weightkg'],
item['alter'],
item['headgear'],
item['officalrating'],
item['rp'],
item['ts'],
item['rprc']
))
self.connection.commit()
return item
以及管道设置:
ITEM_PIPELINES = {
'results.pipelines.RacesPipeline': 100,
'results.pipelines.HorsesPipeline':200,
}
如果我运行代码,我得到错误 第 33 行,在 process_item 项目['track'][0], 键错误:'track'
但是当我不尝试将两个 table 插入串在一起,而是单独测试它们时,它们 运行 就好了。并且,它插入第一个 table 就好了,即使上面的错误提示其他
我知道我只是想念某事。把它们加在一起,但我想不通
我看到你有两种不同的物品类型。您需要检查 process_item
中的项目实例并使用不同的 insert
:
from your_spider.items import RaceItem, ParticipantItem # use actual names here
if isinstance(item, RaceItem):
# insert into race
单个 process_item
将适用于两个不同的表,无需创建第二个 class
。