Scrapy gathers data, but does not save it into the item
I've built a spider that gets stock data for a given stock from however many pages the stock has (this can be one page of stock data, or twenty, from Yahoo! Finance). It crawls all of the pages fine and gathers all of the data the way it should. However, for some reason it won't save any of the data into the actual scrapy item, so that I could export it to csv.
See the update at the bottom of this question!
I'll show you the code first, and then a sample of what gets printed:
Code:
class DmozSpider(Spider):
    name = "dnot"
    allowed_domains = ["finance.yahoo.com", "http://eoddata.com/"]
    start_urls = ['http://finance.yahoo.com/q?s=CAT']

    def stocks1(self, response):
        current_page = response.url
        print current_page
        # If the link is not the same as the first page, i.e. stocks1 is requested through stocks2, get the stock data from stocks2
        if initial_ending not in current_page[-iel:]:
            returns_pages = response.meta.get('returns_pages')
            # Remove the last stock price from the stock list, because it is the same as the first one on the new list
            if not not returns_pages:
                if len(returns_pages) > 2:
                    returns_pages = returns_pages[:-1]
        else:
            # Else, if the link does match that of the first page, create a new list because one does not exist yet
            returns_pages = []
        # This grabs the stock data from the page
        rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
        print "stocks1"
        print returns_pages
        for row in rows:
            cells = row.xpath('.//td/text()').extract()
            try:
                values = cells[-1]
                try:
                    float(values)
                    # And adds it to returns_pages
                    returns_pages.append(values)
                except ValueError:
                    continue
            except ValueError:
                continue
        print "after"
        print returns_pages
        # exp determines whether there is a 'Next page' or not
        exp = response.xpath('//td[@align="right"]/a[@rel="next"]').extract()
        # If there is a 'Next Page':
        if not not exp:
            # And this is the first page:
            if initial_ending in current_page[-iel:]:
                # create the necessary url for the 2nd page
                next_page = current_page + "&z=66&y=66"
            # If this is not the first page
            else:
                # This increases the end of the link by 66, thereby getting the next 66 results for pages 2 and after
                u = int(current_page[-6:].split("=",1)[1])
                o = len(str(u))
                u += 66
                next_page = current_page[:-o] + str(u)
                print next_page, "66&y in curr_page"
            # Then go on to self.stocks2 to get more data on the next page
            yield Request(next_page, self.stocks2, meta={'returns_pages': returns_pages})
        # Else, if there is no 'Next' link
        else:
            # Send the returns to finalize_stock to be saved in the item
            yield Request(current_page, self.finalize_stock, meta={'returns_pages': returns_pages})

    def stocks2(self, response):
        # Prints the link of the current url
        current_page = response.url
        print current_page
        # Gets the returns from the previous page
        returns_pages = response.meta.get('returns_pages')
        # Removes the last return from the previous page because it will be a duplicate
        returns_pages = returns_pages[:-1]
        print "stocks2"
        print returns_pages
        # Gets all of the returns on the page
        rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
        for row in rows:
            cells = row.xpath('.//td/text()').extract()
            try:
                values = cells[-1]
                try:
                    float(values)
                    # And adds it to the previous returns
                    returns_pages.append(values)
                except ValueError:
                    continue
            except ValueError:
                continue
        print "after 2"
        print returns_pages
        # exp determines whether there is a 'Next page' or not
        exp = response.xpath('//td[@align="right"]/a[@rel="next"]').extract()
        # If there is a 'Next Page':
        if not not exp:
            # And somehow, this is the first page (should never be true)
            if initial_ending in current_page[-iel:]:
                # Add the necessary link to go to the second page
                next_page = current_page + "&z=66&y=66"
                print next_page, "66&y not in curr_page"
            # Else, this is not the first page (should always be true)
            else:
                # add 66 to the last number on the preceding link in order to access the second or later pages
                u = int(current_page[-6:].split("=",1)[1])
                o = len(str(u))
                u += 66
                next_page = current_page[:-o] + str(u)
                print next_page, "66&y in curr_page"
            # go back to self.stocks1 to get more data on the next page
            yield Request(next_page, self.stocks1, meta={'returns_pages': returns_pages})
        else:
            # If there is no "Next" link, send the returns to finalize_stock to be saved in the item
            yield Request(current_page, self.finalize_stock, meta={'returns_pages': returns_pages})
            print "sending to finalize stock"

    def finalize_stock(self, response):
        unformatted_returns = response.meta.get('returns_pages')
        returns = [float(i) for i in unformatted_returns]
        global required_amount_of_returns, counter
        if counter == 1 and "CAT" in response.url:
            required_amount_of_returns = len(returns)
        elif required_amount_of_returns == 0:
            raise CloseSpider("'Error with initiating required amount of returns'")
        counter += 1
        print counter
        # Iterator to calculate Rate of return
        # ====================================
        if data_intervals == "m":
            k = 12
        elif data_intervals == "w":
            k = 4
        else:
            k = 30
        sub_returns_amount = required_amount_of_returns - k
        sub_returns = returns[:sub_returns_amount]
        rate_of_return = []
        RFR = Risk_free_rate
        # Make sure the list is the exact length, otherwise rate_of_return will be inaccurate
        # Returns has not been checked by the pipeline yet, so small lists will be in the variable
        if len(returns) == required_amount_of_returns or "CAT" in response.url:
            for number in sub_returns:
                numerator = number - returns[k]
                rate = numerator/returns[k]
                if rate == '':
                    rate = 0
                rate_of_return.append(rate)
                k += 1
        item = Website()
        items = []
        item['url'] = response.url
        item['name'] = response.xpath('//div[@class="title"]/h2/text()').extract()
        item['avg_returns'] = numpy.average(rate_of_return)
        item['var_returns'] = numpy.cov(rate_of_return)
        item['sd_returns'] = numpy.std(rate_of_return)
        item['returns'] = unformatted_returns
        item['rate_of_returns'] = rate_of_return
        item['exchange'] = response.xpath('//span[@class="rtq_exch"]/text()').extract()
        item['ind_sharpe'] = ((numpy.average(rate_of_return) - RFR) / numpy.std(rate_of_return))
        items.append(item)
        yield item
I've tried to comment everything so that anyone reading this can follow it.
How it works:
Essentially, the spider goes to a given stock and takes the stock data shown on the first page. It then checks whether there is a 'next page' link. If there is, it passes the data on to stocks2; if there is yet another next page, it passes back to stocks1, and it keeps alternating like this until there are no more pages. Once there are no more pages, it sends the data to finalize_stock, which is supposed to save all of that data, together with data from manipulations that don't matter for the purposes of this question.
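In case the hand-off between callbacks is hard to follow in the full code above, here is a stripped-down, self-contained sketch of the same pattern (the spider name, URL and XPath expressions are purely illustrative and not part of my project, and it assumes a Scrapy version that accepts plain dicts as items): whatever has been collected so far rides along in the request's meta dict and comes back out of response.meta in the next callback.

import scrapy

class ChainSketchSpider(scrapy.Spider):
    name = "chain_sketch"  # illustrative name only
    start_urls = ['http://finance.yahoo.com/q?s=CAT']

    def parse(self, response):
        # Pick up whatever earlier pages collected (empty list on the first page)
        collected = response.meta.get('returns_pages', [])
        collected += response.xpath('//td/text()').extract()
        next_href = response.xpath('//a[@rel="next"]/@href').extract()
        if next_href:
            # Carry the accumulated list along to the next page's callback
            yield scrapy.Request('http://finance.yahoo.com' + next_href[0],
                                 callback=self.parse,
                                 meta={'returns_pages': collected})
        else:
            # Last page: emit everything gathered across all pages
            yield {'returns_pages': collected}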
Output: (small sample)
print current_page
http://finance.yahoo.com/q/hp?s=PZA.TO&a=04&b=19&c=2005&d=04&e=19&f=2006&g=d&z=66&y=198
print "stocks 2"
stocks2
# print returns_pages | before this page was scraped
[u'4.75', u'4.78', u'4.78', u'4.83', u'4.87', u'4.90', u'4.90', u'4.97', u'4.99', u'4.92', u'4.95', u'4.90', u'4.90', u'4.93', u'4.92', u'4.90', u'4.89', u'4.88', u'4.95', u'4.90', u'4.95', u'4.95', u'4.95', u'4.90', u'4.90', u'4.90', u'4.90', u'4.95', u'4.91', u'4.91', u'4.90', u'4.92', u'4.92', u'4.92', u'4.91', u'4.92', u'4.91', u'4.91', u'4.90', u'4.92', u'4.95', u'4.95', u'4.85', u'4.83', u'4.91', u'4.90', u'4.92', u'4.95', u'4.95', u'4.92', u'4.92', u'4.92', u'4.87', u'4.88', u'4.92', u'4.90', u'4.94', u'4.90', u'4.97', u'4.97', u'4.94', u'4.90', u'4.92', u'4.83', u'4.83', u'4.83', u'4.83', u'4.75', u'4.75', u'4.78', u'4.80', u'4.81', u'4.81', u'4.84', u'4.80', u'4.78', u'4.83', u'4.80', u'4.83', u'4.81', u'4.87', u'4.97', u'5.02', u'4.97', u'4.91', u'4.91', u'4.92', u'4.90', u'4.90', u'4.83', u'4.79', u'4.83', u'4.83', u'4.83', u'4.83', u'4.77', u'4.78', u'4.81', u'4.83', u'4.83', u'4.85', u'4.80', u'4.87', u'4.87', u'4.87', u'4.87', u'4.83', u'4.83', u'4.83', u'4.87', u'4.85', u'4.83', u'4.83', u'4.87', u'4.83', u'4.83', u'4.85', u'4.83', u'4.80', u'4.80', u'4.78', u'4.71', u'4.72', u'4.71', u'4.73', u'4.71', u'4.54', u'4.29', u'4.25', u'4.23', u'4.25', u'4.27', u'4.25', u'4.30', u'4.32', u'4.30', u'4.30', u'4.27', u'4.27', u'4.22', u'4.13', u'4.12', u'4.15', u'4.15', u'4.22', u'4.22', u'4.27', u'4.27', u'4.27', u'4.25', u'4.46', u'4.22', u'4.39', u'4.37', u'4.36', u'4.57', u'4.63', u'4.68', u'4.68', u'4.67', u'4.73', u'4.80', u'4.83', u'4.84', u'4.84', u'4.85', u'4.85', u'4.81', u'4.78', u'4.83', u'5.06', u'5.09', u'5.02', u'5.12', u'5.09', u'5.12', u'5.14', u'5.07', u'5.06', u'4.99', u'5.00', u'4.97', u'4.98', u'4.98', u'4.95', u'4.92', u'4.98', u'4.92', u'4.93', u'4.93', u'4.95', u'4.94', u'4.92', u'4.90', u'4.85']
# print "after 2"
after 2
# print returns_pages | after this page was scraped
[u'4.75', u'4.78', u'4.78', u'4.83', u'4.87', u'4.90', u'4.90', u'4.97', u'4.99', u'4.92', u'4.95', u'4.90', u'4.90', u'4.93', u'4.92', u'4.90', u'4.89', u'4.88', u'4.95', u'4.90', u'4.95', u'4.95', u'4.95', u'4.90', u'4.90', u'4.90', u'4.90', u'4.95', u'4.91', u'4.91', u'4.90', u'4.92', u'4.92', u'4.92', u'4.91', u'4.92', u'4.91', u'4.91', u'4.90', u'4.92', u'4.95', u'4.95', u'4.85', u'4.83', u'4.91', u'4.90', u'4.92', u'4.95', u'4.95', u'4.92', u'4.92', u'4.92', u'4.87', u'4.88', u'4.92', u'4.90', u'4.94', u'4.90', u'4.97', u'4.97', u'4.94', u'4.90', u'4.92', u'4.83', u'4.83', u'4.83', u'4.83', u'4.75', u'4.75', u'4.78', u'4.80', u'4.81', u'4.81', u'4.84', u'4.80', u'4.78', u'4.83', u'4.80', u'4.83', u'4.81', u'4.87', u'4.97', u'5.02', u'4.97', u'4.91', u'4.91', u'4.92', u'4.90', u'4.90', u'4.83', u'4.79', u'4.83', u'4.83', u'4.83', u'4.83', u'4.77', u'4.78', u'4.81', u'4.83', u'4.83', u'4.85', u'4.80', u'4.87', u'4.87', u'4.87', u'4.87', u'4.83', u'4.83', u'4.83', u'4.87', u'4.85', u'4.83', u'4.83', u'4.87', u'4.83', u'4.83', u'4.85', u'4.83', u'4.80', u'4.80', u'4.78', u'4.71', u'4.72', u'4.71', u'4.73', u'4.71', u'4.54', u'4.29', u'4.25', u'4.23', u'4.25', u'4.27', u'4.25', u'4.30', u'4.32', u'4.30', u'4.30', u'4.27', u'4.27', u'4.22', u'4.13', u'4.12', u'4.15', u'4.15', u'4.22', u'4.22', u'4.27', u'4.27', u'4.27', u'4.25', u'4.46', u'4.22', u'4.39', u'4.37', u'4.36', u'4.57', u'4.63', u'4.68', u'4.68', u'4.67', u'4.73', u'4.80', u'4.83', u'4.84', u'4.84', u'4.85', u'4.85', u'4.81', u'4.78', u'4.83', u'5.06', u'5.09', u'5.02', u'5.12', u'5.09', u'5.12', u'5.14', u'5.07', u'5.06', u'4.99', u'5.00', u'4.97', u'4.98', u'4.98', u'4.95', u'4.92', u'4.98', u'4.92', u'4.93', u'4.93', u'4.95', u'4.94', u'4.92', u'4.90', u'4.85', u'4.85', u'4.86', u'4.92', u'4.93', u'4.92', u'4.95', u'4.93', u'4.94', u'4.95', u'4.96', u'4.95', u'4.95', u'4.95', u'4.95', u'4.98', u'4.97', u'4.92', u'4.94', u'4.90', u'4.93', u'4.93', u'4.97', u'4.97', u'4.97', u'4.90', u'5.00', u'5.02', u'5.11', u'5.12', u'5.12']
2015-05-25 17:41:46-0700 [dnot] DEBUG: Crawled (200) <GET http://finance.yahoo.com/lookup?s=PVS.PR.D.TO> (referer: http://eoddata.com/stocklist/TSX/P.htm)
2015-05-25 17:41:46-0700 [dnot] DEBUG: Redirecting (301) to <GET http://finance.yahoo.com/lookup?s=PUD.B.TO> from <GET http://finance.yahoo.com/lookup;_ylc=X3oDMTF2cTUxaTdhBGtleXcDUFVELkIuVE8EbWlkA21lZGlhcXVvdGVzc2VhcmNoBHNlYwNnZXRxdW90ZXNidG4Ec2xrA2xvb2t1cA--?s=PUD.B.TO>
sending to finalize stock # See here, it does call the def finalize_stock function
However, nothing gets saved into the item. Normally scrapy prints out the items as they are saved, but it isn't doing that here, and I can't figure out why.
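As a sanity check (purely illustrative; the pipeline class and the dotted settings path below are hypothetical and not part of my project), a one-method item pipeline would make it easy to see whether yielded items ever reach the pipeline stage at all:

# pipelines.py (hypothetical debugging pipeline)
class PrintItemPipeline(object):
    def process_item(self, item, spider):
        # Called once for every item any spider yields
        print "pipeline saw an item from", spider.name
        return item

# settings.py: enable it (adjust the dotted path to your project name)
# ITEM_PIPELINES = {'myproject.pipelines.PrintItemPipeline': 300}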
If you need any other information, just ask and I'll post it as soon as I can.
UPDATE: problem found, but not solved:
Previously, in def stocks1 and def stocks2 I had:
else:
    # If there is no "Next" link, send the returns to finalize_stock to be saved in the item
    yield Request(current_page, self.finalize_stock, meta={'returns_pages': returns_pages})
    print "sending to finalize stock"
at the very bottom of each function. This basically means that when there is no next page, the information gets sent to finalize_stock to be saved. print "sending to finalize stock" does get printed, but:
def finalize_stock(self, response):
    print "====================="
    print "finalize_stock called"
    print "====================="
never gets printed! So for some reason def finalize_stock never runs, and I can't figure out why.
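The only hunch I have (this is an assumption on my part, not something I've verified): the request that is supposed to reach finalize_stock re-uses current_page, a URL that has already been crawled, and Scrapy's duplicate-request filter silently drops such repeated requests. Request accepts a dont_filter flag that bypasses that filter, so a variant like the sketch below might be worth trying:

# In the final 'else' branch of stocks1 / stocks2, instead of
#     yield Request(current_page, self.finalize_stock, meta={'returns_pages': returns_pages})
# pass dont_filter=True so the scheduler does not drop the request as a duplicate:
yield Request(current_page, self.finalize_stock,
              meta={'returns_pages': returns_pages},
              dont_filter=True)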
Your code looks quite complex and hard to debug. In my opinion, multiple callbacks and the offset calculations aren't needed to build the 'Next' link.
A lot of it should be simplified so that debugging gets easier. Have a look at the following (tested) code and feel free to use any parts that are helpful:
import scrapy

class ValueItem(scrapy.Item):
    value = scrapy.Field()

class StockSpider(scrapy.Spider):
    name = "yahoo_stock_spider"
    allowed_domains = ['finance.yahoo.com']
    start_urls = ['http://finance.yahoo.com/q/hp?s=CAT&a=00&b=1&c=2015&d=04&e=26&f=2015&g=d']

    def parse(self, response):
        if 'item' in response.meta:
            # If the response contains an 'item' from a previous page, unwrap it
            item = response.meta['item']
        else:
            # if it contains no such item, it's the first page, so let's create it
            item = ValueItem()
            item['value'] = ['']
        # Loop over the table rows
        rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table//tr')
        for row in rows[1:]:
            cell_values = row.xpath('.//td/text()').extract()
            item['value'] = item['value'] + [cell_values[-1]]
        # Check if there is a 'Next' link
        xpath_Next_Page = './/a[contains(.,"Next")]/@href'
        if response.xpath(xpath_Next_Page):
            # No need to calculate offset values. Just take the link ...
            next_page_href = response.xpath(xpath_Next_Page).extract()[0]
            url_next_page = 'http://finance.yahoo.com' + next_page_href
            # ... build the request ...
            request = scrapy.Request(url_next_page, callback=self.parse)
            # ... and add the item with the collected values to the request
            request.meta['item'] = item
            yield request
        else:
            # No more 'Next' link
            # here, simply output the uncleaned values
            yield item
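If you want the collected values to end up in a CSV file, as in the original question, one option (just a sketch of the usual feed-export setup, assuming a reasonably recent Scrapy; the output file name is up to you) is to run the spider from a small script and let the built-in CSV exporter write out every yielded item:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({
    'FEED_FORMAT': 'csv',     # use the built-in CSV item exporter
    'FEED_URI': 'values.csv'  # output file; adjust as needed
})
process.crawl(StockSpider)
process.start()  # blocks until the crawl is finished

Running scrapy crawl yahoo_stock_spider -o values.csv inside a Scrapy project achieves the same thing from the command line.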