Scrapy gathers data, but does not save it into the item

I have built a spider that gets stock data for a given stock from as many pages as that stock has (this could be 1 page of stock data, or 20 pages from Yahoo! Finance). It crawls all the pages fine and collects all the data the way it should. However, for some reason it won't save any of that data into the actual Scrapy item, so that I can export it to CSV.

See the update at the bottom of this question!

I'll show you the code first, and then a sample of the printed output:

Code:

import numpy

from scrapy import Spider, Request
from scrapy.exceptions import CloseSpider

# Website is the item class from the project's items.py (import not shown);
# initial_ending, iel, data_intervals, Risk_free_rate and counter are
# presumably module-level variables defined elsewhere in the project.

class DmozSpider(Spider):

    name = "dnot"
    allowed_domains = ["finance.yahoo.com", "http://eoddata.com/"]
    start_urls = ['http://finance.yahoo.com/q?s=CAT']

    def stocks1(self, response):

        current_page = response.url
        print current_page
        # If the link is not the same as the first page, ie. stocks1 is requested through stocks2, get the stock data from stocks2
        if initial_ending not in current_page[-iel:]:
            returns_pages = response.meta.get('returns_pages')
            # Remove the last stock price from the stock list, because it is the same as the first on the new list
            if not not returns_pages:
                if len(returns_pages) > 2:
                    returns_pages = returns_pages[:-1]
        else:
            # Else, if the link does match that of the first page, create a new list because one does not exist yet
            returns_pages = []

        # This grabs the stock data from the page
        rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
        print "stocks1"
        print returns_pages
        for row in rows:
            cells = row.xpath('.//td/text()').extract()
            try:
                values = cells[-1]
                try:
                    float(values)
                    # And adds it to returns_pages
                    returns_pages.append(values)
                except ValueError:
                    continue
            except IndexError:  # row had no <td> text
                continue
        print "after"
        print returns_pages 

        # exp determines if there is a 'Next page' or not
        exp = response.xpath('//td[@align="right"]/a[@rel="next"]').extract()
        # If there is a 'Next Page':
        if not not exp: 
            # And this is the first page:
            if initial_ending in current_page[-iel:]:
                #create necessary url for the 2nd page
                next_page = current_page + "&z=66&y=66"
            # If this is not the first page
            else:
                # This increases the end of the link by 66, thereby getting the next 66 results on for pages 2 and after
                u = int(current_page[-6:].split("=",1)[1])
                o = len(str(u))
                u += 66 
                next_page = current_page[:-o] + str(u)
                print next_page, "66&y in curr_page"
            # Then go on to self.stocks2 to get more data on the next page
            yield Request(next_page, self.stocks2, meta={'returns_pages': returns_pages})
        # Else, if there is no 'Next Link'
        else: 
            # Send the returns to finalize_stock to be saved in the item
            yield Request(current_page, self.finalize_stock, meta={'returns_pages': returns_pages})

    def stocks2(self, response):

        # Prints the link of the current url
        current_page = response.url
        print current_page

        # Gets the returns from the previous page
        returns_pages = response.meta.get('returns_pages')
        # Removes the last return from the previous page because it will be a duplicate
        returns_pages = returns_pages[:-1]
        print "stocks2"
        print returns_pages
        # Gets all of the returns on the page
        rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
        for row in rows:
            cells = row.xpath('.//td/text()').extract()
            try:
                values = cells[-1]
                try:
                    float(values)
                    # And adds it to the previous returns
                    returns_pages.append(values)
                except ValueError:
                    continue
            except IndexError:  # row had no <td> text
                continue

        print "after 2"
        print returns_pages

        # exp determines if there is a 'Next page' or not
        exp = response.xpath('//td[@align="right"]/a[@rel="next"]').extract() 
        # If there is a 'Next Page':
        if not not exp:
            # And somehow, this is the first page (should never be true)
            if initial_ending in current_page[-iel:]:
                # Add necessary link to go to the second page
                next_page = current_page + "&z=66&y=66"
                print next_page, "66&y not in curr_page"
            # Else, this is not the first page (should always be true)
            else:
                # add 66 to the last number on the preceding link in order to access the second or later pages
                u = int(current_page[-6:].split("=",1)[1])
                o = len(str(u))
                u += 66 
                next_page = current_page[:-o] + str(u)
                print next_page, "66&y in curr_page"
            # go back to self.stocks1 to get more data on the next page
            yield Request(next_page, self.stocks1, meta={'returns_pages': returns_pages}) 
        else: 
            # If there is no "Next" link, send the returns to finalize_stock to be saved in the item
            yield Request(current_page, self.finalize_stock, meta={'returns_pages': returns_pages}) 
            print "sending to finalize stock"

    def finalize_stock(self,response):

        unformatted_returns = response.meta.get('returns_pages')
        returns = [float(i) for i in unformatted_returns]
        global required_amount_of_returns, counter
        if counter == 1 and "CAT" in response.url:
            required_amount_of_returns = len(returns)
        elif required_amount_of_returns == 0:
            raise CloseSpider("'Error with initiating required amount of returns'")

        counter += 1
        print counter

        # Iterator to calculate Rate of return 
        # ====================================
        if data_intervals == "m": 
            k = 12
        elif data_intervals == "w":
            k = 4
        else: 
            k = 30

        sub_returns_amount = required_amount_of_returns - k
        sub_returns = returns[:sub_returns_amount]
        rate_of_return = []
        RFR = Risk_free_rate

        # Make sure list is exact length, otherwise rate_of_return will be inaccurate
        # Returns has not been checked by pipeline yet, so small lists will be in the variable

        if len(returns) == required_amount_of_returns or "CAT" in response.url:
            for number in sub_returns:
                numerator = number - returns[k]
                rate = numerator/returns[k]
                if rate == '': 
                    rate = 0
                rate_of_return.append(rate)
                k += 1

        item = Website()
        items = []
        item['url'] = response.url
        item['name'] = response.xpath('//div[@class="title"]/h2/text()').extract()
        item['avg_returns'] = numpy.average(rate_of_return)
        item['var_returns'] = numpy.cov(rate_of_return)
        item['sd_returns'] = numpy.std(rate_of_return)
        item['returns'] = unformatted_returns
        item['rate_of_returns'] = rate_of_return
        item['exchange'] = response.xpath('//span[@class="rtq_exch"]/text()').extract()
        item['ind_sharpe'] = ((numpy.average(rate_of_return) - RFR) / numpy.std(rate_of_return))
        items.append(item)
        yield item

I have tried to comment everything so that anyone reading this can follow it.

How it works:

Essentially, it goes to a given stock and grabs the stock data displayed on the first page. Then it checks whether there is a 'next page' link. If there is, it hands off to stocks2; if there is yet another next page after that, stocks2 hands back to stocks1, and they keep alternating until there are no more pages. Once there are no more pages, the data is sent to finalize_stock, which is supposed to save all of it (along with some derived data that doesn't matter for this question).
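The Website item that finalize_stock fills in is defined in the project's items.py, which isn't shown in the question; judging from the fields assigned in finalize_stock, its definition presumably looks something like this (a reconstruction, not the original file):

import scrapy

class Website(scrapy.Item):
    url = scrapy.Field()
    name = scrapy.Field()
    avg_returns = scrapy.Field()
    var_returns = scrapy.Field()
    sd_returns = scrapy.Field()
    returns = scrapy.Field()
    rate_of_returns = scrapy.Field()
    exchange = scrapy.Field()
    ind_sharpe = scrapy.Field()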

Output: (small sample)

# print current_page
http://finance.yahoo.com/q/hp?s=PZA.TO&a=04&b=19&c=2005&d=04&e=19&f=2006&g=d&z=66&y=198
print "stocks 2"
stocks2
# print returns_pages | before scraped this page
[u'4.75', u'4.78', u'4.78', u'4.83', u'4.87', u'4.90', u'4.90', u'4.97', u'4.99', u'4.92', u'4.95', u'4.90', u'4.90', u'4.93', u'4.92', u'4.90', u'4.89', u'4.88', u'4.95', u'4.90', u'4.95', u'4.95', u'4.95', u'4.90', u'4.90', u'4.90', u'4.90', u'4.95', u'4.91', u'4.91', u'4.90', u'4.92', u'4.92', u'4.92', u'4.91', u'4.92', u'4.91', u'4.91', u'4.90', u'4.92', u'4.95', u'4.95', u'4.85', u'4.83', u'4.91', u'4.90', u'4.92', u'4.95', u'4.95', u'4.92', u'4.92', u'4.92', u'4.87', u'4.88', u'4.92', u'4.90', u'4.94', u'4.90', u'4.97', u'4.97', u'4.94', u'4.90', u'4.92', u'4.83', u'4.83', u'4.83', u'4.83', u'4.75', u'4.75', u'4.78', u'4.80', u'4.81', u'4.81', u'4.84', u'4.80', u'4.78', u'4.83', u'4.80', u'4.83', u'4.81', u'4.87', u'4.97', u'5.02', u'4.97', u'4.91', u'4.91', u'4.92', u'4.90', u'4.90', u'4.83', u'4.79', u'4.83', u'4.83', u'4.83', u'4.83', u'4.77', u'4.78', u'4.81', u'4.83', u'4.83', u'4.85', u'4.80', u'4.87', u'4.87', u'4.87', u'4.87', u'4.83', u'4.83', u'4.83', u'4.87', u'4.85', u'4.83', u'4.83', u'4.87', u'4.83', u'4.83', u'4.85', u'4.83', u'4.80', u'4.80', u'4.78', u'4.71', u'4.72', u'4.71', u'4.73', u'4.71', u'4.54', u'4.29', u'4.25', u'4.23', u'4.25', u'4.27', u'4.25', u'4.30', u'4.32', u'4.30', u'4.30', u'4.27', u'4.27', u'4.22', u'4.13', u'4.12', u'4.15', u'4.15', u'4.22', u'4.22', u'4.27', u'4.27', u'4.27', u'4.25', u'4.46', u'4.22', u'4.39', u'4.37', u'4.36', u'4.57', u'4.63', u'4.68', u'4.68', u'4.67', u'4.73', u'4.80', u'4.83', u'4.84', u'4.84', u'4.85', u'4.85', u'4.81', u'4.78', u'4.83', u'5.06', u'5.09', u'5.02', u'5.12', u'5.09', u'5.12', u'5.14', u'5.07', u'5.06', u'4.99', u'5.00', u'4.97', u'4.98', u'4.98', u'4.95', u'4.92', u'4.98', u'4.92', u'4.93', u'4.93', u'4.95', u'4.94', u'4.92', u'4.90', u'4.85']
# print "after 2"
after 2
# print returns_pages | after this page was scraped
[u'4.75', u'4.78', u'4.78', u'4.83', u'4.87', u'4.90', u'4.90', u'4.97', u'4.99', u'4.92', u'4.95', u'4.90', u'4.90', u'4.93', u'4.92', u'4.90', u'4.89', u'4.88', u'4.95', u'4.90', u'4.95', u'4.95', u'4.95', u'4.90', u'4.90', u'4.90', u'4.90', u'4.95', u'4.91', u'4.91', u'4.90', u'4.92', u'4.92', u'4.92', u'4.91', u'4.92', u'4.91', u'4.91', u'4.90', u'4.92', u'4.95', u'4.95', u'4.85', u'4.83', u'4.91', u'4.90', u'4.92', u'4.95', u'4.95', u'4.92', u'4.92', u'4.92', u'4.87', u'4.88', u'4.92', u'4.90', u'4.94', u'4.90', u'4.97', u'4.97', u'4.94', u'4.90', u'4.92', u'4.83', u'4.83', u'4.83', u'4.83', u'4.75', u'4.75', u'4.78', u'4.80', u'4.81', u'4.81', u'4.84', u'4.80', u'4.78', u'4.83', u'4.80', u'4.83', u'4.81', u'4.87', u'4.97', u'5.02', u'4.97', u'4.91', u'4.91', u'4.92', u'4.90', u'4.90', u'4.83', u'4.79', u'4.83', u'4.83', u'4.83', u'4.83', u'4.77', u'4.78', u'4.81', u'4.83', u'4.83', u'4.85', u'4.80', u'4.87', u'4.87', u'4.87', u'4.87', u'4.83', u'4.83', u'4.83', u'4.87', u'4.85', u'4.83', u'4.83', u'4.87', u'4.83', u'4.83', u'4.85', u'4.83', u'4.80', u'4.80', u'4.78', u'4.71', u'4.72', u'4.71', u'4.73', u'4.71', u'4.54', u'4.29', u'4.25', u'4.23', u'4.25', u'4.27', u'4.25', u'4.30', u'4.32', u'4.30', u'4.30', u'4.27', u'4.27', u'4.22', u'4.13', u'4.12', u'4.15', u'4.15', u'4.22', u'4.22', u'4.27', u'4.27', u'4.27', u'4.25', u'4.46', u'4.22', u'4.39', u'4.37', u'4.36', u'4.57', u'4.63', u'4.68', u'4.68', u'4.67', u'4.73', u'4.80', u'4.83', u'4.84', u'4.84', u'4.85', u'4.85', u'4.81', u'4.78', u'4.83', u'5.06', u'5.09', u'5.02', u'5.12', u'5.09', u'5.12', u'5.14', u'5.07', u'5.06', u'4.99', u'5.00', u'4.97', u'4.98', u'4.98', u'4.95', u'4.92', u'4.98', u'4.92', u'4.93', u'4.93', u'4.95', u'4.94', u'4.92', u'4.90', u'4.85', u'4.85', u'4.86', u'4.92', u'4.93', u'4.92', u'4.95', u'4.93', u'4.94', u'4.95', u'4.96', u'4.95', u'4.95', u'4.95', u'4.95', u'4.98', u'4.97', u'4.92', u'4.94', u'4.90', u'4.93', u'4.93', u'4.97', u'4.97', u'4.97', u'4.90', u'5.00', u'5.02', u'5.11', u'5.12', u'5.12']
2015-05-25 17:41:46-0700 [dnot] DEBUG: Crawled (200) <GET http://finance.yahoo.com/lookup?s=PVS.PR.D.TO> (referer: http://eoddata.com/stocklist/TSX/P.htm)
2015-05-25 17:41:46-0700 [dnot] DEBUG: Redirecting (301) to <GET http://finance.yahoo.com/lookup?s=PUD.B.TO> from <GET http://finance.yahoo.com/lookup;_ylc=X3oDMTF2cTUxaTdhBGtleXcDUFVELkIuVE8EbWlkA21lZGlhcXVvdGVzc2VhcmNoBHNlYwNnZXRxdW90ZXNidG4Ec2xrA2xvb2t1cA--?s=PUD.B.TO>
sending to finalize stock # See here, it does call the finalize_stock function

However, nothing gets saved into the item. Normally Scrapy prints the item as it is saved, but it isn't doing that here, and I don't understand why.

If you need any other information, just ask and I'll post it as soon as I can.

UPDATE: Problem found, but not solved:

Previously, in def stocks1 and def stocks2 I had:

        else:
            # If there is no "Next" link, send the returns to finalize_stock to be saved in the item
            yield Request(current_page, self.finalize_stock, meta={'returns_pages': returns_pages})
            print "sending to finalize stock"

at the very bottom of each function, which basically means that when there is no next page, the information is sent to finalize_stock to be saved. The print "sending to finalize stock" line does get printed, but:

def finalize_stock(self, response):

    print "====================="
    print "finalize_stock called"
    print "====================="

is never printed! So for some reason, def finalize_stock never runs, and I don't know why.

Your code looks very complex and hard to debug. I don't think the multiple callbacks and the calculations for building the 'next page' link are necessary.

A lot of it should be simplified so that it's easier to debug. Have a look at the following (tested) code and feel free to use whatever parts are helpful:

import scrapy

class ValueItem(scrapy.Item):
    value = scrapy.Field()

class StockSpider(scrapy.Spider):

    name = "yahoo_stock_spider"
    allowed_domains = ['finance.yahoo.com']
    start_urls = ['http://finance.yahoo.com/q/hp?s=CAT&a=00&b=1&c=2015&d=04&e=26&f=2015&g=d' ]

    def parse(self, response):

        if 'item' in response.meta:
            # If the response contains an 'item' from a previous page, unwrap it
            item = response.meta['item']
        else:
            # if it contains no such item, it's the first page, so let's create it
            item = ValueItem()
            item['value'] = ['']


        # Loop over the table rows
        rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table//tr')
        for row in rows[1:]:

            cell_values = row.xpath('.//td/text()').extract()
            item['value'] = item['value'] + [cell_values[-1]]


        # Check if there is a 'Next' link
        xpath_Next_Page = './/a[contains(.,"Next")]/@href'
        if response.xpath(xpath_Next_Page):
            # No need to calculate offset values. Just take the link ...
            next_page_href = response.xpath(xpath_Next_Page).extract()[0]
            url_next_page = 'http://finance.yahoo.com' + next_page_href
            # ... build the request ...
            request = scrapy.Request(url_next_page, callback=self.parse)
            # ... and add the item with the collected values to the request
            request.meta['item'] = item
            yield request
        else:
            # No more 'Next'
            # here simple output of uncleaned values
            yield item
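As a side note on why your original finalize_stock probably never ran (this is a guess, not something verified against your project): the last thing stocks1 and stocks2 do is yield a Request for current_page, a URL that has just been crawled, and by default Scrapy's duplicate-request filter silently drops requests for URLs it has already seen. The code above avoids the problem entirely by yielding the item directly instead of re-requesting the page. If you do keep your original structure, you can tell Scrapy not to filter that final request:

yield Request(current_page, self.finalize_stock,
              meta={'returns_pages': returns_pages},
              dont_filter=True)

Once items are actually yielded, you can export them to CSV straight from the command line, without any extra pipeline code:

scrapy crawl yahoo_stock_spider -o values.csv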