python scrapy 只拉第一行，但重复 table 中正确数量的项目

Question

如标题所说。我的 scrapy 代码看起来可以正常运行，只是它只提取了 table 的第一行，并按 table 中的行数重复输出这一行。

import scrapy


class FightersSpider(scrapy.Spider):
    """Spider from the question: yields one dict per row of the fighters table.

    As written, every yielded dict contains the values of the first table
    row, repeated once per row found.
    """

    name = "fighters"

    start_urls = [
        'http://www.ufcstats.com/statistics/fighters?char=a&page=all'
    ]

    def start_requests(self):
        """Yield a request for each URL in the local list, handled by ``parse``."""
        urls = [
            'http://www.ufcstats.com/statistics/fighters?char=a&page=all'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)


    def parse(self, response, **kwargs):
        """Yield a dict of ten fields for every ``tr`` in the statistics table body."""
        for fighter in response.xpath('//*[@class="b-statistics__table"]//tbody/tr'):
            # NOTE: each expression below starts with '//' rather than './/',
            # so it is an absolute XPath evaluated against the whole document
            # instead of the current row -- this is why the first row repeats.
            yield {
                'first': fighter.xpath('//td[1]/a//text()').extract_first(),
                'last': fighter.xpath('//td[2]/a//text()').extract_first(),
                'nickname': fighter.xpath('//td[3]/a//text()').extract_first(),
                'height': fighter.xpath('//td[4]//text()').extract_first().strip(),
                'weight': fighter.xpath('//td[5]//text()').extract_first().strip(),
                'reach': fighter.xpath('//td[6]//text()').extract_first().strip(),
                'stance': fighter.xpath('//td[7]//text()').extract_first().strip(),
                'wins': fighter.xpath('//td[8]//text()').extract_first().strip(),
                'losses': fighter.xpath('//td[9]//text()').extract_first().strip(),
                'draws': fighter.xpath('//td[10]//text()').extract_first().strip(),
            }

如果我去掉 `_first`（即改用 `extract()`），它会提取出所有数据，但会把它们全部放进同一个单元格，并且同样按行重复。

first   last    nickname    height  weight  reach   stance  wins    losses
Tom Aaron   The Assassin    --  155 lbs.    --      5   3
Tom Aaron   The Assassin    --  155 lbs.    --      5   3
Tom Aaron   The Assassin    --  155 lbs.    --      5   3
Tom Aaron   The Assassin    --  155 lbs.    --      5   3
Tom Aaron   The Assassin    --  155 lbs.    --      5   3
Tom Aaron   The Assassin    --  155 lbs.    --      5   3
Tom Aaron   The Assassin    --  155 lbs.    --      5   3
Tom Aaron   The Assassin    --  155 lbs.    --      5   3
....

Answer 1

您必须使用相对 XPath（relative xpath），才能只在 fighter 内部搜索——表达式必须以点号（dot，即 `.`）开头：

fighter.xpath('.//td[1]/a//text()')

如果没有点号，它就是绝对 XPath（absolute xpath），会在整个 HTML 中搜索，因此总是找到第一行。

但是你会遇到其他问题。

您会得到 table 中的所有行——包括没有 td 的 header 行——所以必须跳过它。您可以使用 `[1:]` 对结果进行切片：

for fighter in response.xpath(...)[1:]:

最小可运行代码（minimal working code）：

您可以将所有内容复制到一个文件中，然后作为普通脚本用 `python script.py` 启动它，而无需在 scrapy 中创建项目。

import scrapy


class FightersSpider(scrapy.Spider):
    """Scrape the fighters statistics table, yielding one dict per table row.

    Each yielded dict has the keys ``first``, ``last``, ``nickname``,
    ``height``, ``weight``, ``reach``, ``stance``, ``wins``, ``losses`` and
    ``draws``.
    """

    name = "fighters"

    start_urls = [
        'http://www.ufcstats.com/statistics/fighters?char=a&page=all'
    ]

    def start_requests(self):
        """Yield one request per entry in ``start_urls``, handled by ``parse``."""
        # Reuse ``start_urls`` instead of keeping a second copy of the list.
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    @staticmethod
    def _link_text(row, index):
        """Return the first text node inside the link of cell ``index``, or None."""
        return row.xpath('.//td[{}]/a//text()'.format(index)).get()

    @staticmethod
    def _cell_text(row, index):
        """Return the stripped first text node of cell ``index``.

        Falls back to an empty string when the cell has no text node, so a
        sparse row cannot raise ``AttributeError`` on ``None.strip()``.
        """
        return row.xpath('.//td[{}]//text()'.format(index)).get(default='').strip()

    def parse(self, response, **kwargs):
        """Yield a dict of fighter fields for every data row of the table.

        All cell lookups use relative XPaths (leading ``.``) so they are
        resolved inside the current row rather than the whole document.
        """
        rows = response.xpath('//*[@class="b-statistics__table"]//tbody/tr')
        # The first matched row carries no fighter data, so it is skipped.
        for fighter in rows[1:]:
            yield {
                'first': self._link_text(fighter, 1),
                'last': self._link_text(fighter, 2),
                'nickname': self._link_text(fighter, 3),
                'height': self._cell_text(fighter, 4),
                'weight': self._cell_text(fighter, 5),
                'reach': self._cell_text(fighter, 6),
                'stance': self._cell_text(fighter, 7),
                'wins': self._cell_text(fighter, 8),
                'losses': self._cell_text(fighter, 9),
                'draws': self._cell_text(fighter, 10),
            }
            
# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

# Only start the crawl when this file is executed directly
# (`python script.py`); importing the module must not launch a crawl.
if __name__ == '__main__':
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })
    process.crawl(FightersSpider)
    process.start()  # per the Scrapy docs, blocks until crawling is finished

python scrapy 只拉第一行，但重复 table 中正确数量的项目

python scrapy only pulling first row but repeating for the correct amount of items in the table

python

scrapy