使用 Scrapy 从 table 中抓取数据
Scraping Data from a table with Scrapy
我是第一次尝试 Scrapy。做了相当多的研究之后，我掌握了基础知识。现在我想抓取一个 table 中的数据，但代码不起作用。请查看下面的源代码。
items.py
from scrapy.item import Item, Field
class Digi(Item):
    """Scrapy item holding the values extracted from one table row.

    Each attribute is a plain ``Field()``; the spider's ``parse`` callback
    fills them in by key (``item['sl']``, ``item['player_name']``, ...).
    """

    # Row identification.
    sl = Field()
    player_name = Field()

    # Dismissal details.
    dismissal_info = Field()
    bowler_name = Field()

    # Numeric columns of the row.
    runs_scored = Field()
    balls_faced = Field()
    minutes_played = Field()
    fours = Field()
    sixes = Field()
    strike_rate = Field()
digicric.py
from scrapy.spider import Spider
from scrapy.selector import Selector
from crawler01.items import Digi
class DmozSpider(Spider):
    """Spider from the question, reproduced with its original behaviour.

    It requests the single URL in ``start_urls`` and builds one ``Digi``
    item per ``<tr>`` of the third table inside the
    ``ctl00_ContentPlaceHolder1_divData`` element.
    """

    name = "digicric"
    allowed_domains = ["digicricket.marssil.com"]
    start_urls = ["http://digicricket.marssil.com/match/MatchData.aspx?op=2&match=1250"]

    def parse(self, response):
        """Return a list with one ``Digi`` item for every selected row."""
        sel = Selector(response)
        sites = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/table[3]/tr')

        # (item key, XPath query) pairs, applied in this order for each row.
        queries = (
            ('sl', 'td/text()'),
            ('player_name', 'td/a/text()'),
            ('dismissal_info', 'td/text()'),
            ('bowler_name', 'td/text()'),
            ('runs_scored', 'td/text()'),
            ('balls_faced', 'td/text()'),
            ('minutes_played', 'td/text()'),
            ('fours', 'td/text()'),
            ('sixes', 'td/text()'),
            ('strike_rate', 'td/text()'),
        )

        items = []
        for site in sites:
            item = Digi()
            # NOTE: as posted in the question, every query is evaluated on
            # ``sel`` (the page-level selector), not on the loop variable
            # ``site``, and the ``td`` steps carry no positional index.
            for key, query in queries:
                item[key] = sel.xpath(query).extract()
            items.append(item)
        return items
我刚刚运行了你用 Scrapy 编写的代码，它运行正常。具体是哪里出了问题？
P.S. 这本应是一条评论，但我还没有足够的声望……如有必要，我会相应地编辑或关闭这个回答。
编辑:
我认为你应该在每次循环结束时执行 `yield item`，而不是 `return item`。你的其余代码应该没问题。
这是来自 Scrapy 文档（documentation）的示例：
import scrapy
from myproject.items import MyItem
class MySpider(scrapy.Spider):
    """Example spider whose single callback yields both items and requests."""

    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = [
        'http://www.example.com/1.html',
        'http://www.example.com/2.html',
        'http://www.example.com/3.html',
    ]

    def parse(self, response):
        """Yield a ``MyItem`` per ``<h3>`` element, then a ``Request`` per link.

        Every request is scheduled with this same method as its callback.
        """
        # First pass: one item for each extracted <h3> element.
        headings = response.xpath('//h3').extract()
        for heading in headings:
            yield MyItem(title=heading)

        # Second pass: one follow-up request for each href attribute value.
        hrefs = response.xpath('//a/@href').extract()
        for href in hrefs:
            yield scrapy.Request(href, callback=self.parse)
关键问题是您在循环内使用了 `sel`（而不是循环变量 `site`）。另一个关键问题是您的 XPath 表达式指向所有 `td` 元素，而您需要通过索引获取各个 `td` 元素，并将其与对应的 `item` 字段相关联。
工作解决方案:
def parse(self, response):
    """Yield one ``Digi`` item per selected table row.

    Each field is read from a fixed cell position (``td[1]`` .. ``td[10]``)
    relative to the current row, so values from different rows never mix.
    """
    # The [1:-2] slice skips the first row and the last two rows of the
    # table (presumably header and summary rows -- verify against the page).
    sites = response.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/table[3]/tr')[1:-2]

    # (item key, row-relative XPath) pairs, one per table column.
    columns = (
        ('sl', 'td[1]/text()'),
        ('player_name', 'td[2]/a/text()'),
        ('dismissal_info', 'td[3]/text()'),
        ('bowler_name', 'td[4]/text()'),
        ('runs_scored', 'td[5]/b/text()'),
        ('balls_faced', 'td[6]/text()'),
        ('minutes_played', 'td[7]/text()'),
        ('fours', 'td[8]/text()'),
        ('sixes', 'td[9]/text()'),
        ('strike_rate', 'td[10]/text()'),
    )

    for site in sites:
        item = Digi()
        for key, query in columns:
            # Query the row selector itself, not the whole response.
            item[key] = site.xpath(query).extract()
        yield item
它正确输出了 11 个项目实例。
我是第一次尝试 Scrapy。做了相当多的研究之后，我掌握了基础知识。现在我想抓取一个 table 中的数据，但代码不起作用。请查看下面的源代码。
items.py
from scrapy.item import Item, Field
class Digi(Item):
    """Scrapy item holding the values extracted from one table row.

    Each attribute is a plain ``Field()``; the spider's ``parse`` callback
    fills them in by key (``item['sl']``, ``item['player_name']``, ...).
    """

    # Row identification.
    sl = Field()
    player_name = Field()

    # Dismissal details.
    dismissal_info = Field()
    bowler_name = Field()

    # Numeric columns of the row.
    runs_scored = Field()
    balls_faced = Field()
    minutes_played = Field()
    fours = Field()
    sixes = Field()
    strike_rate = Field()
digicric.py
from scrapy.spider import Spider
from scrapy.selector import Selector
from crawler01.items import Digi
class DmozSpider(Spider):
    """Spider from the question, reproduced with its original behaviour.

    It requests the single URL in ``start_urls`` and builds one ``Digi``
    item per ``<tr>`` of the third table inside the
    ``ctl00_ContentPlaceHolder1_divData`` element.
    """

    name = "digicric"
    allowed_domains = ["digicricket.marssil.com"]
    start_urls = ["http://digicricket.marssil.com/match/MatchData.aspx?op=2&match=1250"]

    def parse(self, response):
        """Return a list with one ``Digi`` item for every selected row."""
        sel = Selector(response)
        sites = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/table[3]/tr')

        # (item key, XPath query) pairs, applied in this order for each row.
        queries = (
            ('sl', 'td/text()'),
            ('player_name', 'td/a/text()'),
            ('dismissal_info', 'td/text()'),
            ('bowler_name', 'td/text()'),
            ('runs_scored', 'td/text()'),
            ('balls_faced', 'td/text()'),
            ('minutes_played', 'td/text()'),
            ('fours', 'td/text()'),
            ('sixes', 'td/text()'),
            ('strike_rate', 'td/text()'),
        )

        items = []
        for site in sites:
            item = Digi()
            # NOTE: as posted in the question, every query is evaluated on
            # ``sel`` (the page-level selector), not on the loop variable
            # ``site``, and the ``td`` steps carry no positional index.
            for key, query in queries:
                item[key] = sel.xpath(query).extract()
            items.append(item)
        return items
我刚刚运行了你用 Scrapy 编写的代码，它运行正常。具体是哪里出了问题？
P.S. 这本应是一条评论，但我还没有足够的声望……如有必要，我会相应地编辑或关闭这个回答。
编辑:
我认为你应该在每次循环结束时执行 `yield item`，而不是 `return item`。你的其余代码应该没问题。
这是来自 Scrapy 文档（documentation）的示例：
import scrapy
from myproject.items import MyItem
class MySpider(scrapy.Spider):
    """Example spider whose single callback yields both items and requests."""

    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = [
        'http://www.example.com/1.html',
        'http://www.example.com/2.html',
        'http://www.example.com/3.html',
    ]

    def parse(self, response):
        """Yield a ``MyItem`` per ``<h3>`` element, then a ``Request`` per link.

        Every request is scheduled with this same method as its callback.
        """
        # First pass: one item for each extracted <h3> element.
        headings = response.xpath('//h3').extract()
        for heading in headings:
            yield MyItem(title=heading)

        # Second pass: one follow-up request for each href attribute value.
        hrefs = response.xpath('//a/@href').extract()
        for href in hrefs:
            yield scrapy.Request(href, callback=self.parse)
关键问题是您在循环内使用了 `sel`（而不是循环变量 `site`）。另一个关键问题是您的 XPath 表达式指向所有 `td` 元素，而您需要通过索引获取各个 `td` 元素，并将其与对应的 `item` 字段相关联。
工作解决方案:
def parse(self, response):
    """Yield one ``Digi`` item per selected table row.

    Each field is read from a fixed cell position (``td[1]`` .. ``td[10]``)
    relative to the current row, so values from different rows never mix.
    """
    # The [1:-2] slice skips the first row and the last two rows of the
    # table (presumably header and summary rows -- verify against the page).
    sites = response.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/table[3]/tr')[1:-2]

    # (item key, row-relative XPath) pairs, one per table column.
    columns = (
        ('sl', 'td[1]/text()'),
        ('player_name', 'td[2]/a/text()'),
        ('dismissal_info', 'td[3]/text()'),
        ('bowler_name', 'td[4]/text()'),
        ('runs_scored', 'td[5]/b/text()'),
        ('balls_faced', 'td[6]/text()'),
        ('minutes_played', 'td[7]/text()'),
        ('fours', 'td[8]/text()'),
        ('sixes', 'td[9]/text()'),
        ('strike_rate', 'td[10]/text()'),
    )

    for site in sites:
        item = Digi()
        for key, query in columns:
            # Query the row selector itself, not the whole response.
            item[key] = site.xpath(query).extract()
        yield item
它正确输出了 11 个项目实例。