Scrapy Spider 无法使用 xpath 提取网页内容

Question

我有一个 Scrapy spider，正在使用 XPath 选择器提取页面内容，但提取不到任何内容。请帮我检查一下哪里出错了。

from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.selector import HtmlXPathSelector
from medicalproject.items import MedicalprojectItem
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector 
from scrapy import Request  


# Spider declaration from the question: a CrawlSpider starting at the
# yananow.org story query page.
# NOTE(review): as pasted, the `rules` tuple and `parse_items` function that
# follow are at module level, not indented inside this class body -- confirm
# whether that is only a paste artifact.
class MySpider(CrawlSpider):
      # Spider name.
      name = "medical"
      # Domain list for the crawl.
      allowed_domains = ["yananow.org"]
      # URL(s) the crawl starts from.
      start_urls = ["http://yananow.org/query_stories.php"]

# Link rule: links matching the display_story.php pattern are extracted,
# handed to the callback named 'parse_page', and followed further.
# NOTE(review): the only handler defined in this snippet is `parse_items`;
# no `parse_page` is visible here, so the callback name does not match.
# NOTE(review): `\i` in the pattern is not a standard regex escape --
# presumably a plain `id` was intended; verify.
rules = (
    Rule(SgmlLinkExtractor(allow=[r'display_story.php\?\id\=\d+']),callback='parse_page',follow=True),       
    )

def parse_items(self, response):
    """Build one MedicalprojectItem per matched table cell and return them as a list."""
    hxs = HtmlXPathSelector(response)
    # NOTE(review): `tbody` steps frequently exist only in a browser-rendered
    # DOM, not in the raw HTML -- verify this path against the page source.
    titles = hxs.xpath('/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td')
    items = []
    for title in titles:
        item = MedicalprojectItem()
        # Both expressions below start with '/', so they are evaluated from the
        # document root rather than relative to `title`; every iteration runs
        # the same two queries.
        # NOTE(review): `img[1]/text()` asks for text inside an <img> element,
        # which presumably is always empty -- confirm where the name lives.
        item["patient_name"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/img[1]/text()").extract()
        item["stories"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/div/font/p/text()").extract()
        items.append(item)
    return(items)

Answer 1

你的代码有很多问题，所以这里有一个不同的方法。

我选择不使用 CrawlSpider，以便更好地控制抓取过程，尤其是要从查询页面抓取 name、再从详细信息页面抓取故事的情况。

我没有深入（嵌套的）table 结构，而是通过寻找内容模式来简化 XPath 语句。也就是说，如果你想提取一个故事……页面上就必然有一个指向该故事的链接。

下面是经过测试的代码（带注释）：

# -*- coding: utf-8 -*-
import scrapy

class MyItem(scrapy.Item):
    """Item with two fields: `name` and `story`."""

    # Filled from the link text on the listing page (see MySpider.parse).
    name = scrapy.Field()
    # Filled from the text of the detail page (see MySpider.parse_detail).
    story = scrapy.Field()

class MySpider(scrapy.Spider):
    """Scrape names from the story listing page and story text from each detail page.

    `parse` handles the listing page: for every link whose href contains
    "display_story" it creates an item holding the link text and requests the
    linked page. `parse_detail` handles that page, adds the story text to the
    same item and yields it.
    """

    name = 'medical'
    allowed_domains = ['yananow.org']
    start_urls = ['http://yananow.org/query_stories.php']

    def parse(self, response):
        """Yield one detail-page request per story link on the listing page.

        Each request carries a partially filled MyItem (only `name` set) in
        its `meta` dict under the key 'myItem'.
        """
        # Select by link target instead of walking the nested table layout.
        rows = response.xpath('//a[contains(@href,"display_story")]')

        # Loop over all links to stories.
        for row in rows:
            myItem = MyItem()  # create a new item
            # The name is the link text (kept as the list extract() returns).
            myItem['name'] = row.xpath('./text()').extract()
            # The XPath above only matches anchors that have an href, so [0] exists.
            story_url = response.urljoin(row.xpath('./@href').extract()[0])
            # Request the detail page that holds the story.
            request = scrapy.Request(url=story_url, callback=self.parse_detail)
            request.meta['myItem'] = myItem  # pass the item along with the request
            yield request

    def parse_detail(self, response):
        """Attach the story text to the item passed in `response.meta` and yield it."""
        myItem = response.meta['myItem']  # the item (with the name) from parse()
        # All text nodes below <font size=3> elements.
        text_raw = response.xpath('//font[@size=3]//text()').extract()
        # Strip each fragment through its own method rather than `unicode.strip`:
        # the `unicode` builtin does not exist on Python 3, while `.strip()` on
        # the element works for both Python 2 unicode and Python 3 str.
        myItem['story'] = ' '.join(fragment.strip() for fragment in text_raw)
        yield myItem  # the completed item

Scrapy Spider 无法使用 xpath 提取网页内容

Scrapy Spider cannot Extract contents of web page using xpath

python

xpath

web-crawler

scrapy