从嵌套的锚标签中抓取 url 和标题

scraping url and title from nested anchor tag

这是我第一个使用 scrapy 的爬虫。

我正在尝试从 https://www.google.co.in/trends/hotvideos#hvsm=0 站点抓取视频的 url 和标题。

import scrapy
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class CraigslistItem(Item):
    """Scraped record holding the title and link of one video entry."""
    # Title text of the video entry.
    title = Field()
    # URL of the video entry.
    link = Field()

class DmozSpider(scrapy.Spider):
    """Spider that extracts video titles and links from the hot-videos page.

    NOTE(review): the video listing on this page appears to be filled in by
    a separate request rather than served in the initial HTML, so the XPath
    used in ``parse`` may match nothing -- confirm against the raw response.
    """

    name = "google"
    allowed_domains = ["google.co.in"]
    start_urls = [
        "https://www.google.co.in/trends/hotvideos#hvsm=0"
    ]

    def parse(self, response):
        """Yield one CraigslistItem per video container found in the page.

        Args:
            response: The downloaded response for a start URL.

        Yields:
            CraigslistItem whose ``title`` and ``link`` hold the lists of
            strings extracted from the anchor nested in each container.
        """
        # The response exposes .xpath() directly, so no separate selector
        # object has to be built first.
        for sel in response.xpath("//span[@class='single-video-image-container']"):
            item = CraigslistItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            # Hand each item to Scrapy instead of printing a growing list.
            yield item

大致了解我做错了什么。

借助 Scrapy 的 FormRequest 完成了它。

from scrapy.http import FormRequest
import json

class DmozSpider(scrapy.Spider):
    """Spider that fetches the hot-videos list through its JSON endpoint."""

    name = "google"
    allowed_domains = ["google.co.in"]
    start_urls = [
        "https://www.google.co.in/trends/hotvideos#hvsm=0"
    ]

    def parse(self, response):
        """Issue the form POST that returns the video list as JSON.

        Args:
            response: The response for the start URL; its content is not
                read here, it only triggers the follow-up request.

        Yields:
            A FormRequest to the ``hotItems`` endpoint whose response is
            handled by ``parse_data``.
        """
        url = 'https://www.google.co.in/trends/hotvideos/hotItems'
        formdata = {'hvd': '', 'geo': 'IN', 'mob': '0', 'hvsm': '0'}
        yield FormRequest(url=url, formdata=formdata, callback=self.parse_data)

    def parse_data(self, response):
        """Turn the JSON payload into one item per listed video.

        Args:
            response: The response of the ``hotItems`` request; its body is
                parsed as JSON.

        Yields:
            CraigslistItem with ``title`` and ``link`` taken from each
            entry's ``title`` and ``url`` keys.
        """
        json_response = json.loads(response.body)
        # A payload without 'videoList' (or with a null value) produces no
        # items instead of raising TypeError when iterating None.
        videos = json_response.get('videoList') or []
        for video in videos:
            item = CraigslistItem()
            item['title'] = video.get('title')
            item['link'] = video.get('url')
            yield item