What's wrong with this Scrapy spider? It scrapes only the last URL
In the parse() method, the spider crawls 4 URLs and sends each one to the parse_dir_contents() method to scrape some data, but only the 4th URL is scraped. I don't understand why it doesn't scrape the other 3 URLs.
import scrapy
from v_one.items import VOneItem
import json


class linkedin(scrapy.Spider):
    name = "linkedin"
    allowed_domains = ["linkedin.com"]
    start_urls = [
        "https://in.linkedin.com/directory/people-s-1-2-4/",
    ]

    def parse(self, response):
        for href in response.xpath('//*[@id="seo-dir"]/div/div/div/ul/li/a/@href'):
            url = response.urljoin(href.extract())
            print "________________" + url
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        for sel in response.xpath('//*[@id="profile"]'):
            url = response.url
            print "____________" + url
            item = VOneItem()
            item['name'] = sel.xpath('//*[@id="name"]/text()').extract()
            item['headline'] = sel.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
            item['current'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
            item['education'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
            item['link'] = url
            yield item
From inspecting the page, I think the for loop in the parse_dir_contents() function is not needed. Define the function like this:
def parse_dir_contents(self, response):
    item = VOneItem()
    item['name'] = response.xpath('//*[@id="name"]/text()').extract()
    item['headline'] = response.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
    item['current'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
    item['education'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
    item['link'] = response.url
    return item
and check whether this solves your problem.
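For reference, here is a minimal sketch of the whole spider with that change applied. It assumes the same v_one project layout, item fields, and XPaths as in the question; the debug print statements are dropped so it runs on both Python 2 and 3:

import scrapy
from v_one.items import VOneItem  # item class from the existing v_one project


class linkedin(scrapy.Spider):
    name = "linkedin"
    allowed_domains = ["linkedin.com"]
    start_urls = ["https://in.linkedin.com/directory/people-s-1-2-4/"]

    def parse(self, response):
        # Queue one request per directory link; each response goes to parse_dir_contents().
        for href in response.xpath('//*[@id="seo-dir"]/div/div/div/ul/li/a/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        # Build one item per profile page; no inner loop needed.
        item = VOneItem()
        item['name'] = response.xpath('//*[@id="name"]/text()').extract()
        item['headline'] = response.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
        item['current'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
        item['education'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
        item['link'] = response.url
        return item

You can run it with, for example, scrapy crawl linkedin -o items.json and compare the number of items written with the number of requests issued by parse(); that makes it easy to see whether all four URLs are now being processed.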