Scrapy 在第一个结果后结束
Scrapy ends after first result
我一直在四处寻找,找不到我要找的答案。我让我的爬虫 (scrapy) return 结果接近我正在寻找的结果。所以我现在要做的是让它从页面中提取多个结果。目前它拉第一个并停止。如果我取消 extract_first() 然后它会提取所有数据并将它们分组。因此,寻找可行的 2 个答案之一。
1) 继续抓取结果未结束
2) 将每个项目取消分组到一个新的结果行
这是我的代码:
import scrapy
from scrapy.selector import Selector
from urlparse import urlparse
from urlparse import urljoin
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
#from scrappy.http import HtmlResponse
class MySpider(CrawlSpider):
name = "ziprecruiter"
def start_requests(self):
allowed_domains = ["https://www.ziprecruiter.com/"]
urls = [
'https://www.ziprecruiter.com/candidate/search?search=operations+manager&location=San+Francisco%2C+CA'
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
for houses in response.xpath('/html/body'):
yield {
'Job_title:' : houses.xpath('.//span[@class="just_job_title"]//text()[1]').extract_first(),
'Company:' : houses.xpath('.//a[@class="t_org_link name"]//text()[1]').extract_first(),
'Location:' : houses.xpath('.//a[@class="t_location_link location"]//text()[1]').extract_first(),
'FT/PT:' : houses.xpath('.//span[@class="data_item"]//text()[1]').extract_first(),
'Link' : houses.xpath('/html/body/main/div/section/div/div[2]/div/div[2]/div[1]/article[4]/div[1]/button[1]/text()').extract_first(),
'Link' : houses.xpath('.//a/@href[1]').extract_first(),
'pay' : houses.xpath('./section[@class="perks_item"]/span[@class="data_item"]//text()[1]').extract_first()
}
提前致谢!
编辑::
经过更多研究后,我重新定义了要爬入的容器,这给了我所有正确的答案。现在我的问题是如何获取页面上的每个项目,而不仅仅是第一个结果……它只是不循环。这是我的代码:
import scrapy
from scrapy.selector import Selector
from urlparse import urlparse
from urlparse import urljoin
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
#from scrappy.http import HtmlResponse
class MySpider(CrawlSpider):
name = "ziprecruiter"
def start_requests(self):
allowed_domains = ["https://www.ziprecruiter.com/"]
urls = [
'https://www.ziprecruiter.com/candidate/search?search=operations+manager&location=San+Francisco%2C+CA'
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
for houses in response.xpath('/html/body/main/div/section/div/div[2]/div/div[2]/div[1]/article[1]/div[2]'):
yield {
'Job_title:' : houses.xpath('.//span[@class="just_job_title"]//text()').extract(),
'Company:' : houses.xpath('.//a[@class="t_org_link name"]//text()').extract(),
'Location:' : houses.xpath('.//a[@class="t_location_link location"]//text()').extract(),
'FT/PT:' : houses.xpath('.//span[@class="data_item"]//text()').extract(),
'Link' : houses.xpath('.//a/@href').extract(),
'pay' : houses.xpath('./section[@class="perks_item"]/span[@class="data_item"]//text()').extract()
}
在我看来你应该改用这个 xpath:
//div[@class="job_content"]
因为这就是您要查找的 div 中的 class。当我为此页面执行它时,我返回了 20 div 个元素。但是,您可能希望向 xpath 查询添加更多过滤,以防万一有其他 div 具有您不想解析的 class 名称。
我一直在四处寻找,找不到我要找的答案。我让我的爬虫 (scrapy) return 结果接近我正在寻找的结果。所以我现在要做的是让它从页面中提取多个结果。目前它拉第一个并停止。如果我取消 extract_first() 然后它会提取所有数据并将它们分组。因此,寻找可行的 2 个答案之一。
1) 继续抓取结果未结束 2) 将每个项目取消分组到一个新的结果行
这是我的代码:
import scrapy
from scrapy.selector import Selector
from urlparse import urlparse
from urlparse import urljoin
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
#from scrappy.http import HtmlResponse
class MySpider(CrawlSpider):
name = "ziprecruiter"
def start_requests(self):
allowed_domains = ["https://www.ziprecruiter.com/"]
urls = [
'https://www.ziprecruiter.com/candidate/search?search=operations+manager&location=San+Francisco%2C+CA'
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
for houses in response.xpath('/html/body'):
yield {
'Job_title:' : houses.xpath('.//span[@class="just_job_title"]//text()[1]').extract_first(),
'Company:' : houses.xpath('.//a[@class="t_org_link name"]//text()[1]').extract_first(),
'Location:' : houses.xpath('.//a[@class="t_location_link location"]//text()[1]').extract_first(),
'FT/PT:' : houses.xpath('.//span[@class="data_item"]//text()[1]').extract_first(),
'Link' : houses.xpath('/html/body/main/div/section/div/div[2]/div/div[2]/div[1]/article[4]/div[1]/button[1]/text()').extract_first(),
'Link' : houses.xpath('.//a/@href[1]').extract_first(),
'pay' : houses.xpath('./section[@class="perks_item"]/span[@class="data_item"]//text()[1]').extract_first()
}
提前致谢!
编辑:: 经过更多研究后,我重新定义了要爬入的容器,这给了我所有正确的答案。现在我的问题是如何获取页面上的每个项目,而不仅仅是第一个结果……它只是不循环。这是我的代码:
import scrapy
from scrapy.selector import Selector
from urlparse import urlparse
from urlparse import urljoin
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
#from scrappy.http import HtmlResponse
class MySpider(CrawlSpider):
name = "ziprecruiter"
def start_requests(self):
allowed_domains = ["https://www.ziprecruiter.com/"]
urls = [
'https://www.ziprecruiter.com/candidate/search?search=operations+manager&location=San+Francisco%2C+CA'
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
for houses in response.xpath('/html/body/main/div/section/div/div[2]/div/div[2]/div[1]/article[1]/div[2]'):
yield {
'Job_title:' : houses.xpath('.//span[@class="just_job_title"]//text()').extract(),
'Company:' : houses.xpath('.//a[@class="t_org_link name"]//text()').extract(),
'Location:' : houses.xpath('.//a[@class="t_location_link location"]//text()').extract(),
'FT/PT:' : houses.xpath('.//span[@class="data_item"]//text()').extract(),
'Link' : houses.xpath('.//a/@href').extract(),
'pay' : houses.xpath('./section[@class="perks_item"]/span[@class="data_item"]//text()').extract()
}
在我看来你应该改用这个 xpath:
//div[@class="job_content"]
因为这就是您要查找的 div 中的 class。当我为此页面执行它时,我返回了 20 div 个元素。但是,您可能希望向 xpath 查询添加更多过滤,以防万一有其他 div 具有您不想解析的 class 名称。