Scrapy is not crawling through the links
I am crawling with Scrapy using a link extractor, and I believe I am using the correct XPath expression in the link extractor, but I don't know why it runs endlessly and prints what looks like page source instead of the restaurant names and addresses. I know there is something wrong with my restrict_xpaths expression, but I can't figure out what it is.
Code:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class TripadSpider(CrawlSpider):
    name = 'tripad'
    allowed_domains = ['www.tripadvisor.in']
    start_urls = ['https://www.tripadvisor.in/Restaurants-g304551-New_Delhi_National_Capital_Territory_of_Delhi.html']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//div[@class="OhCyu"]//a'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        yield {
            'title': response.xpath('//h1[@class="fHibz"]/text()').get(),
            'Address': response.xpath('(//a[@class="fhGHT"])[2]').get()
        }
It is crawling; try changing your user_agent (see the sketch after the output). You also forgot to add /text() to the address XPath, and a separate rule below handles the pagination links:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class TripadSpider(CrawlSpider):
    name = 'tripad'
    allowed_domains = ['tripadvisor.in']
    start_urls = ['https://www.tripadvisor.in/Restaurants-g304551-New_Delhi_National_Capital_Territory_of_Delhi.html']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//div[@class="OhCyu"]//a'), callback='parse_item'),
        Rule(LinkExtractor(restrict_xpaths='//a[contains(@class, "next")]')),  # pagination
    )

    def parse_item(self, response):
        yield {
            'title': response.xpath('//h1[@class="fHibz"]/text()').get(),
            'Address': response.xpath('(//a[@class="fhGHT"])[2]/text()').get()
        }
Output:
{'title': 'Mosaic', 'Address': 'Sector 10 Lobby Level Crowne Plaza Twin District Centre, Rohini, New Delhi 110085 India'}
{'title': 'Spring', 'Address': 'Plot 4, Dwarka City Centre Radisson Blu, Sector 13, New Delhi 110075 India'}
{'title': 'Dilli 32', 'Address': 'Maharaja Surajmal Road The Leela Ambience Convention Hotel, Near Yamuna Sports Complex, Vivek Vihar, New Delhi 110002 India'}
{'title': 'Viva - All Day Dining', 'Address': 'Hospitality District Asset Area 12 Gurgoan sector 28, New Delhi 110037 India'}
...
...
...
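If the responses still come back as raw page source (a sign TripAdvisor is serving a blocked page to the default Scrapy user agent), one common way to change the user agent is per spider via custom_settings. This is only a minimal sketch, not part of the original answer, and the UA string below is just an example value; substitute your own or set USER_AGENT project-wide in settings.py instead.

from scrapy.spiders import CrawlSpider

class TripadSpider(CrawlSpider):
    name = 'tripad'
    # Browser-like user agent; the exact string here is only an example.
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/96.0 Safari/537.36',
    }
    # ... the rest of the spider (allowed_domains, start_urls, rules,
    # parse_item) stays the same as in the answer above.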