在 scrapy 中使用 css 选择器提取 href 和 link
Extract both href and link using css selectors in scrapy
我正在编写一个蜘蛛程序来从网页中提取文本和相应的超链接。这是我的蜘蛛代码:
import scrapy
class GeneralElection2019Spider(scrapy.Spider):
    """Print the link titles found on the eci.gov.in General Election 2019 file listing."""

    name = 'general_election_2019'
    # Offsite filtering compares bare host names; a full URL entry here is
    # ignored with a warning, so only the domain is listed.
    allowed_domains = ['eci.gov.in']
    start_urls = ['https://eci.gov.in/files/category/1359-general-election-2019/']

    def parse(self, response):
        """Print the ``title`` attribute of every matching anchor, one per line.

        Args:
            response: The downloaded page for one of ``start_urls``.
        """
        print(f'\nProcessing: {response.url}\n')
        # Replace 'a::attr(title)' with 'a::attr(href)' to list the hyperlinks
        # instead of the link text.
        titles = response.css('.ipsType_break.ipsContained a::attr(title)').getall()
        for row in titles:
            print(f'{row}\n')
我可以获取文本或超链接,但我同时想要两者。
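你可以这样试试:先遍历每个 li.ipsDataItem 条目,再在同一个条目内分别提取标题、文本和 href,这样文本与超链接就能一一对应。另外请注意,allowed_domains 应填写域名(例如 eci.gov.in),而不是完整的 URL。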
import scrapy
class GeneralElection2019Spider(scrapy.Spider):
    """Print the title, visible text and href of the link inside each list item."""

    name = 'general_election_2019'
    allowed_domains = ['eci.gov.in']
    start_urls = ['https://eci.gov.in/files/category/1359-general-election-2019/']

    def parse(self, response):
        """Print three lines per ``li.ipsDataItem``: link title, link text, link href.

        Each value is the first match inside the item, or ``None`` when the
        selector matches nothing.
        """
        print(f'\nProcessing: {response.url}\n')
        # Evaluated in this order: title attribute, visible text, target URL.
        queries = (
            'span.ipsType_break.ipsContained a::attr(title)',
            'span.ipsType_break.ipsContained a::text',
            'span.ipsType_break.ipsContained a::attr(href)',
        )
        for entry in response.css('li.ipsDataItem'):
            # Resolve all three values before printing any of them.
            values = [entry.css(query).get() for query in queries]
            for value in values:
                print(value)
我正在编写一个蜘蛛程序来从网页中提取文本和相应的超链接。这是我的蜘蛛代码:
import scrapy
class GeneralElection2019Spider(scrapy.Spider):
    """Print the link titles found on the eci.gov.in General Election 2019 file listing."""

    name = 'general_election_2019'
    # Offsite filtering compares bare host names; a full URL entry here is
    # ignored with a warning, so only the domain is listed.
    allowed_domains = ['eci.gov.in']
    start_urls = ['https://eci.gov.in/files/category/1359-general-election-2019/']

    def parse(self, response):
        """Print the ``title`` attribute of every matching anchor, one per line.

        Args:
            response: The downloaded page for one of ``start_urls``.
        """
        print(f'\nProcessing: {response.url}\n')
        # Replace 'a::attr(title)' with 'a::attr(href)' to list the hyperlinks
        # instead of the link text.
        titles = response.css('.ipsType_break.ipsContained a::attr(title)').getall()
        for row in titles:
            print(f'{row}\n')
我可以获取文本或超链接,但我同时想要两者。
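你可以这样试试:先遍历每个 li.ipsDataItem 条目,再在同一个条目内分别提取标题、文本和 href,这样文本与超链接就能一一对应。另外请注意,allowed_domains 应填写域名(例如 eci.gov.in),而不是完整的 URL。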
import scrapy
class GeneralElection2019Spider(scrapy.Spider):
    """Print the title, visible text and href of the link inside each list item."""

    name = 'general_election_2019'
    allowed_domains = ['eci.gov.in']
    start_urls = ['https://eci.gov.in/files/category/1359-general-election-2019/']

    def parse(self, response):
        """Print three lines per ``li.ipsDataItem``: link title, link text, link href.

        Each value is the first match inside the item, or ``None`` when the
        selector matches nothing.
        """
        print(f'\nProcessing: {response.url}\n')
        # Evaluated in this order: title attribute, visible text, target URL.
        queries = (
            'span.ipsType_break.ipsContained a::attr(title)',
            'span.ipsType_break.ipsContained a::text',
            'span.ipsType_break.ipsContained a::attr(href)',
        )
        for entry in response.css('li.ipsDataItem'):
            # Resolve all three values before printing any of them.
            values = [entry.css(query).get() for query in queries]
            for value in values:
                print(value)