来自 txt 文件的 scrapy start_urls
scrapy start_urls from txt file
我有大约 100K urls 可以抓取,所以我想从 txt 文件中读取它们
这是代码
import scrapy
from scrapy import Request
from scrapy.crawler import CrawlerProcess
class ConadstoresSpider(scrapy.Spider):
name = 'conadstores'
headers = {'user_agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
allowed_domains = ['conad.it']
#start_urls = ['http://www.conad.it/ricerca-negozi/negozio.002781.html','https://www.conad.it/ricerca-negozi/negozio.006804.html']
#start_urls = [l.strip() for l in open("/Users/macbook/PycharmProjects/conad/conad/conadlinks.txt").readlines()]
#f = open("/Users/macbook/PycharmProjects/conad/conad/conadlinks.txt")
#start_urls = [url.strip() for url in f.readlines()]
#f.close()
with open('/Users/macbook/PycharmProjects/conad/conad/conadlinks.txt') as file:
start_urls = [line.strip() for line in file]
def start_request(self):
request = Request(url = self.start_urls, callback=self.parse)
yield request
def parse(self, response):
yield {
'address' : response.css('.address-oswald::text').extract(),
'phone' : response.css('span.phone::text').extract(),
}
但我一直收到此错误
2021-12-08 13:27:48 [scrapy.core.engine] 错误:获取启动请求时出错
追溯(最近一次通话):
文件“/Users/macbook/PycharmProjects/conad/venv/lib/python3.9/site-packages/scrapy/core/engine.py”,第 127 行,在 _next_request 中
请求=下一个(slot.start_requests)
文件“/Users/macbook/PycharmProjects/conad/conad/conad/middlewares.py”,第 52 行,在 process_start_requests 中
对于 start_requests 中的 r:
文件“/Users/macbook/PycharmProjects/conad/venv/lib/python3.9/site-packages/scrapy/spiders/init.py”,第 83 行,在 start_requests
yield Request(url, dont_filter=True)
文件“/Users/macbook/PycharmProjects/conad/venv/lib/python3.9/site-packages/scrapy/http/request/init.py”,第 25 行,在 init 中
self._set_url(url)
文件“/Users/macbook/PycharmProjects/conad/venv/lib/python3.9/site-packages/scrapy/http/request/init.py”,第 62 行,在 _set_url 中
提高 ValueError('Missing scheme in request url: %s' % self._url)
ValueError:请求中缺少方案 url:%7B%5Crtf1%5Cansi%5Cansicpg1252%5Ccocoartf2580
有什么想法吗?
谢谢!
我们可以覆盖蜘蛛start_requests()方法中的start_urls逻辑
这是提取数据的简单方法
import scrapy
class ConadstoresSpider(scrapy.Spider):
name = 'conadstores'
def start_requests(self):
# read file data (you can use different logic for extract URLS from text files)
a_file = open("/Users/macbook/PycharmProjects/conad/conad/conadlinks.txt")
file_contents = a_file.read()
contents_split = file_contents.splitlines()
# extract urls from text file and store in list
for url in contents_split:
# send request to extracted URL.
yield scrapy.Request(url)
def parse(self, response, **kwargs):
yield {
'address': response.css('.address-oswald::text').extract(),
'phone': response.css('span.phone::text').extract(),
}
您可以使用不同的文件读取逻辑,但请确保它是 return url 列表。
我有大约 100K urls 可以抓取,所以我想从 txt 文件中读取它们 这是代码
import scrapy
from scrapy import Request
from scrapy.crawler import CrawlerProcess
class ConadstoresSpider(scrapy.Spider):
name = 'conadstores'
headers = {'user_agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
allowed_domains = ['conad.it']
#start_urls = ['http://www.conad.it/ricerca-negozi/negozio.002781.html','https://www.conad.it/ricerca-negozi/negozio.006804.html']
#start_urls = [l.strip() for l in open("/Users/macbook/PycharmProjects/conad/conad/conadlinks.txt").readlines()]
#f = open("/Users/macbook/PycharmProjects/conad/conad/conadlinks.txt")
#start_urls = [url.strip() for url in f.readlines()]
#f.close()
with open('/Users/macbook/PycharmProjects/conad/conad/conadlinks.txt') as file:
start_urls = [line.strip() for line in file]
def start_request(self):
request = Request(url = self.start_urls, callback=self.parse)
yield request
def parse(self, response):
yield {
'address' : response.css('.address-oswald::text').extract(),
'phone' : response.css('span.phone::text').extract(),
}
但我一直收到此错误
2021-12-08 13:27:48 [scrapy.core.engine] 错误:获取启动请求时出错 追溯(最近一次通话): 文件“/Users/macbook/PycharmProjects/conad/venv/lib/python3.9/site-packages/scrapy/core/engine.py”,第 127 行,在 _next_request 中 请求=下一个(slot.start_requests) 文件“/Users/macbook/PycharmProjects/conad/conad/conad/middlewares.py”,第 52 行,在 process_start_requests 中 对于 start_requests 中的 r: 文件“/Users/macbook/PycharmProjects/conad/venv/lib/python3.9/site-packages/scrapy/spiders/init.py”,第 83 行,在 start_requests yield Request(url, dont_filter=True) 文件“/Users/macbook/PycharmProjects/conad/venv/lib/python3.9/site-packages/scrapy/http/request/init.py”,第 25 行,在 init 中 self._set_url(url) 文件“/Users/macbook/PycharmProjects/conad/venv/lib/python3.9/site-packages/scrapy/http/request/init.py”,第 62 行,在 _set_url 中 提高 ValueError('Missing scheme in request url: %s' % self._url) ValueError:请求中缺少方案 url:%7B%5Crtf1%5Cansi%5Cansicpg1252%5Ccocoartf2580
有什么想法吗? 谢谢!
我们可以覆盖蜘蛛start_requests()方法中的start_urls逻辑
这是提取数据的简单方法
import scrapy
class ConadstoresSpider(scrapy.Spider):
name = 'conadstores'
def start_requests(self):
# read file data (you can use different logic for extract URLS from text files)
a_file = open("/Users/macbook/PycharmProjects/conad/conad/conadlinks.txt")
file_contents = a_file.read()
contents_split = file_contents.splitlines()
# extract urls from text file and store in list
for url in contents_split:
# send request to extracted URL.
yield scrapy.Request(url)
def parse(self, response, **kwargs):
yield {
'address': response.css('.address-oswald::text').extract(),
'phone': response.css('span.phone::text').extract(),
}
您可以使用不同的文件读取逻辑,但请确保它是 return url 列表。