How do I tell Scrapy to scrape a data-url that's on the start_urls pages instead of the URLs themselves?
So I just want Scrapy to scrape a data-url that's found inside each page. This data-url is the one that matters: the start_urls pages themselves only contain a partial list, while the data-url pages contain everything I need to scrape.
<div id="loadMoreShops" class="border-b border-grey-bright py-4 text-center cursor-pointer text-sm text-blue font-semibold underline" data-url="https://www.apomio.de/preisvergleich-zeige-alle-angebote/171865">
Mehr Shops anzeigen
</div>
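For reference, I can already locate the attribute with the BeautifulSoup setup I use below; what I don't know is how to make Scrapy follow it. A minimal self-contained sketch (the html string stands in for response.text; the id comes from the snippet above):

import bs4

# Locate the data-url attribute on the 'Mehr Shops anzeigen' div.
html = '<div id="loadMoreShops" data-url="https://www.apomio.de/preisvergleich-zeige-alle-angebote/171865">Mehr Shops anzeigen</div>'
soup = bs4.BeautifulSoup(html, 'lxml')
more_shops = soup.find('div', id='loadMoreShops')
if more_shops is not None:
    print(more_shops['data-url'])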
My current code. It does scrape already, just not everything I want:
import scrapy, bs4, re, csv, logging
from datetime import datetime

class ApomioSpider(scrapy.Spider):
    allowed_domains = ['apomio.de']  # domains only; Scrapy ignores entries with a scheme
    name = "apomio"

    def start_requests(self):
        self.i = 0
        self.PZN_arr = []
        self.product_names = []
        with open('PZN.csv') as csv_file:
            reader = csv.reader(csv_file)
            headings = next(reader)  # skip the header row of the CSV
            for elem in reader:
                self.PZN_arr.append(elem[1])
                self.product_names.append(elem[2])
        start_urls = ['https://www.apomio.de/suche?query=' + x for x in self.PZN_arr]
        for url in start_urls:
            yield scrapy.Request(url, self.parse)

    def parse(self, response):
        soup = bs4.BeautifulSoup(response.text, 'lxml')
        price_arr = []
        names_arr = []
        prices = soup.find_all("span", {"class": "block text-xs text-black font-medium"})
        names = soup.find_all("span", {"class": "w-5/6 block text-xs text-black-darker mb-2"})
        for name in names:
            # take the first text node of the span and collapse the whitespace
            name_res = bs4.BeautifulSoup(str(name), 'lxml').find(text=True)
            temp = name_res.replace("\n", "").split(" ")
            new_list = []
            for elem in temp:
                if elem != '':
                    new_list.append(elem)
            names_arr.append(" ".join(new_list))
        for price in prices:
            # 'Gesamtkosten 12,99' -> 12.99 (German decimal comma to dot)
            prices_regex = re.compile(r'(Gesamtkosten)([ ])([0-9]+)([,])([0-9]+)')
            result_price = ".".join(prices_regex.search(str(price)).group(3, 5))
            price_arr.append(float(result_price))
        if price_arr and names_arr:
            # note: self.i assumes responses arrive in the order the requests were made
            logging.info(f'Parsing prices of {self.product_names[self.i]}\n PZN: {self.PZN_arr[self.i]} on {datetime.now()}\n')
            self.i += 1
            for price, name in zip(price_arr, names_arr):
                shop = "Shop Name: " + name
                print(f"{shop.ljust(75, ' ')} Price: {price:.2f} €")
        else:
            logging.info(f'Item could not be found at {response.url}')
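For clarity, here is what the price regex above does on a sample snippet (the markup and price are illustrative):

import re

# Standalone check of the price regex: extract '12' and '99', join with a dot.
prices_regex = re.compile(r'(Gesamtkosten)([ ])([0-9]+)([,])([0-9]+)')
sample = '<span class="block text-xs text-black font-medium">Gesamtkosten 12,99 €</span>'
match = prices_regex.search(sample)
print(float(".".join(match.group(3, 5))))  # 12.99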
The site I've been scraping is apomio.de, and I'm crawling it using PZNs (identifiers for medical products). Any help would be greatly appreciated.
Edit: So basically I want to scrape the data-url inside this element. [Pic of the element.] The data-url is in the HTML snippet at the top of this post.
First you need to tell Scrapy to handle your start_urls with a different callback function; the only change from your start_requests is the callback passed to scrapy.Request:
def start_requests(self):
    self.i = 0
    self.PZN_arr = []
    self.product_names = []
    with open('PZN.csv') as csv_file:
        reader = csv.reader(csv_file)
        headings = next(reader)  # skip the header row of the CSV
        for elem in reader:
            self.PZN_arr.append(elem[1])
            self.product_names.append(elem[2])
    start_urls = ['https://www.apomio.de/suche?query=' + x for x in self.PZN_arr]
    for url in start_urls:
        yield scrapy.Request(url, self.parse_start_urls)
Next you need to write the parsing logic for your data-urls (I'm not sure exactly which URLs you want to handle here):
def parse_start_urls(self, response):
    for data_url in response.xpath('//*[@data-url]/@data-url').getall():
        yield scrapy.Request(url=data_url, callback=self.parse)
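Since the data-url you need sits on a specific element (the loadMoreShops div from your snippet), you can also target it directly instead of following every data-url on the page. A sketch, assuming that id is stable across result pages:

def parse_start_urls(self, response):
    # Target only the 'Mehr Shops anzeigen' element from the question's snippet.
    data_url = response.xpath('//div[@id="loadMoreShops"]/@data-url').get()
    if data_url:
        yield scrapy.Request(url=data_url, callback=self.parse)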