Scrapy for eCommerce with Selenium Infinite Scroll, Help Returning Values
I'm fairly new to programming. I've done a few small projects on my own and have started building web scrapers with Scrapy. I'm trying to write a scraper for Home Depot and have run into a problem. The issue I'm trying to solve is that the Home Depot page has JavaScript that only loads content once you scroll down the page, so I added some code I found that scrolls down the page to reveal all the products, so the spider can grab the title, review count, and price of each product tile. Before adding that code it did scrape the product information correctly; after adding it, my initial problem was that it only scraped the last page of results, so I moved some things around. I think, being a novice, I just don't understand how objects in Scrapy work and how information gets passed around, in particular the HTML I'm trying to have returned as a value in parse_product. As it stands, the spider does open the pages and move on to the next page, but it no longer scrapes any products. Where am I going wrong? I've been struggling with this for hours. I'm taking a class on web scraping and, while I've had some success, it seems like any time I have to do something slightly off the beaten path it turns into a huge struggle.
import scrapy
import logging
from scrapy.utils.markup import remove_tags
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from scrapy.selector import Selector
from time import sleep
from datetime import datetime


class HdSpider(scrapy.Spider):
    name = 'hd'
    allowed_domains = ['www.homedepot.com']
    start_urls = ['https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao=']  # add ?Nao= to the end of the URL you got from a search or category

    def parse(self, response):
        options = Options()
        chrome_path = which("chromedriver")
        driver = webdriver.Chrome(executable_path=chrome_path)  # , chrome_options=options)
        p = 0  # Home Depot URLs end in =24, =48, etc. -- products come 24 to a page, so this is my way of getting the next page
        start_url = 'https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao='

        while p < 25:
            driver.get(start_url + str(p))
            driver.set_window_size(1920, 1080)
            #sleep(2)
            scroll_pause_time = 1
            screen_height = driver.execute_script("return window.screen.height;")  # get the screen height of the browser window
            i = 1

            while True:  # the infinite-scroll loop that reveals all JavaScript-generated product tiles
                driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
                i += 1
                sleep(scroll_pause_time)
                scroll_height = driver.execute_script("return document.body.scrollHeight;")
                if (screen_height) * i > scroll_height:
                    break

            self.html = driver.page_source
            p = p + 24

    def parse_product(self, response):
        resp = Selector(text=self.html)
        for products in resp.xpath("//div[@class='product-pod--padding']"):
            date = datetime.now().strftime("%m-%d-%y")
            brand = products.xpath("normalize-space(.//span[@class='product-pod__title__brand--bold']/text())").get()
            title = products.xpath("normalize-space(.//span[@class='product-pod__title__product']/text())").get()
            link = products.xpath(".//div//a//@href").get()
            model = products.xpath("normalize-space(.//div[@class='product-pod__model'][2]/text())").get()
            review_count = products.xpath("normalize-space(.//span[@class='product-pod__ratings-count']/text())").get()
            price = products.xpath("normalize-space(.//div[@class='price-format__main-price']//span[2]/text())").get()
            yield {
                'Date scraped': date,
                'Brand': brand,
                'Title': title,
                'Product Link': "https://www.homedepot.com" + remove_tags(link),
                'Price': "$" + price,
                'Model #': model,
                'Review Count': review_count
            }
I don't see where you ever run parse_product. Scrapy does not execute it for you automatically. Besides, a callback like parse_product receives its own response; it is normally used in something like yield Request(subpage_url, callback=self.parse_product) to parse data from a subpage, not from the page you already fetched in parse. You should move the code from parse_product into parse, like this:
def parse(self, response):
    options = Options()
    chrome_path = which("chromedriver")
    driver = webdriver.Chrome(executable_path=chrome_path)  # , chrome_options=options)
    driver.set_window_size(1920, 1080)
    p = 0  # Home Depot URLs end in =24, =48, etc. -- products come 24 to a page, so this is my way of getting the next page
    start_url = 'https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao='
    scroll_pause_time = 1
    screen_height = driver.execute_script("return window.screen.height;")  # get the screen height of the browser window

    while p < 25:
        driver.get(start_url + str(p))
        #sleep(2)
        i = 1

        # scrolling
        while True:  # the infinite-scroll loop that reveals all JavaScript-generated product tiles
            driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
            i += 1
            sleep(scroll_pause_time)
            scroll_height = driver.execute_script("return document.body.scrollHeight;")
            if (screen_height) * i > scroll_height:
                break

        # after scrolling
        self.html = driver.page_source
        p = p + 24

        resp = Selector(text=self.html)
        for products in resp.xpath("//div[@class='product-pod--padding']"):
            date = datetime.now().strftime("%m-%d-%y")
            brand = products.xpath("normalize-space(.//span[@class='product-pod__title__brand--bold']/text())").get()
            title = products.xpath("normalize-space(.//span[@class='product-pod__title__product']/text())").get()
            link = products.xpath(".//div//a//@href").get()
            model = products.xpath("normalize-space(.//div[@class='product-pod__model'][2]/text())").get()
            review_count = products.xpath("normalize-space(.//span[@class='product-pod__ratings-count']/text())").get()
            price = products.xpath("normalize-space(.//div[@class='price-format__main-price']//span[2]/text())").get()
            yield {
                'Date scraped': date,
                'Brand': brand,
                'Title': title,
                'Product Link': "https://www.homedepot.com" + remove_tags(link),
                'Price': "$" + price,
                'Model #': model,
                'Review Count': review_count
            }
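For reference, a callback like parse_product only really makes sense when you follow links to separate product pages and let Scrapy fetch them. A minimal sketch of that pattern, assuming the listing HTML is available to Scrapy (the detail-page selector below is hypothetical, not taken from your code):

def parse(self, response):
    # follow each product link; Scrapy calls the callback with that subpage's response
    for href in response.xpath("//div[@class='product-pod--padding']//a/@href").getall():
        yield scrapy.Request(response.urljoin(href), callback=self.parse_product)

def parse_product(self, response):
    # here `response` is the product detail page, so you read from it, not from self.html
    yield {
        'Title': response.xpath("normalize-space(//h1/text())").get(),  # hypothetical selector
        'URL': response.url,
    }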
But I would make other changes. You use p = p + 24, yet when I inspect the page in the browser I see I would need p = p + 48 to get all of the products. And rather than p = p + ..., I would use Selenium to click the > button to load the next page.
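If you do want to stick with the Nao= URL parameter instead of clicking >, the outer loop would look roughly like this (a sketch only, assuming 48 products per request and an arbitrary cap of 5 pages):

p = 0
for _ in range(5):           # arbitrary page limit for the sketch
    driver.get(start_url + str(p))
    # ... scroll and parse this page here ...
    p += 48                  # the listing appears to advance in steps of 48, not 24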
EDIT:

My version with other changes. Anyone can run it without creating a Scrapy project.
#!/usr/bin/env python3

import scrapy
from scrapy.utils.markup import remove_tags
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from scrapy.selector import Selector
from time import sleep
from datetime import datetime


class HdSpider(scrapy.Spider):
    name = 'hd'
    allowed_domains = ['www.homedepot.com']
    start_urls = ['https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao=']  # add ?Nao= to the end of the URL you got from a search or category

    def parse(self, response):
        options = Options()
        chrome_path = which("chromedriver")
        driver = webdriver.Chrome(executable_path=chrome_path)  # , chrome_options=options)
        #driver.set_window_size(1920, 1080)
        driver.maximize_window()

        scroll_pause_time = 1

        # load the first page
        start_url = 'https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao=0'
        driver.get(start_url)

        screen_height = driver.execute_script("return window.screen.height;")  # get the screen height of the browser window

        #while True:  # all pages
        for _ in range(5):  # only 5 pages
            #sleep(scroll_pause_time)

            # scroll the page
            i = 1
            while True:  # the infinite-scroll loop that reveals all JavaScript-generated product tiles
                driver.execute_script(f"window.scrollBy(0, {screen_height});")
                sleep(scroll_pause_time)
                i += 1
                scroll_height = driver.execute_script("return document.body.scrollHeight;")
                if screen_height * i > scroll_height:
                    break

            # after scrolling
            resp = Selector(text=driver.page_source)
            for products in resp.xpath("//div[@class='product-pod--padding']"):
                date = datetime.now().strftime("%m-%d-%y")
                brand = products.xpath("normalize-space(.//span[@class='product-pod__title__brand--bold']/text())").get()
                title = products.xpath("normalize-space(.//span[@class='product-pod__title__product']/text())").get()
                link = products.xpath(".//div//a//@href").get()
                model = products.xpath("normalize-space(.//div[@class='product-pod__model'][2]/text())").get()
                review_count = products.xpath("normalize-space(.//span[@class='product-pod__ratings-count']/text())").get()
                price = products.xpath("normalize-space(.//div[@class='price-format__main-price']//span[2]/text())").get()
                yield {
                    'Date scraped': date,
                    'Brand': brand,
                    'Title': title,
                    'Product Link': "https://www.homedepot.com" + remove_tags(link),
                    'Price': "$" + price,
                    'Model #': model,
                    'Review Count': review_count
                }

            # click the `>` button to load the next page
            try:
                driver.find_element_by_xpath('//a[@aria-label="Next"]').click()
            except:
                break


# --- run without a project and save results in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save to CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(HdSpider)
c.start()
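Because the script uses CrawlerProcess instead of the scrapy crawl command, no project scaffolding is needed, and the FEEDS setting (Scrapy 2.1+) writes the yielded dictionaries straight to output.csv. Assuming you save it as, say, hd_spider.py (the filename is just an example), you can start it directly:

python3 hd_spider.py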