Scrapy for eCommerce with Selenium Infinite Scroll, Help Returning Values

I'm fairly new to programming. I've done some small projects on my own and started making web scrapers with Scrapy. I'm trying to build a scraper for Home Depot and am running into a problem. The problem this is trying to solve is that the Home Depot page has JavaScript that only loads when you scroll down the page, so I added some code I found that scrolls down the page to reveal all of the products, so it can grab the title, review count, and price of each product tile. Before adding this code it did scrape the product information correctly; after adding it, the issue I first had was that the code only scraped the last page of results, so I moved some things around. I think that as a beginner I just don't understand how objects and information get passed around in Scrapy, specifically the HTML I'm trying to set up to return values from in parse_product. As it is now, it does open the page and go to the next page, but it no longer scrapes any products. Where am I going wrong? I've been struggling with this for hours. I'm doing web scraping for a class, and although I've had some success, it seems that whenever I have to do anything even slightly off the beaten path it turns into a huge struggle.

import scrapy
import logging
from scrapy.utils.markup import remove_tags
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from scrapy.selector import Selector
from time import sleep
from datetime import datetime

class HdSpider(scrapy.Spider):
    name = 'hd'
    allowed_domains = ['www.homedepot.com']
    start_urls = ['https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao='] #Add %%Nao= to end of URL you got from search or category


    def parse(self, response):
        options = Options()
        chrome_path = which("chromedriver")
        driver = webdriver.Chrome(executable_path=chrome_path)#, chrome_options=options)
        p = 0 # The home depot URLs end in =24, =48 etc basically products are grouped 24 on a page so this is my way of getting the next page
        start_url = 'https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao='

        while p < 25:
            driver.get(start_url + str(p))
            driver.set_window_size(1920, 1080)
            #sleep(2)
            scroll_pause_time = 1 
            screen_height = driver.execute_script("return window.screen.height;")   # get the height of the screen
            i = 1

            while True: #this is the infinite scroll thing which reveals all javascript generated product tiles
                driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))  
                i += 1
                sleep(scroll_pause_time)
                scroll_height = driver.execute_script("return document.body.scrollHeight;")  
                if (screen_height) * i > scroll_height:
                    break
                self.html = driver.page_source    
            p = p + 24            

    def parse_product(self, response):
        resp = Selector(text=self.html)
        for products in resp.xpath("//div[@class='product-pod--padding']"):
            date = datetime.now().strftime("%m-%d-%y")
            brand = products.xpath("normalize-space(.//span[@class='product-pod__title__brand--bold']/text())").get() 
            title = products.xpath("normalize-space(.//span[@class='product-pod__title__product']/text())").get() 
            link = products.xpath(".//div//a//@href").get() 
            model = products.xpath("normalize-space(.//div[@class='product-pod__model'][2]/text())").get()
            review_count = products.xpath("normalize-space(.//span[@class='product-pod__ratings-count']/text())").get()
            price = products.xpath("normalize-space(.//div[@class='price-format__main-price']//span[2]/text())").get()
            yield {
               'Date scraped' : date,
               'Brand' : brand,
               'Title' : title,
               'Product Link' : "https://www.homedepot.com" + remove_tags(link),
               'Price' : "$" + price,
               'Model #' : model,
               'Review Count' : review_count
            }

I don't see where you run parse_product. It doesn't get executed for you automatically. Besides, a function like parse_product needs a response; it is meant to be used in something like yield Request(subpage_url, callback=self.parse_product) to parse data from a subpage, not from the page you already have in parse. You should move the code from parse_product into parse, like this:

def parse(self, response):
    options = Options()
    chrome_path = which("chromedriver")
    driver = webdriver.Chrome(executable_path=chrome_path)#, chrome_options=options)
    driver.set_window_size(1920, 1080)

    p = 0 # The home depot URLs end in =24, =48 etc basically products are grouped 24 on a page so this is my way of getting the next page

    start_url = 'https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao='

    scroll_pause_time = 1 
    screen_height = driver.execute_script("return window.screen.height;")   # get the height of the screen

    while p < 25:
        driver.get(start_url + str(p))

        #sleep(2)
        i = 1

        # scrolling
        while True: #this is the infinite scroll thing which reveals all javascript generated product tiles
            driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))  
            i += 1
            sleep(scroll_pause_time)
            scroll_height = driver.execute_script("return document.body.scrollHeight;")  
            if (screen_height) * i > scroll_height:
                break

        # after scrolling
        self.html = driver.page_source    
        p = p + 24            
        resp = Selector(text=self.html)
        for products in resp.xpath("//div[@class='product-pod--padding']"):
            date = datetime.now().strftime("%m-%d-%y")
            brand = products.xpath("normalize-space(.//span[@class='product-pod__title__brand--bold']/text())").get() 
            title = products.xpath("normalize-space(.//span[@class='product-pod__title__product']/text())").get() 
            link = products.xpath(".//div//a//@href").get() 
            model = products.xpath("normalize-space(.//div[@class='product-pod__model'][2]/text())").get()
            review_count = products.xpath("normalize-space(.//span[@class='product-pod__ratings-count']/text())").get()
            price = products.xpath("normalize-space(.//div[@class='price-format__main-price']//span[2]/text())").get()
            yield {
               'Date scraped' : date,
               'Brand' : brand,
               'Title' : title,
               'Product Link' : "https://www.homedepot.com" + remove_tags(link),
               'Price' : "$" + price,
               'Model #' : model,
               'Review Count' : review_count
            }
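
For completeness: if you do want to keep parse_product as a separate method, the usual Scrapy pattern is to yield a Request (or response.follow) for each product's own page with parse_product as the callback, and Scrapy then calls it with that subpage's response. Below is a minimal sketch of that callback wiring only - it ignores the fact that the Home Depot listing needs Selenium to render all the tiles, and the spider name and the h1 XPath on the product page are just assumptions for illustration.

import scrapy

class HdSubpageSpider(scrapy.Spider):
    name = 'hd_subpages'  # hypothetical name, just for this sketch
    allowed_domains = ['www.homedepot.com']
    start_urls = ['https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y']

    def parse(self, response):
        # listing page: collect product links and hand each one to parse_product
        for href in response.xpath("//div[@class='product-pod--padding']//a/@href").getall():
            yield response.follow(href, callback=self.parse_product)

    def parse_product(self, response):
        # here `response` is the product subpage, not the listing page
        yield {
            'Product Link': response.url,
            'Title': response.xpath("normalize-space(//h1/text())").get(),  # assumed selector
        }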

But I would make other changes. You use p = p + 24, but when I checked the page in my browser I found that I needed p = p + 48 to get all of the products. And instead of p = p + ... I would rather use Selenium to click the > button to load the next page.
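
If the > button is sometimes not clickable yet when you try it, you can wrap the click in an explicit wait instead of a bare find_element call. A minimal sketch, assuming the same //a[@aria-label="Next"] locator that the EDIT below uses:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_next_page(driver, timeout=10):
    # click the `>` (Next) pagination link; return False when there is no next page
    try:
        next_button = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, '//a[@aria-label="Next"]'))
        )
        next_button.click()
        return True
    except Exception:
        return False

The paging loop can then simply break when click_next_page(driver) returns False.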


EDIT:

My version with other changes.

Everyone can run it without creating a project.

#!/usr/bin/env python3

import scrapy
from scrapy.utils.markup import remove_tags
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from scrapy.selector import Selector
from time import sleep
from datetime import datetime

class HdSpider(scrapy.Spider):

    name = 'hd'

    allowed_domains = ['www.homedepot.com']
    start_urls = ['https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao='] #Add %%Nao= to end of URL you got from search or category

    def parse(self, response):
    
        options = Options()
        
        chrome_path = which("chromedriver")
        driver = webdriver.Chrome(executable_path=chrome_path) #, chrome_options=options)
        #driver.set_window_size(1920, 1080)
        driver.maximize_window()
        
        scroll_pause_time = 1 

        # loading first page
        start_url = 'https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao=0'
        driver.get(start_url)

        screen_height = driver.execute_script("return window.screen.height;")   # get the height of the screen

        #while True:        # all pages
        for _ in range(5):  # only 5 pages
        
            #sleep(scroll_pause_time)
            
            # scrolling page
            i = 1
            while True: #this is the infinite scroll thing which reveals all javascript generated product tiles
                driver.execute_script(f"window.scrollBy(0, {screen_height});")  
                sleep(scroll_pause_time)
                
                i += 1

                scroll_height = driver.execute_script("return document.body.scrollHeight;")  
                if screen_height * i > scroll_height:
                    break
        
            # after scrolling    
            resp = Selector(text=driver.page_source)
            
            for products in resp.xpath("//div[@class='product-pod--padding']"):
                date = datetime.now().strftime("%m-%d-%y")
                brand = products.xpath("normalize-space(.//span[@class='product-pod__title__brand--bold']/text())").get() 
                title = products.xpath("normalize-space(.//span[@class='product-pod__title__product']/text())").get() 
                link = products.xpath(".//div//a//@href").get() 
                model = products.xpath("normalize-space(.//div[@class='product-pod__model'][2]/text())").get()
                review_count = products.xpath("normalize-space(.//span[@class='product-pod__ratings-count']/text())").get()
                price = products.xpath("normalize-space(.//div[@class='price-format__main-price']//span[2]/text())").get()
                yield {
                   'Date scraped' : date,
                   'Brand' : brand,
                   'Title' : title,
                   'Product Link' : "https://www.homedepot.com" + remove_tags(link),
                   'Price' : "$" + price,
                   'Model #' : model,
                   'Review Count' : review_count
                }
            
            # click button `>` to load next page
            try:
                driver.find_element_by_xpath('//a[@aria-label="Next"]').click()
            except:
                break
                    
            
# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})

c.crawl(HdSpider)
c.start()
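
To run it, save everything above as one file (any name works, e.g. hd_spider.py) and start it with plain python. CrawlerProcess runs the spider without a Scrapy project, and the FEEDS setting (added in Scrapy 2.1) writes the scraped items to output.csv.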