Unable to scrape all of the items

With the help of Selenium and Scrapy I am only getting 12 of the 487 items. How can I scrape all of the items? I can't figure out where I am going wrong here. Any help is appreciated.

URL

My code:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from scrapy.selector import Selector
from selenium_stealth import stealth
from time import sleep


class CpcuSpider(CrawlSpider):
    name = 'cp'
    allowed_domains = ['www.arp.fr']
    start_urls = [
        'https://www.arp.fr/produits-portables-tablettes-ordinateurs-portables/?queryString=JTdCJTIyYXJlYUlkJTIyJTNBJTIyMkVEODhGMjctOTNFOS00NzQzLUI3NDYtRUNFQUJENUZFRDA4JTIyJTJDJTIyaXNRdWVyeSUyMiUzQWZhbHNlJTJDJTIyc29ydEF0dHJpYnV0ZSUyMiUzQW51bGwlMkMlMjJzb3J0RGlyZWN0aW9uJTIyJTNBbnVsbCUyQyUyMnBhZ2VubyUyMiUzQSUyMjElMjIlMkMlMjJwZXJQYWdlJTIyJTNBJTIyMTIlMjIlMkMlMjJ2YWx1ZXMlMjIlM0ElNUIlNUQlMkMlMjJwcm9kdWN0SWRzJTIyJTNBJTVCJTVEJTJDJTIycGFydG5lcklkJTIyJTNBbnVsbCUyQyUyMm9wdGlvbnMlMjIlM0ElNUJudWxsJTJDbnVsbCUyQ251bGwlNUQlN0Q=&page='+str(x)+'&productfilter=&sort=null' for x in range(1,6)]

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//a[@class="rasEpicTitle rasElementReaction"]'), callback='parse_item', follow=False),
        #Rule(LinkExtractor(restrict_xpaths='//*[@class="fielddata"]/a'), callback='parse_item', follow=True),
    )

    def __init__(self):
        # this page loads
        CrawlSpider.__init__(self)
        chrome_path = which("chromedriver")
        self.driver = webdriver.Chrome(executable_path=chrome_path)
        
        print(dir(self.driver))
        self.driver.maximize_window()
        # self.driver.quit()

    def parse_item(self, response):
        self.driver.get(response.url)
        sleep(5)

        title = Selector(text=self.driver.page_source)
        #for list_node in lists.xpath('//*[@class="rasEpicBoxContainer"]'):
            
        yield {
            'Title': title.xpath('//*[@title="028001007"]/text()').get()
        }
        # self.driver.close()

Starting with start_urls, there are a lot of errors in your code. If you look at the website, you will see that pagination does not work through the URL. For example, you cannot load the third page with https://www.arp.fr/produits-portables-tablettes-ordinateurs-portables/?queryString=JTdCJTIyYXJlYUlkJTIyJTNBJTIyMkVEODhGMjctOTNFOS00NzQzLUI3NDYtRUNFQUJENUZFRDA4JTIyJTJDJTIyaXNRdWVyeSUyMiUzQWZhbHNlJTJDJTIyc29ydEF0dHJpYnV0ZSUyMiUzQW51bGwlMkMlMjJzb3J0RGlyZWN0aW9uJTIyJTNBbnVsbCUyQyUyMnBhZ2VubyUyMiUzQSUyMjElMjIlMkMlMjJwZXJQYWdlJTIyJTNBJTIyMTIlMjIlMkMlMjJ2YWx1ZXMlMjIlM0ElNUIlNUQlMkMlMjJwcm9kdWN0SWRzJTIyJTNBJTVCJTVEJTJDJTIycGFydG5lcklkJTIyJTNBbnVsbCUyQyUyMm9wdGlvbnMlMjIlM0ElNUJudWxsJTJDbnVsbCUyQ251bGwlNUQlN0Q=&page=3&productfilter=&sort=null. You will still see the first page, because the Base64-encoded queryString parameter itself already contains "pageno": "1" (see the decoded query below), so the page argument has no effect.

I suggest another approach: mimic the Javascript call in your Scrapy spider. Internally, the website calls a special URL to receive JSON and then renders it for you. We can try to do the same:

import scrapy
import json
import base64
import urllib
from scrapy.http import HtmlResponse  # to build a Response object from a string
import chompjs  # to parse a Javascript object literal


def generate_query_string(query):
    # The website sends the pagination and query parameters in a special HTTP header.
    # The header value is the JSON query, URL-encoded and then Base64-encoded.
    query_string_raw = json.dumps(query)
    query_string_urlencoded = urllib.parse.quote_plus(query_string_raw)
    query_string = base64.b64encode(query_string_urlencoded.encode('ascii')).decode('ascii')
    return query_string
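
# A hypothetical helper (not part of the original answer): the inverse of
# generate_query_string(). It decodes a queryString value copied from the
# browser's address bar back into JSON, so the query params can be inspected
# without online tools.
def decode_query_string(query_string):
    query_string_urlencoded = base64.b64decode(query_string.encode('ascii')).decode('ascii')
    query_string_raw = urllib.parse.unquote_plus(query_string_urlencoded)
    return json.loads(query_string_raw)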

class ArpSpider(scrapy.Spider):
    name = '68943284'
    # I got the query params above from your URL using an online Base64 decoder
    # and then an online URL decoder. Best of all, we can request 500 results
    # per page and get everything in a SINGLE call!
    query = {
        "areaId": "2ED88F27-93E9-4743-B746-ECEABD5FED08", 
        "isQuery": False, 
        "sortAttribute": None, 
        "sortDirection": None, 
        "pageno": "1", 
        "perPage": "500", 
        "values": [], 
        "productIds": ["5267337-05", "5393345-05", "5400545-05", "5400812-05", "5404575-05", "5409557-05", "5410466-05", "5412282-05", "5412314-05", "5412318-05", "5412323-05", "5421276-05"],
        "partnerId": None, 
        "options": [None, None, None]
    }

    def start_requests(self):
        yield scrapy.Request(
            url='https://www.arp.fr/filter/page.json',
            headers={
                'queryString': generate_query_string(self.query),
            },
            callback=self.parse
        )
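    # Hypothetical note (not in the original answer): should the catalogue ever
    # exceed the 500 results requested per page, the same request could be
    # repeated with an incremented "pageno", e.g. by re-encoding
    # dict(self.query, pageno="2") with generate_query_string().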
    
    def parse(self, response):
        # with open('Samples/Arp.json', 'wb') as f:
        #     f.write(response.body)
        # We need to parse the JSON response and extract the HTML code from it
        data = json.loads(response.text)
        # print(data['products'])
        response = HtmlResponse(url=response.url, body=data['products'], encoding='utf-8')
        # Now we need to parse the HTML and get the Javascript object holding the data we need
        javascript = response.xpath('//script[contains(., "dataLayer.push")]/text()').re_first(r'dataLayer\.push\(([\s\S]+?)\);')
        if javascript:
            data = chompjs.parse_js_object(javascript)
            for item in data['ecommerce']['impressions']:
                # yield each product instead of just printing it
                yield {
                    'Title': item['name'],
                    'Price': item['price'],
                }
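
If you save the spider in a standalone file, you can run it without a full Scrapy project. Below is a minimal runner sketch using Scrapy's CrawlerProcess; the output filename arp_items.json is just an example:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # FEEDS (Scrapy 2.1+) writes the yielded items to a file
    'FEEDS': {'arp_items.json': {'format': 'json'}},
})
process.crawl(ArpSpider)
process.start()  # blocks until the crawl finishes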