如何从 Scrapy 结果中获取字典

How to get a dictionary from Scrapy results

我需要从 Scrapy 的抓取结果中得到一个 Python 字典列表。

HTML/JS 代码包含此部分:

window.Matches =  [
    {
        matchId: 402404,
        leagueId: 1087,
        leagueName: "אלוף האלופים",
        leagueURL: "https://www.one.co.il/Soccer/league/1087",
        isLeagueLinkable: true,
        round: 1,
        roundName: "- משחק 1",
        date: new Date("2020-08-08T20:15:00"),
        isTimeSeted: true,
        isHaveScore:true,
        dateDay: "שבת",
        homeId: 22,
        homeName: "הפועל ב\"ש",
        homeURL: "https://www.one.co.il/Soccer/team/22",
        homeScore: 0,
        guestId: 3,
        guestName: "מכבי ת\"א",
        guestURL: "https://www.one.co.il/Soccer/team/3",
        guestScore: 2,
        arenaURL: "https://www.one.co.il/Article/20-21/1,1087,3,0/364600.html"
    },{
        matchId: 402405,
        leagueId: 1087,
        leagueName: "אלוף האלופים",
        leagueURL: "https://www.one.co.il/Soccer/league/1087",
        isLeagueLinkable: true,
        round: 2,
        roundName: "- גומלין",
        date: new Date("2020-08-13T20:30:00"),
        isTimeSeted: true,
        isHaveScore:true,
        dateDay: "חמישי",
        homeId: 3,
        homeName: "מכבי ת\"א",
        homeURL: "https://www.one.co.il/Soccer/team/3",
        homeScore: 2,
        guestId: 22,
        guestName: "הפועל ב\"ש",
        guestURL: "https://www.one.co.il/Soccer/team/22",
        guestScore: 0,
        arenaURL: "https://www.one.co.il/Article/20-21/1,1087,3,0/364952.html"
    }];

我尝试了如下代码:

import scrapy
import json

class One(scrapy.Spider):
    """Spider that requests the team page and emits the raw text of one inline script."""

    name = "one"
    start_urls = ["https://www.one.co.il/Soccer/team/3/"]

    def parse(self, response):
        """Yield one item whose 'game' value is the text of the third <script> in <head>.

        The script text is passed through as-is; no parsing of its
        contents is attempted here.
        """
        # Positional XPath: the third <script> element directly under <head>.
        script_nodes = response.xpath('/html/head/script[3]/text()')
        raw_script = script_nodes.extract_first()
        item = {'game': raw_script}
        yield item

但结果看起来像一个不可读的大字符串,如下所示:

[
{"game": "\r\n    window.Matches =  [\r\n        {\r\n            matchId: 402404,\r\n            leagueId: 1087,\r\n            leagueName: \"\u05d0\u05dc\u05d5\u05e3 \u05d4\u05d0\u05dc\u05d5\u05e4\u05d9\u05dd\",\r\n            leagueURL: \"https://www.one.co.il/Soccer/league/1087\",\r\n            isLeagueLinkable: true,\r\n            round: 1,\r\n            roundName: \"- \u05de\u05e9\u05d7\u05e7 1\",\r\n            date: new Date(\"2020-08-08T20:15:00\"),\r\n            isTimeSeted: true,\r\n            isHaveScore:true,\r\n            dateDay: \"\u05e9\u05d1\u05ea\",\r\n            homeId: 22,\r\n            homeName: \"\u05d4\u05e4\u05d5\u05e2\u05dc \u05d1\\"\u05e9\",\r\n            homeURL: \"https://www.one.co.il/Soccer/team/22\",\r\n            homeScore: 0,\r\n            guestId: 3,\r\n            guestName: \"\u05de\u05db\u05d1\u05d9 \u05ea\\"\u05d0\",\r\n            guestURL: \"https://www.one.co.il/Soccer/team/3\",\r\n            guestScore: 2,\r\n            arenaURL: \"https://www.one.co.il/Article/20-21/1,1087,3,0/364600.html\"\r\n        },{\r\n            matchId: 402405,\r\n            leagueId: 1087,\r\n            leagueName: \"\u05d0\u05dc\u05d5\u05e3 \u05d4\u05d0\u05dc\u05d5\u05e4\u05d9\u05dd\",\r\n            leagueURL: \"https://www.one.co.il/Soccer/league/1087\",\r\n            isLeagueLinkable: true,\r\n            round: 2,\r\n            roundName: \"- \u05d2\u05d5\u05de\u05dc\u05d9\u05df\",\r\n            date: new Date(\"2020-08-13T20:30:00\"),\r\n            isTimeSeted: true,\r\n            isHaveScore:true,\r\n            dateDay: \"\u05d7\u05de\u05d9\u05e9\u05d9\",\r\n            homeId: 3,\r\n            homeName: \"\u05de\u05db\u05d1\u05d9 \u05ea\\"\u05d0\",\r\n            homeURL: \"https://www.one.co.il/Soccer/team/3\",\r\n            homeScore: 2,\r\n            guestId: 22,\r\n            guestName: \"\u05d4\u05e4\u05d5\u05e2\u05dc \u05d1\\"\u05e9\",\r\n            guestURL: \"https://www.one.co.il/Soccer/team/22\",\r\n   
         guestScore: 0,\r\n            arenaURL: \"https://www.one.co.il/Article/20-21/1,1087,3,0/364952.html\"\r\n        },{\r\n            matchId: 405477,\r\n            leagueId: 22,\r\n            leagueName: \"\u05de\u05d5\u05e7\u05d3\u05de\u05d5\u05ea \u05dc\u05d9\u05d2\u05ea \u05d4\u05d0\u05dc\u05d5\u05e4\u05d5\u05ea\",\r\n            leagueURL: \"\",\r\n            isLeagueLinkable: false,\r\n            round: 1,\r\n            roundName: \"- \u05e1\u05d9\u05d1\u05d5\u05d1 \u05e8\u05d0\u05e9\u05d5\u05df\",\r\n            date: new Date(\"2020-08-19T20:00:00\"),\r\n            isTimeSeted: true,\r\n            isHaveScore:true,\r\n            dateDay: \"\u05e8\u05d1\u05d9\u05e2\u05d9\",\r\n            homeId: 3,\r\n            homeName: \"\u05de\u05db\u05d1\u05d9 \u05ea\\"\u05d0\",\r\n            homeURL: \"\",\r\n            homeScore: 2,\r\n            guestId: 5590,\r\n            guestName: \"\u05e8\u05d9\u05d2\u05d4\",\r\n            guestURL: \"\",\r\n            guestScore: 0,\r\n            arenaURL: \"https://www.one.co.il/Article/20-21/1,1,3,63766/365369.html\"\r\n        },{\r\n            matchId: 406083,\r\n            leagueId: 667,\r\n            leagueName: \"\u05d2\u05d1\u05d9\u05e2 \u05d4\u05d8\u05d5\u05d8\u05d5\",\r\n            leagueURL: \"https://www.one.co.il/Soccer/league/667\",\r\n            isLeagueLinkable: true,\r\n            round: 4,\r\n            roundName: \"- \u05d2\u05de\u05e8\",\r\n            date: new Date(\"2020-08-22T20:30:00\"),\r\n            isTimeSeted: true,\r\n            isHaveScore:true,\r\n            dateDay: \"\u05e9\u05d1\u05ea\",\r\n            homeId: 17,\r\n            homeName: \"\u05d1\u05e0\u05d9 \u05e1\u05db\u05e0\u05d9\u05df\",\r\n            homeURL: \"https://www.one.co.il/Soccer/team/17\",\r\n            homeScore: 0,\r\n            guestId: 3,\r\n            guestName: \"\u05de\u05db\u05d1\u05d9 \u05ea\\"\u05d0\",\r\n            guestURL: \"https://www.one.co.il/Soccer/team/3\",\r\n         
   guestScore: 2,\r\n            arenaURL: \"https://www.one.co.il/Article/20-21/1,667,3,0/365537.html\"\r\n        },{\r\n            matchId: 406275,\r\n            leagueId: 22,\r\n            leagueName: \"\u05de\u05d5\u05e7\u05d3\u05de\u05d5\u05ea \u05dc\u05d9\u05d2\u05ea \u05d4\u05d0\u05dc\u05d5\u05e4\u05d5\u05ea\",\r\n            leagueURL: \"\",\r\n            isLeagueLinkable: false,\r\n            round: 2,\r\n            roundName: \"\u05e1\u05d9\u05d1\u05d5\u05d1 \u05e9\u05e0\u05d9\",\r\n            date: new Date(\"2020-08-26T19:00:00\"),\r\n            isTimeSeted: true,\r\n            isHaveScore:true,\r\n            dateDay: \"\u05e8\u05d1\u05d9\u05e2\u05d9\",\r\n            homeId: 5415,\r\n            homeName: \"\u05e1\u05d5\u05d3\u05d5\u05d1\u05d4\",\r\n            homeURL: \"\",\r\n            homeScore: 0,\r\n            guestId: 3,\r\n            guestName: \"\u05de\u05db\u05d1\u05d9 \u05ea\\"\u05d0\",\r\n            guestURL: \"\",\r\n            guestScore: 3,\r\n            arenaURL: \"https://www.one.co.il/Article/20-21/1,22,3,0/365806.html\"\r\n        },{\r\n            matchId: 402498,\r\n            leagueId: 1,\r\n            leagueName: \"\u05dc\u05d9\u05d2\u05ea \u05d4\u05e2\u05dc\",\r\n            leagueURL: \"https://www.one.co.il/Soccer/league/1\",\r\n            isLeagueLinkable: true,\r\n            round: 1,\r\n            roundName: \"\u05de\u05d7\u05d6\u05d5\u05e8 1\",\r\n            date: new Date(\"2020-08-30T21:00:00\"),\r\n            isTimeSeted: true,\r\n            isHaveScore:true,\r\n            dateDay: \"\u05e8\u05d0\u05e9\u05d5\u05df\",\r\n            homeId: 3,\r\n            homeName: \"\u05de\u05db\u05d1\u05d9 \u05ea\\"\u05d0\",\r\n            homeURL: \"https://www.one.co.il/Soccer/team/3\",\r\n            homeScore: 1,\r\n            guestId: 10,\r\n            guestName: \"\u05de\u05db\u05d1\u05d9 \u05e4\\"\u05ea\",\r\n            guestURL: \"https://www.one.co.il/Soccer/team/10\",\r\n            
guestScore: 2,\r\n            arenaURL: \"https://www.one.co.il/Article/20-21/1,1,1,0/366079.html\"\r\n        },{\r\n            matchId: 402504,\r\n            leagueId: 1,\r\n            leagueName: \"\u05dc\u05d9\u05d2\u05ea \u05d4\u05e2\u05dc\",\r\n            leagueURL: \"https://www.one.co.il/Soccer/league/1\",\r\n            isLeagueLinkable: true,\r\n            round: 2,\r\n            roundName: \"\u05de\u05d7\u05d6\u05d5\u05e8 2\",\r\n            date: new Date(\"2020-09-12T20:30:00\"),\r\n            isTimeSeted: true,\r\n            isHaveScore:true,\r\n            dateDay: \"\u05e9\u05d1\u05ea\",\r\n            homeId: 11,\r\n            homeName: \"\u05d1\u05e0\u05d9 \u05d9\u05d4\u05d5\u05d3\u05d4\",\r\n            homeURL: \"https://www.one.co.il/Soccer/team/11\",\r\n            homeScore: 2,\r\n            guestId: 3,\r\n            guestName: \"\u05de\u05db\u05d1\u05d9 \u05ea\\"\u05d0\",\r\n            guestURL: \"https://www.one.co.il/Soccer/team/3\",\r\n            guestScore: 2,\r\n            arenaURL: \"https://www.one.co.il/Article/20-21/1,1,3,0/366927.html\"\r\n        },{\r\n            matchId: 408093,\r\n            leagueId: 22,\r\n            leagueName: \"\u05de\u05d5\u05e7\u05d3\u05de\u05d5\u05ea \u05dc\u05d9\u05d2\u05ea \u05d4\u05d0\u05dc\u05d5\u05e4\u05d5\u05ea\",\r\n            leagueURL: \"\",\r\n            isLeagueLinkable: false,\r\n            round: 3,\r\n            roundName: \"- \u05e1\u05d9\u05d1\u05d5\u05d1 \u05e9\u05dc\u05d9\u05e9\u05d9\",\r\n            date: new Date(\"2020-09-16T20:00:00\"),\r\n            isTimeSeted: true,\r\n            isHaveScore:true,\r\n            dateDay: \"\u05e8\u05d1\u05d9\u05e2\u05d9\",\r\n            homeId: 3,\r\n            homeName: \"\u05de\u05db\u05d1\u05d9 \u05ea\\"\u05d0\",\r\n            homeURL: \"\",\r\n            homeScore: 1,\r\n            guestId: 5580,\r\n            guestName: \"\u05d3\u05d9\u05e0\u05de\u05d5 \u05d1\u05e8\u05e1\u05d8\",\r\n            guestURL: 
\"\",\r\n            guestScore: 0,\r\n            arenaURL: \"https://www.one.co.il/Article/20-21/1,1,3,0/367232.html\"\r\n        },{\r\n            matchId: 409037,\r\n            leagueId: 22,\r\n            leagueName: \"\u05de\u05d5\u05e7\u05d3\u05de\u05d5\u05ea \u05dc\u05d9\u05d2\u05ea \u05d4\u05d0\u05dc\u05d5\u05e4\u05d5\u05ea\",\r\n            leagueURL: \"\",\r\n            isLeagueLinkable: false,\r\n            round: 4,\r\n            roundName: \"- \u05e4\u05dc\u05d9\u05d9\u05d0\u05d5\u05e3 \u05de\u05e9\u05d7\u05e7 1\",\r\n            date: new Date(\"2020-09-22T22:00:00\"),\r\n            isTimeSeted: true,\r\n            isHaveScore:true,\r\n            dateDay: \"\u05e9\u05dc\u05d9\u05e9\u05d9\",\r\n            homeId: 3,\r\n            homeName: \"\u05de\u05db\u05d1\u05d9 \u05ea\\"\u05d0\",\r\n            homeURL: \"\",\r\n            homeScore: 1,\r\n            guestId: 5120,\r\n            guestName: \"\u05e8\u05d3 \u05d1\u05d5\u05dc \u05d6\u05dc\u05e6\u05d1\u05d5\u05e8\u05d2\",\r\n            guestURL: \"\",\r\n            guestScore: 2,\r\n            arenaURL: \"https://www.one.co.il/Article/20-21/1,22,3,0/367637.html\"

有谁能告诉我,如何用 Scrapy 把下面这些数据转换成一个有效的 Python 字典列表吗?

[{
    matchId: 402404,
    leagueId: 1087,
    leagueName: "אלוף האלופים",
    leagueURL: "https://www.one.co.il/Soccer/league/1087",
    isLeagueLinkable: true,
    round: 1,
    roundName: "- משחק 1",
    date: new Date("2020-08-08T20:15:00"),
    isTimeSeted: true,
    isHaveScore:true,
    dateDay: "שבת",
    homeId: 22,
    homeName: "הפועל ב\"ש",
    homeURL: "https://www.one.co.il/Soccer/team/22",
    homeScore: 0,
    guestId: 3,
    guestName: "מכבי ת\"א",
    guestURL: "https://www.one.co.il/Soccer/team/3",
    guestScore: 2,
    arenaURL: "https://www.one.co.il/Article/20-21/1,1087,3,0/364600.html"
},{
    matchId: 402405,
    leagueId: 1087,
    leagueName: "אלוף האלופים",
    leagueURL: "https://www.one.co.il/Soccer/league/1087",
    isLeagueLinkable: true,
    round: 2,
    roundName: "- גומלין",
    date: new Date("2020-08-13T20:30:00"),
    isTimeSeted: true,
    isHaveScore:true,
    dateDay: "חמישי",
    homeId: 3,
    homeName: "מכבי ת\"א",
    homeURL: "https://www.one.co.il/Soccer/team/3",
    homeScore: 2,
    guestId: 22,
    guestName: "הפועל ב\"ש",
    guestURL: "https://www.one.co.il/Soccer/team/22",
    guestScore: 0,
    arenaURL: "https://www.one.co.il/Article/20-21/1,1087,3,0/364952.html"
}];

提前致谢

您已经完成了大部分工作。接下来只需要用正则表达式提取出该变量的确切值。由于它是 JavaScript 对象字面量而不是合法的 JSON,您不能直接使用 json.loads(),而是需要把它作为 JavaScript 来求值。

下面是对您的代码稍作修改后应该可以工作的版本:

# Capture the array literal assigned to window.Matches from the script text.
# The pattern is a raw string so the \[ and \] escapes reach the regex engine
# without triggering Python's invalid-escape-sequence warning, and the dot in
# "window.Matches" is escaped so it matches a literal '.' only.
# re_first() returns the first capture group, or None when nothing matches.
script = response.xpath('/html/head/script[3]/text()').re_first(r'(?s).*?window\.Matches.*?(\[.*\]);')

这样就能得到该变量的确切值。现在只需要对它求值即可。为此,我将使用 js2py 库。

context = js2py.EvalJs({})
context.execute(f'var matches = {script}'}
result = context.to_dict()
yield {'game': result['matches']}

顺便说一下,在 XPath 中使用位置索引(例如 script[3])并不是好的做法,改用 contains() 会更稳妥,例如 //script[contains(text(), "window.Matches")]/text()。