在 Python 中使用 concurrent.futures 时，如何避免追加（append）数据出错？

Question

我正在尝试用某个 API 返回的信息创建一张 table。当我逐场分析比赛时，它运行良好；但当我尝试使用 "concurrent.futures" 同时分析大量比赛以加快速度时，table 中会被追加一些错误的信息。每次运行脚本，错误都出现在不同的行，是随机的。

此外，我在每场比赛分析完成后打印了 game_id，发现脚本并没有按顺序处理它们。也许错误就出在这里。

我该如何解决这个问题？谢谢！

这是我正在使用的代码。

import requests as r
import pandas as pd
import concurrent.futures

# IDs of the games to download.
game_id = [100, 101, 102] #This is an example, I use a large number of games

# Column-oriented accumulator: one list per output column.  The values at
# index k across all thirteen lists together form row k of the final table.
d = {
    column: []
    for column in (
        'game_id',
        'atbat_num',
        'play_index',
        'batter_id',
        'batter_name',
        'pitcher_id',
        'pitcher_name',
        'runner_id',
        'runner_name',
        'event',
        'start',
        'end',
        'movementReason',
    )
}

def get_url(gids):
    """Download one game's play data and append its runner events to ``d``.

    Args:
        gids: ID of a single game.  It is interpolated into the request URL
            and stored in the ``game_id`` column of every row produced.

    Side effects:
        For every entry under ``allPlays[*].runners`` in the JSON response,
        appends one value to each of the thirteen column lists of the
        module-level dict ``d``.  A field that is missing from the response
        is recorded as ``None``.  Prints a progress line when done.

    NOTE: one logical row is written with thirteen separate ``append`` calls
    on shared lists.  When several threads execute this function at the same
    time those appends can interleave, so values belonging to different
    games may land at the same index.
    """
    url = f'http://examplelink.com/{gids}/'
    req = r.get(url)
    payload = req.json()

    for i in payload['allPlays']:
        # Play-level lookups are identical for every runner of this play.
        matchup = i.get('matchup', {})
        batter = matchup.get('batter', {})
        pitcher = matchup.get('pitcher', {})

        # A play without a 'runners' key contributes no rows.
        for p in i.get('runners', []):
            details = p.get('details', {})
            runner = details.get('runner', {})
            movement = p.get('movement', {})

            d['game_id'].append(gids)
            d['atbat_num'].append(i.get('atBatIndex'))
            d['play_index'].append(details.get('playIndex'))
            d['batter_id'].append(batter.get('id'))
            d['batter_name'].append(batter.get('fullName'))
            d['pitcher_id'].append(pitcher.get('id'))
            d['pitcher_name'].append(pitcher.get('fullName'))
            d['runner_id'].append(runner.get('id'))
            d['runner_name'].append(runner.get('fullName'))
            d['event'].append(details.get('event'))
            d['start'].append(movement.get('start'))
            d['end'].append(movement.get('end'))
            d['movementReason'].append(details.get('movementReason'))

    print(f'Game {gids} analyzed')


# Started at module level (not from inside get_url) so that every game is
# submitted to the pool exactly once.  The iterator returned by map() is not
# consumed here, so an exception raised inside get_url is not re-raised.
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(get_url, game_id)

# Build the table from the column lists; pandas requires all of them to have
# the same length.
table = pd.DataFrame(d)

# Write the CSV with a header row and without the index column.  to_csv
# returns None when it is given a path, so its result is not kept.
table.to_csv('runner.csv', index=False, header=True)

Answer 1

Executor.map 会并发地调用 func，因此各次调用实际执行（以及向共享数据追加内容）的先后顺序，并不保证与可迭代对象中的顺序一致。

我假设数据是以这种方式构建的，以便稍后可以使用 pandas 库将其呈现为表格数据。

我建议您改用一种不依赖追加顺序的数据结构，例如由字典组成的列表：每一行作为一个整体追加，就不会出现各列错位的情况。pandas.DataFrame 的 data 参数既可以是"由列表组成的字典"，也可以是"由字典组成的列表"。

# Row-oriented accumulator: every element is one complete row (a dict), so
# a row can never be split across threads.  pd.DataFrame(d) accepts this
# list of dicts directly.
d = []
game_id = [100, 101, 102] #This is an example, I use a large number of games


def get_url(gid):
    """Download one game's play data and return its runner events as rows.

    Args:
        gid: ID of a single game.  It is interpolated into the request URL
            and stored under ``game_id`` in every returned row.

    Returns:
        A list with one dict per entry under ``allPlays[*].runners`` of the
        JSON response.  Fields missing from the response are ``None``.

    The function touches no shared state; the caller decides how the rows
    of different games are combined.
    """
    url = f"http://examplelink.com/{gid}/"
    req = r.get(url)
    payload = req.json()

    rows = []
    for i in payload["allPlays"]:
        # Play-level lookups are identical for every runner of this play.
        matchup = i.get("matchup", {})
        batter = matchup.get("batter", {})
        pitcher = matchup.get("pitcher", {})

        for p in i.get("runners", []):
            details = p.get("details", {})
            runner = details.get("runner", {})
            movement = p.get("movement", {})

            rows.append(
                dict(
                    game_id=gid,
                    atbat_num=i.get("atBatIndex"),
                    play_index=details.get("playIndex"),
                    batter_id=batter.get("id"),
                    batter_name=batter.get("fullName"),
                    pitcher_id=pitcher.get("id"),
                    pitcher_name=pitcher.get("fullName"),
                    runner_id=runner.get("id"),
                    runner_name=runner.get("fullName"),
                    event=details.get("event"),
                    start=movement.get("start"),
                    end=movement.get("end"),
                    movementReason=details.get("movementReason"),
                )
            )

    print(f"Game {gid} analyzed")
    return rows


with concurrent.futures.ThreadPoolExecutor() as executor:
    # map() yields the results in the order of game_id, whichever request
    # finishes first, and iterating it re-raises any exception from a worker
    # instead of dropping it silently.
    for rows in executor.map(get_url, game_id):
        d.extend(rows)

在 Python 中使用 concurrent.futures 时，如何避免追加（append）数据出错？

How can I avoid append errors using concurrent.futures in Python?

python

parallel-processing

pandas

python-requests

concurrent.futures