通过 python 中的 namedtuple csv 循环跟踪进度

Track progress through namedtuple csv loop in python

以下 Python 代码使用 collections.namedtuple 读取一个由标识符组成的 csv 文件(标识符是名为 ContentItemId 的列中的整数),每个标识符对应数据库中的一条记录。示例记录:https://api.aucklandmuseum.com/id/library/ephemera/21291

其目的是检查给定 id 的 HTTP 状态并将其写入磁盘:

import requests
from collections import namedtuple
import csv

# Read every row of in.csv into memory as namedtuples, then record the HTTP
# status code returned for each ContentItemId in out.csv.
# newline='' is what the csv module asks for on files it reads or writes.
with open('in.csv', mode='r', newline='') as f:
    reader = csv.reader(f)

    # The header row supplies the namedtuple field names (e.g. ContentItemId).
    all_records = namedtuple('rec', next(reader))
    records = [all_records._make(row) for row in reader]

    # Create output file. Without newline='' the writer can emit an extra
    # blank line after every row on Windows.
    with open('out.csv', mode='w+', newline='') as o:
        w = csv.writer(o)
        w.writerow(["ContentItemId","code"])

        for r in records:
            # csv yields strings, so the id can be appended to the URL as-is.
            item_id = r.ContentItemId
            url = "https://api.aucklandmuseum.com/id/library/ephemera/" + item_id
            # Redirects are not followed, so a 3xx status is recorded as-is.
            req = requests.get(url, allow_redirects=False)
            w.writerow([item_id, req.status_code])

如何将后面那个循环的执行进度打印到控制台(最好是在完成 25%、50% 和 75% 这几个节点时)?另外,如果我在底部添加一行未缩进的 print("Complete"),程序是否会执行到该行?

提前致谢。


编辑:感谢所有帮助。我的(工作!)代码现在看起来像这样:

import csv
import requests
import pandas
import time
from collections import namedtuple
from tqdm import tqdm

# Check the HTTP status of every id listed in active_true_pub_no.csv, write
# the results to out.csv with a tqdm progress bar, then print a summary of
# how often each status code occurred.
# newline='' is what the csv module asks for on files it reads or writes.
with open('active_true_pub_no.csv', mode='r', newline='') as f:
    reader = csv.reader(f)

    # The header row supplies the namedtuple field names (e.g. ContentItemId).
    all_records = namedtuple('rec', next(reader))
    records = [all_records._make(row) for row in reader]

    with open('out.csv', mode='w+', newline='') as o:
        w = csv.writer(o)
        w.writerow(["ContentItemId","code"])

        num = len(records)
        print("Checking {} records...\n".format(num))

        with tqdm(total=num, bar_format="{percentage:3.0f}% {bar} [{n_fmt}/{total_fmt}]  ", ncols=64) as pbar:
            for r in records:
                item_id = r.ContentItemId
                url = "https://api.aucklandmuseum.com/id/library/ephemera/" + item_id
                # Redirects are not followed, so a 3xx status is recorded as-is.
                req = requests.get(url, allow_redirects=False)
                w.writerow([item_id, req.status_code])
                # Advance the bar only once the record has really been
                # processed, so it never runs ahead of the work.
                pbar.update(1)
                # Optionally add time.sleep(.25) here to throttle requests.

# Both files are closed at this point, so out.csv is fully flushed to disk
# before pandas reads it back.
print ('\nSummary: ')
df = pandas.read_csv("out.csv")
print(df['code'].value_counts())

最后,我用 pandas 的 value_counts 对结果进行了汇总。

只需使用 enumerate 来跟踪您的进度,例如:

total = len(records)
# Map the 1-based record count at which each milestone is first reached to its
# percentage. Integer ceiling division (-(-a // b)) avoids comparing floats
# for equality, which only matches when the total divides evenly by 4. If
# several milestones land on the same count (tiny inputs), the largest wins.
milestones = {-(-total * pct // 100): pct for pct in (25, 50, 75, 100)}
# Count from 1 so the last record reports 100%.
for i, r in enumerate(records, start=1):
    # other stuff...
    w.writerow([id, code])
    # print progress
    if i in milestones:
        print('Writing to disk... {}%'.format(milestones[i]))

我假设您指的是已处理记录的百分比。您也可以在循环中执行 print("Complete")

total = len(records)
reported = 0  # highest quarter milestone (25/50/75) already printed
for count, r in enumerate(records, start=1):
    item_id = r.ContentItemId
    url = "https://api.aucklandmuseum.com/id/library/ephemera/" + item_id
    req = requests.get(url, allow_redirects=False)
    code = req.status_code
    w.writerow([item_id, code])
    if count == total:
        print("Complete")
        continue
    # Whole quarters finished so far, as a percentage: 0, 25, 50 or 75.
    # Integer arithmetic avoids the modulo-by-zero the rounded step size
    # produced for very short lists, and works when the list length is not
    # divisible by 4.
    quarter = (count * 4 // total) * 25
    if quarter > reported:
        reported = quarter
        print("{}%".format(quarter))

[纯 Python 解决方案] 要在同一行上打印百分比进度(即不让输出占满整个屏幕),您可以执行以下操作:

        [...]
        total = len(records)
        # Count from 1 so the final record shows 100.00%.
        for i, r in enumerate(records, start=1):
            item_id = r.ContentItemId
            url = "https://api.aucklandmuseum.com/id/library/ephemera/" + item_id
            req = requests.get(url, allow_redirects=False)
            code = req.status_code
            w.writerow([item_id, code])
            # '\r' moves the cursor back to the start of the line and end=''
            # suppresses the newline, so each update overwrites the last one.
            print("%.2f%% \t\r" % (100 * i / total), end='')
        # Finish the progress line so later output starts on a fresh line.
        print()

要获得进度条,请使用 TQDM:

数据(来自in.csv):

ContentItemId
21200
21201
21202
21203
21204
21205
21206
...
21296
21297
21298
21299
21300

代码:

from collections import namedtuple
import csv
import requests
from tqdm import tqdm


# Check the HTTP status of every id in in.csv and write the results to
# out.csv, showing a tqdm progress bar while the requests run.
# newline='' is what the csv module asks for on files it reads or writes.
with open('in.csv', mode='r', newline='') as f:
    reader = csv.reader(f)

    # The header row supplies the namedtuple field names (e.g. ContentItemId).
    all_records = namedtuple('rec', next(reader))
    records = [all_records._make(row) for row in reader]

    #Create output file
    with open('out.csv', mode='w+', newline='') as o:
        w = csv.writer(o)
        w.writerow(["ContentItemId","code"])

        # total= lets tqdm show a percentage and a done/total counter.
        with tqdm(total=len(records)) as pbar:
            for r in records:
                item_id = r.ContentItemId
                url = "https://api.aucklandmuseum.com/id/library/ephemera/" + item_id
                # Redirects are not followed, so a 3xx status is recorded as-is.
                req = requests.get(url, allow_redirects=False)
                w.writerow([item_id, req.status_code])
                # Advance the bar only after the record has been processed.
                pbar.update(1)
    # Reached once the loop has finished and out.csv has been closed.
    print('Complete!')
  • 注意在 for 循环之前添加了 with tqdm(total=len(records)) as pbar:
  • 从控制台运行时,会出现一个进度条,显示完成百分比。
  • 注意图像左侧的 21/101,这是已处理的记录数相对于 records 列表长度的计数。
    • tqdm 同时提供百分比进度条和 已完成/总数 的计数
# sudo pip3 install tqdm

import time
import tqdm

# Stand-in work items for the demo.
records = ['a', 'b', 'c', 'd', 'e']

# total= gives tqdm the item count it needs for a percentage display.
pbar = tqdm.tqdm(total=len(records), smoothing=0.1)
with pbar:
    for _ in records:
        time.sleep(1)  # simulate one unit of slow work
        pbar.update()  # no argument: advance the bar by one step


它都是相对的,所以让我们做一些通用的数学运算。 :)

# sudo pip3 install tqdm

import time
import tqdm

# Demo: drive a tqdm bar from an external "items remaining" counter. tqdm's
# update() takes an increment, so the absolute amount done is converted into
# the delta since the previous update.
total = 5000
_number_left = 5000
with tqdm.tqdm(total=total, smoothing=0.1) as pbar:
    already_reported = 0  # amount of progress the bar has been told about
    for _ in range(0, 5000, 2):  # 2500 iterations
        time.sleep(0.0005)
        _number_left -= 2  # input from some worker process for example
        done_so_far = total - _number_left
        # Report only what is new since the last update.
        pbar.update(done_so_far - already_reported)
        already_reported = done_so_far