如何使用 OpenPyXL 遍历 Excel Table 中的所有行?
How to iterate over all rows in an Excel Table using OpenPyXL?
OpenPyXL 关于 Excel 表格(Table)的文档没有说明如何遍历 table 中的值(参见此处)。
这样做的有效方法是什么?
我写了下面这个函数来实现这一点。
from typing import Any, Dict, Generator
from openpyxl import load_workbook
from openpyxl.worksheet.table import Table
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.utils import rows_from_range
# Type alias for one table row: a dictionary with string keys and arbitrary
# values (used below as the item type yielded by iter_table_rows).
TableRow = Dict[str, Any]
def iter_table_rows(ws:Worksheet, tb:Table) -> Generator[TableRow, None, None]:
    """Yield every data row of a table as a ``{header: value}`` dictionary.

    The first row of ``tb.ref`` is read as the header row; each following
    row is paired with those headers. Cell values are looked up in ``ws``
    by the cell references produced by ``rows_from_range``.

    Args:
        ws: worksheet the cell values are read from.
        tb: table whose ``ref`` range is iterated.

    Yields:
        One dictionary per row after the header row.
    """
    coordinate_rows = rows_from_range(tb.ref)
    # The first row of the range supplies the dictionary keys.
    header_names = [ws[coordinate].value for coordinate in next(coordinate_rows)]
    for coordinates in coordinate_rows:
        row_values = [ws[coordinate].value for coordinate in coordinates]
        yield dict(zip(header_names, row_values))
# Example usage: print each row of the table "MyTable" found on the
# workbook's active sheet.
workbook = load_workbook("my_file.xlsx")
sheet = workbook.active
table = sheet.tables["MyTable"]
for record in iter_table_rows(sheet, table):
    print(record)
感谢您提供的函数。我添加了一个 ws 参数,因为 ws 不是全局变量,
而 get_row_values 需要用到它。
def iter_table_rows(ws:Worksheet, tb:Table) -> Generator[TableRow, None, None]:
    """Generate one dictionary per table row, keyed by the header cells.

    The range ``tb.ref`` is walked row by row. The first row provides the
    keys; every later row provides the values of one yielded dictionary.

    Args:
        ws: worksheet that holds the cells of the table.
        tb: table whose ``ref`` range is read.

    Yields:
        A ``{header: value}`` dictionary for each row after the header row.
    """
    def cell_values(sheet:Worksheet, cell_refs:tuple):
        # The worksheet is passed in explicitly rather than captured.
        return [sheet[cell_ref].value for cell_ref in cell_refs]

    remaining_rows = rows_from_range(tb.ref)
    # Consume the header row first; the loop below only sees data rows.
    header_names = cell_values(ws, next(remaining_rows))
    for cell_refs in remaining_rows:
        row_values = cell_values(ws, cell_refs)
        yield dict(zip(header_names, row_values))
目前我还没有看到其他方法。
而且 table 没有“父”属性,无法得知它属于哪个工作表。
好吧,最后我得出了一个不同的解决方案。
下面是把 xlsx table 中的数据提取到 json 文件的函数代码。
我猜它更接近 openpyxl 的理念:数据存储在工作表中,table 似乎只是一种指向范围的指针。在工作表中读取数据最有效的方法是使用 ws.iter_rows,并通过 min_row、max_row、min_col、max_col 指定范围。
import json
import logging
from argparse import Namespace
from openpyxl import load_workbook
from openpyxl.worksheet.cell_range import CellRange
def xlsx_Json(args: Namespace):
    """Extract table values from an excel workbook in a json file

    The output file contains one object ``{<table name>: [row, ...]}``.
    Each row maps a column name to the cell value converted with ``str``
    (so an empty cell is written as the string ``"None"``).

    Args:
        args (Namespace): parsed arguments
            args.workbook    : filename of the excel document
            args.worksheet   : name of the sheet to read
            args.table       : name of a table in worksheet
            args.json_output : name of the output file (json format)

    Note:
        It's assumed that table exists in worksheet, and worksheet exists
        in workbook, and file exists
    """
    # NOTE: read_only=True is deliberately not used; the original author
    # observed that a read-only worksheet does not seem to expose tables.
    wb = load_workbook(filename=args.workbook)
    ws = wb[args.worksheet]
    table = ws.tables[args.table]

    # table.ref is converted to a CellRange so that its row/column bounds
    # can be used to read the data from the worksheet.
    table_range = CellRange(table.ref)
    header = table.column_names
    rows_json = []
    data_rows = ws.iter_rows(
        min_row=table_range.min_row + 1,  # +1 skips the header row
        min_col=table_range.min_col,
        max_row=table_range.max_row,
        max_col=table_range.max_col,
        values_only=True,
    )
    for nb_rows, row in enumerate(data_rows):
        if nb_rows == 0:
            # Log the type and value of each column once, for the first row.
            for h, x in zip(header, row):
                logging.info(f"{h}:type-{type(x)}:value:{x}")
        if nb_rows % 100 == 0:
            # Progress message every 100 rows.
            logging.info(f"rows:{nb_rows}")
        # Values are converted to str because some of them (e.g. datetime)
        # are not directly JSON serializable.
        rows_json.append(dict(zip(header, [str(x) for x in row])))

    tableJson = {args.table: rows_json}
    with open(args.json_output, "w") as file_out:
        json.dump(tableJson, file_out, indent=4)
    # Report the real number of rows: the enumerate index is one less than
    # the count and is undefined when the table has no data rows.
    logging.info(f"{len(rows_json)} elements written in {args.json_output}")
OpenPyXL 关于 Excel 表格(Table)的文档没有说明如何遍历 table 中的值(参见此处)。
这样做的有效方法是什么?
我写了下面这个函数来实现这一点。
from typing import Any, Dict, Generator
from openpyxl import load_workbook
from openpyxl.worksheet.table import Table
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.utils import rows_from_range
# Type alias for one table row: a dictionary with string keys and arbitrary
# values (used below as the item type yielded by iter_table_rows).
TableRow = Dict[str, Any]
def iter_table_rows(ws:Worksheet, tb:Table) -> Generator[TableRow, None, None]:
    """Yield every data row of a table as a ``{header: value}`` dictionary.

    The first row of ``tb.ref`` is read as the header row; each following
    row is paired with those headers. Cell values are looked up in ``ws``
    by the cell references produced by ``rows_from_range``.

    Args:
        ws: worksheet the cell values are read from.
        tb: table whose ``ref`` range is iterated.

    Yields:
        One dictionary per row after the header row.
    """
    coordinate_rows = rows_from_range(tb.ref)
    # The first row of the range supplies the dictionary keys.
    header_names = [ws[coordinate].value for coordinate in next(coordinate_rows)]
    for coordinates in coordinate_rows:
        row_values = [ws[coordinate].value for coordinate in coordinates]
        yield dict(zip(header_names, row_values))
# Example usage: print each row of the table "MyTable" found on the
# workbook's active sheet.
workbook = load_workbook("my_file.xlsx")
sheet = workbook.active
table = sheet.tables["MyTable"]
for record in iter_table_rows(sheet, table):
    print(record)
感谢您提供的函数。我添加了一个 ws 参数,因为 ws 不是全局变量,而 get_row_values 需要用到它。
def iter_table_rows(ws:Worksheet, tb:Table) -> Generator[TableRow, None, None]:
    """Generate one dictionary per table row, keyed by the header cells.

    The range ``tb.ref`` is walked row by row. The first row provides the
    keys; every later row provides the values of one yielded dictionary.
    The worksheet is handed explicitly to the inner helper, which needs it
    to read the cell values.

    Args:
        ws: worksheet that holds the cells of the table.
        tb: table whose ``ref`` range is read.

    Yields:
        A ``{header: value}`` dictionary for each row after the header row.
    """
    def cell_values(sheet:Worksheet, cell_refs:tuple):
        # The worksheet is passed in explicitly rather than captured.
        return [sheet[cell_ref].value for cell_ref in cell_refs]

    remaining_rows = rows_from_range(tb.ref)
    # Consume the header row first; the loop below only sees data rows.
    header_names = cell_values(ws, next(remaining_rows))
    for cell_refs in remaining_rows:
        row_values = cell_values(ws, cell_refs)
        yield dict(zip(header_names, row_values))
目前我还没有看到其他方法。
而且 table 没有“父”属性,无法得知它属于哪个工作表。
好吧,最后我得出了一个不同的解决方案。
下面是把 xlsx table 中的数据提取到 json 文件的函数代码。
我猜它更接近 openpyxl 的理念:数据存储在工作表中,table 似乎只是一种指向范围的指针。在工作表中读取数据最有效的方法是使用 ws.iter_rows,并通过 min_row、max_row、min_col、max_col 指定范围。
import json
import logging
from argparse import Namespace
from openpyxl import load_workbook
from openpyxl.worksheet.cell_range import CellRange
def xlsx_Json(args: Namespace):
    """Extract table values from an excel workbook in a json file

    The output file contains one object ``{<table name>: [row, ...]}``.
    Each row maps a column name to the cell value converted with ``str``
    (so an empty cell is written as the string ``"None"``).

    Args:
        args (Namespace): parsed arguments
            args.workbook    : filename of the excel document
            args.worksheet   : name of the sheet to read
            args.table       : name of a table in worksheet
            args.json_output : name of the output file (json format)

    Note:
        It's assumed that table exists in worksheet, and worksheet exists
        in workbook, and file exists
    """
    # NOTE: read_only=True is deliberately not used; the original author
    # observed that a read-only worksheet does not seem to expose tables.
    wb = load_workbook(filename=args.workbook)
    ws = wb[args.worksheet]
    table = ws.tables[args.table]

    # table.ref is converted to a CellRange so that its row/column bounds
    # can be used to read the data from the worksheet.
    table_range = CellRange(table.ref)
    header = table.column_names
    rows_json = []
    data_rows = ws.iter_rows(
        min_row=table_range.min_row + 1,  # +1 skips the header row
        min_col=table_range.min_col,
        max_row=table_range.max_row,
        max_col=table_range.max_col,
        values_only=True,
    )
    for nb_rows, row in enumerate(data_rows):
        if nb_rows == 0:
            # Log the type and value of each column once, for the first row.
            for h, x in zip(header, row):
                logging.info(f"{h}:type-{type(x)}:value:{x}")
        if nb_rows % 100 == 0:
            # Progress message every 100 rows.
            logging.info(f"rows:{nb_rows}")
        # Values are converted to str because some of them (e.g. datetime)
        # are not directly JSON serializable.
        rows_json.append(dict(zip(header, [str(x) for x in row])))

    tableJson = {args.table: rows_json}
    with open(args.json_output, "w") as file_out:
        json.dump(tableJson, file_out, indent=4)
    # Report the real number of rows: the enumerate index is one less than
    # the count and is undefined when the table has no data rows.
    logging.info(f"{len(rows_json)} elements written in {args.json_output}")