将 HDF5 文件读取为 Dask DataFrame 时出错,原因是什么?
Error when reading HDF5 as a Dask Dataframe, why?
1.我的问题
我在尝试使用 Dask 读取我的 HDF5 文件时遇到了以下错误,但不知道原因:
>>> dd.read_hdf("test.h5", key="/RECORDS/STATES")
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/local/lib/python3.7/site-packages/dask/dataframe/io/hdf.py", line 514, in read_hdf
    for path in paths
  File "/usr/local/lib/python3.7/site-packages/dask/dataframe/io/hdf.py", line 514, in <listcomp>
    for path in paths
  File "/usr/local/lib/python3.7/site-packages/dask/dataframe/io/hdf.py", line 382, in _read_single_hdf
    for k, s, d in zip(keys, stops, divisions)
  File "/usr/local/lib/python3.7/site-packages/dask/dataframe/multi.py", line 1071, in concat
    raise ValueError("No objects to concatenate")
ValueError: No objects to concatenate
2. HDF5 文件
我尝试用 Dask 读取的文件是用 HDF5 的 C API 生成的。之所以用 C 而不是 Python(numpy、pandas)来生成 HDF5,是出于性能考虑,因为我需要解析许多 GB 的未格式化 ASCII 数据。数据以 HDF5 Table (https://portal.hdfgroup.org/display/HDF5/Tables) 的形式存储在文件中。文件的头信息如下所示:
HDF5 "rhoPimpleExtrae10TimeSteps.00.1iter.h5" {
GROUP "/" {
ATTRIBUTE "hdf5_metadata_apps" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "hdf5_metadata_date" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "hdf5_metadata_hwcpu" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SIMPLE { ( 48 ) / ( 48 ) }
}
ATTRIBUTE "hdf5_metadata_hwnodes" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SIMPLE { ( 1 ) / ( 1 ) }
}
ATTRIBUTE "hdf5_metadata_name" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "hdf5_metadata_nodes" {
DATATYPE H5T_STD_I64LE
DATASPACE SIMPLE { ( 1 ) / ( 1 ) }
}
ATTRIBUTE "hdf5_metadata_path" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "hdf5_metadata_threads" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SIMPLE { ( 48 ) / ( 48 ) }
}
ATTRIBUTE "hdf5_metadata_time" {
DATATYPE H5T_STD_I64LE
DATASPACE SCALAR
}
ATTRIBUTE "hdf5_metadata_type" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
GROUP "RECORDS" {
DATASET "COMMUNICATIONS" {
DATATYPE H5T_COMPOUND {
H5T_STD_U32LE "CPU Send ID";
H5T_STD_U32LE "Phy. Task Send ID";
H5T_STD_U32LE "Log. Task Send ID";
H5T_STD_U32LE "Thread Send ID";
H5T_STD_U64LE "Log. Send Time";
H5T_STD_U64LE "Phy. Send Time";
H5T_STD_U32LE "CPU Receive ID";
H5T_STD_U32LE "Phy. Task Receive ID";
H5T_STD_U32LE "Log. Task Receive ID";
H5T_STD_U32LE "Thread Receive ID";
H5T_STD_U64LE "Log. Receive Time";
H5T_STD_U64LE "Phy. Receive Time";
H5T_STD_U64LE "Size";
H5T_STD_U64LE "Tag";
}
DATASPACE SIMPLE { ( 67574 ) / ( H5S_UNLIMITED ) }
ATTRIBUTE "CLASS" {
DATATYPE H5T_STRING {
STRSIZE 6;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_0_NAME" {
DATATYPE H5T_STRING {
STRSIZE 12;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_10_NAME" {
DATATYPE H5T_STRING {
STRSIZE 18;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_11_NAME" {
DATATYPE H5T_STRING {
STRSIZE 18;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_12_NAME" {
DATATYPE H5T_STRING {
STRSIZE 5;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_13_NAME" {
DATATYPE H5T_STRING {
STRSIZE 4;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_1_NAME" {
DATATYPE H5T_STRING {
STRSIZE 18;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_2_NAME" {
DATATYPE H5T_STRING {
STRSIZE 18;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_3_NAME" {
DATATYPE H5T_STRING {
STRSIZE 15;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_4_NAME" {
DATATYPE H5T_STRING {
STRSIZE 15;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_5_NAME" {
DATATYPE H5T_STRING {
STRSIZE 15;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_6_NAME" {
DATATYPE H5T_STRING {
STRSIZE 15;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_7_NAME" {
DATATYPE H5T_STRING {
STRSIZE 21;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_8_NAME" {
DATATYPE H5T_STRING {
STRSIZE 21;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_9_NAME" {
DATATYPE H5T_STRING {
STRSIZE 18;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "TITLE" {
DATATYPE H5T_STRING {
STRSIZE 22;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "VERSION" {
DATATYPE H5T_STRING {
STRSIZE 4;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
}
DATASET "EVENTS" {
DATATYPE H5T_COMPOUND {
H5T_STD_U32LE "CPU ID";
H5T_STD_U16LE "APP ID";
H5T_STD_U32LE "Task ID";
H5T_STD_U32LE "Thread ID";
H5T_STD_U64LE "Time";
H5T_STD_U64LE "Event Type";
H5T_STD_U64LE "Event Value";
}
DATASPACE SIMPLE { ( 3643006 ) / ( H5S_UNLIMITED ) }
ATTRIBUTE "CLASS" {
DATATYPE H5T_STRING {
STRSIZE 6;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_0_NAME" {
DATATYPE H5T_STRING {
STRSIZE 7;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_1_NAME" {
DATATYPE H5T_STRING {
STRSIZE 7;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_2_NAME" {
DATATYPE H5T_STRING {
STRSIZE 8;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_3_NAME" {
DATATYPE H5T_STRING {
STRSIZE 10;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_4_NAME" {
DATATYPE H5T_STRING {
STRSIZE 5;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_5_NAME" {
DATATYPE H5T_STRING {
STRSIZE 11;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_6_NAME" {
DATATYPE H5T_STRING {
STRSIZE 12;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "TITLE" {
DATATYPE H5T_STRING {
STRSIZE 14;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "VERSION" {
DATATYPE H5T_STRING {
STRSIZE 4;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
}
DATASET "STATES" {
DATATYPE H5T_COMPOUND {
H5T_STD_U32LE "CPU ID";
H5T_STD_U16LE "APP ID";
H5T_STD_U32LE "Task ID";
H5T_STD_U32LE "Thread ID";
H5T_STD_U64LE "Time ini";
H5T_STD_U64LE "Time fi";
H5T_STD_U16LE "State";
}
DATASPACE SIMPLE { ( 301496 ) / ( H5S_UNLIMITED ) }
ATTRIBUTE "CLASS" {
DATATYPE H5T_STRING {
STRSIZE 6;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_0_NAME" {
DATATYPE H5T_STRING {
STRSIZE 7;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_1_NAME" {
DATATYPE H5T_STRING {
STRSIZE 7;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_2_NAME" {
DATATYPE H5T_STRING {
STRSIZE 8;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_3_NAME" {
DATATYPE H5T_STRING {
STRSIZE 10;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_4_NAME" {
DATATYPE H5T_STRING {
STRSIZE 9;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_5_NAME" {
DATATYPE H5T_STRING {
STRSIZE 8;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_6_NAME" {
DATATYPE H5T_STRING {
STRSIZE 6;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "TITLE" {
DATATYPE H5T_STRING {
STRSIZE 14;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "VERSION" {
DATATYPE H5T_STRING {
STRSIZE 4;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
}
}
}
}
/RECORDS 下基本上有 3 个数据集(STATES、EVENTS 和 COMMUNICATIONS)。我认为我的 HDF5 文件没有任何奇怪的地方。
我尝试过用 Pandas 和 Dask Array 加载这些数据集,都能正常读取。
3. 我想知道什么
我的 HDF5 文件有什么问题导致 Dask 无法将其作为数据帧读取?
我试图在 Dask 文档中查找 HDF5 文件必须满足的要求,但文档没有涉及这一主题。只要知道我的文件哪里有问题,我就能修复它。
PR https://github.com/dask/dask/pull/6204 最近已合并到 dask 的 master 分支,幸运的是,它正好解决了你遇到的这个问题。
1.我的问题
我在尝试使用 Dask 读取我的 HDF5 文件时遇到了以下错误,但不知道原因:
>>> dd.read_hdf("test.h5", key="/RECORDS/STATES")
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/local/lib/python3.7/site-packages/dask/dataframe/io/hdf.py", line 514, in read_hdf
    for path in paths
  File "/usr/local/lib/python3.7/site-packages/dask/dataframe/io/hdf.py", line 514, in <listcomp>
    for path in paths
  File "/usr/local/lib/python3.7/site-packages/dask/dataframe/io/hdf.py", line 382, in _read_single_hdf
    for k, s, d in zip(keys, stops, divisions)
  File "/usr/local/lib/python3.7/site-packages/dask/dataframe/multi.py", line 1071, in concat
    raise ValueError("No objects to concatenate")
ValueError: No objects to concatenate
2. HDF5 文件
我尝试用 Dask 读取的文件是用 HDF5 的 C API 生成的。之所以用 C 而不是 Python(numpy、pandas)来生成 HDF5,是出于性能考虑,因为我需要解析许多 GB 的未格式化 ASCII 数据。数据以 HDF5 Table (https://portal.hdfgroup.org/display/HDF5/Tables) 的形式存储在文件中。文件的头信息如下所示:
HDF5 "rhoPimpleExtrae10TimeSteps.00.1iter.h5" {
GROUP "/" {
ATTRIBUTE "hdf5_metadata_apps" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "hdf5_metadata_date" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "hdf5_metadata_hwcpu" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SIMPLE { ( 48 ) / ( 48 ) }
}
ATTRIBUTE "hdf5_metadata_hwnodes" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SIMPLE { ( 1 ) / ( 1 ) }
}
ATTRIBUTE "hdf5_metadata_name" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "hdf5_metadata_nodes" {
DATATYPE H5T_STD_I64LE
DATASPACE SIMPLE { ( 1 ) / ( 1 ) }
}
ATTRIBUTE "hdf5_metadata_path" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "hdf5_metadata_threads" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SIMPLE { ( 48 ) / ( 48 ) }
}
ATTRIBUTE "hdf5_metadata_time" {
DATATYPE H5T_STD_I64LE
DATASPACE SCALAR
}
ATTRIBUTE "hdf5_metadata_type" {
DATATYPE H5T_STRING {
STRSIZE H5T_VARIABLE;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_UTF8;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
GROUP "RECORDS" {
DATASET "COMMUNICATIONS" {
DATATYPE H5T_COMPOUND {
H5T_STD_U32LE "CPU Send ID";
H5T_STD_U32LE "Phy. Task Send ID";
H5T_STD_U32LE "Log. Task Send ID";
H5T_STD_U32LE "Thread Send ID";
H5T_STD_U64LE "Log. Send Time";
H5T_STD_U64LE "Phy. Send Time";
H5T_STD_U32LE "CPU Receive ID";
H5T_STD_U32LE "Phy. Task Receive ID";
H5T_STD_U32LE "Log. Task Receive ID";
H5T_STD_U32LE "Thread Receive ID";
H5T_STD_U64LE "Log. Receive Time";
H5T_STD_U64LE "Phy. Receive Time";
H5T_STD_U64LE "Size";
H5T_STD_U64LE "Tag";
}
DATASPACE SIMPLE { ( 67574 ) / ( H5S_UNLIMITED ) }
ATTRIBUTE "CLASS" {
DATATYPE H5T_STRING {
STRSIZE 6;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_0_NAME" {
DATATYPE H5T_STRING {
STRSIZE 12;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_10_NAME" {
DATATYPE H5T_STRING {
STRSIZE 18;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_11_NAME" {
DATATYPE H5T_STRING {
STRSIZE 18;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_12_NAME" {
DATATYPE H5T_STRING {
STRSIZE 5;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_13_NAME" {
DATATYPE H5T_STRING {
STRSIZE 4;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_1_NAME" {
DATATYPE H5T_STRING {
STRSIZE 18;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_2_NAME" {
DATATYPE H5T_STRING {
STRSIZE 18;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_3_NAME" {
DATATYPE H5T_STRING {
STRSIZE 15;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_4_NAME" {
DATATYPE H5T_STRING {
STRSIZE 15;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_5_NAME" {
DATATYPE H5T_STRING {
STRSIZE 15;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_6_NAME" {
DATATYPE H5T_STRING {
STRSIZE 15;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_7_NAME" {
DATATYPE H5T_STRING {
STRSIZE 21;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_8_NAME" {
DATATYPE H5T_STRING {
STRSIZE 21;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_9_NAME" {
DATATYPE H5T_STRING {
STRSIZE 18;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "TITLE" {
DATATYPE H5T_STRING {
STRSIZE 22;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "VERSION" {
DATATYPE H5T_STRING {
STRSIZE 4;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
}
DATASET "EVENTS" {
DATATYPE H5T_COMPOUND {
H5T_STD_U32LE "CPU ID";
H5T_STD_U16LE "APP ID";
H5T_STD_U32LE "Task ID";
H5T_STD_U32LE "Thread ID";
H5T_STD_U64LE "Time";
H5T_STD_U64LE "Event Type";
H5T_STD_U64LE "Event Value";
}
DATASPACE SIMPLE { ( 3643006 ) / ( H5S_UNLIMITED ) }
ATTRIBUTE "CLASS" {
DATATYPE H5T_STRING {
STRSIZE 6;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_0_NAME" {
DATATYPE H5T_STRING {
STRSIZE 7;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_1_NAME" {
DATATYPE H5T_STRING {
STRSIZE 7;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_2_NAME" {
DATATYPE H5T_STRING {
STRSIZE 8;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_3_NAME" {
DATATYPE H5T_STRING {
STRSIZE 10;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_4_NAME" {
DATATYPE H5T_STRING {
STRSIZE 5;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_5_NAME" {
DATATYPE H5T_STRING {
STRSIZE 11;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_6_NAME" {
DATATYPE H5T_STRING {
STRSIZE 12;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "TITLE" {
DATATYPE H5T_STRING {
STRSIZE 14;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "VERSION" {
DATATYPE H5T_STRING {
STRSIZE 4;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
}
DATASET "STATES" {
DATATYPE H5T_COMPOUND {
H5T_STD_U32LE "CPU ID";
H5T_STD_U16LE "APP ID";
H5T_STD_U32LE "Task ID";
H5T_STD_U32LE "Thread ID";
H5T_STD_U64LE "Time ini";
H5T_STD_U64LE "Time fi";
H5T_STD_U16LE "State";
}
DATASPACE SIMPLE { ( 301496 ) / ( H5S_UNLIMITED ) }
ATTRIBUTE "CLASS" {
DATATYPE H5T_STRING {
STRSIZE 6;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_0_NAME" {
DATATYPE H5T_STRING {
STRSIZE 7;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_1_NAME" {
DATATYPE H5T_STRING {
STRSIZE 7;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_2_NAME" {
DATATYPE H5T_STRING {
STRSIZE 8;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_3_NAME" {
DATATYPE H5T_STRING {
STRSIZE 10;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_4_NAME" {
DATATYPE H5T_STRING {
STRSIZE 9;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_5_NAME" {
DATATYPE H5T_STRING {
STRSIZE 8;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "FIELD_6_NAME" {
DATATYPE H5T_STRING {
STRSIZE 6;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "TITLE" {
DATATYPE H5T_STRING {
STRSIZE 14;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
ATTRIBUTE "VERSION" {
DATATYPE H5T_STRING {
STRSIZE 4;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
}
}
}
}
}
/RECORDS 下基本上有 3 个数据集(STATES、EVENTS 和 COMMUNICATIONS)。我认为我的 HDF5 文件没有任何奇怪的地方。我尝试过用 Pandas 和 Dask Array 加载这些数据集,都能正常读取。
3. 我想知道什么
我的 HDF5 文件有什么问题导致 Dask 无法将其作为数据帧读取?
我试图在 Dask 文档中查找 HDF5 文件必须满足的要求,但文档没有涉及这一主题。只要知道我的文件哪里有问题,我就能修复它。
PR https://github.com/dask/dask/pull/6204 最近已合并到 dask 的 master 分支,幸运的是,它正好解决了你遇到的这个问题。