我能否将目录路径转换为可以输入 python hdf5 数据 table 的内容?
Am I able to convert a directory path into something that can be fed into a python hdf5 data table?
我想知道如何将字符串或路径转换成可以输入 hdf5 的内容 table。例如,我从 Pytorch 数据加载器返回一个 numpy img 数组、标签和图像路径,其中图像路径如下所示:
('mults/train/0/5678.ndpi/40x/40x-236247-16634-80384-8704.png',)
我基本上想像这样将其输入 hdf5 table:
hdf5_file = h5py.File(path, mode='w')
hdf5_file.create_dataset(str(phase) + '_img_paths', (len(dataloaders_dict[phase]),))
我不太确定我想做的事情是否可行。也许我将这些数据输入 table 的方式是错误的。
我试过:
hdf5_file.create_dataset(str(phase) + '_img_paths', (len(dataloaders_dict[phase]),),dtype="S10")
但是得到这个错误:
hdf5_file[str(phase) + '_img_paths'][i] = str(paths40x)
File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
File "/anaconda3/lib/python3.6/site-packages/h5py/_hl/dataset.py", line 708, in __setitem__
self.id.write(mspace, fspace, val, mtype, dxpl=self._dxpl)
File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
File "h5py/h5d.pyx", line 211, in h5py.h5d.DatasetID.write
File "h5py/h5t.pyx", line 1652, in h5py.h5t.py_create
File "h5py/h5t.pyx", line 1713, in h5py.h5t.py_create
TypeError: No conversion path for dtype: dtype('<U64')
在保存字符串数据方面,您有两种选择:
- 您可以在 h5py 或 PyTables 中创建标准数据集,并使用任意大的字符串大小。这是最简单的方法,但存在任意大字符串不够大的风险。 :)
- 或者,可以创建可变长度数据集。 PyTables 将此数据集类型称为 VLArray,它使用的对象是 Class VLStringAtom()。 h5py 使用标准数据集,但 dtype 引用 special_dtype(vlen=str)(请注意,如果您使用的是 h5py 2.10,则可以改用 string_dtype())。
我创建了一个示例来展示如何为 PyTables 和 h5py 执行此操作。它是围绕您评论中引用的程序构建的。我没有复制所有代码——只是复制了检索文件名和打乱文件名所必需的代码。此外,我发现的 kaggle 数据集具有不同的目录结构,因此我修改了 cat_dog_train_path 变量以匹配。
from random import shuffle
import glob

shuffle_data = True  # shuffle the addresses before saving

# Raw string so the backslashes are kept literally; a plain string with
# '\P' / '\*' relies on them not being valid escapes and raises a
# SyntaxWarning on newer Python versions.
cat_dog_train_path = r'.\PetImages\*\*.jpg'

# Read addresses from the image folders; the label is derived from the
# directory name ('cat' anywhere in the path => 0, otherwise 1).
addrs = glob.glob(cat_dog_train_path, recursive=True)
print(len(addrs))
labels = [0 if 'cat' in addr else 1 for addr in addrs]  # 0 = Cat, 1 = Dog

# Shuffle addresses and labels together so each path keeps its label.
if shuffle_data:
    c = list(zip(addrs, labels))
    shuffle(c)
    addrs, labels = zip(*c)

# Keep only the first 10% as a small train-only subset (no val/test split).
train_addrs = addrs[0:int(0.1 * len(addrs))]
train_labels = labels[0:int(0.1 * len(labels))]

print('Check glob list data:')
print(train_addrs[0])
print(train_addrs[-1])
import tables as tb

# Create an HDF5 file with PyTables and store the file names in a VLArray
# of variable-length strings, so no fixed maximum string size is needed.
hdf5_path = 'PetImages_data_1.h5'

# tb.open_file() is the documented PyTables entry point for creating files.
with tb.open_file(hdf5_path, mode='w') as h5f:
    train_files_ds = h5f.create_vlarray('/', 'train_files',
                                        atom=tb.VLStringAtom())
    # Loop over train addresses, reporting progress every 500 images.
    for i in range(len(train_addrs)):
        if i % 500 == 0 and i > 1:
            print('Train data: {}/{}'.format(i, len(train_addrs)))
        # VLStringAtom stores bytes, so encode the path before appending.
        train_files_ds.append(train_addrs[i].encode('utf-8'))

# Re-open read-only and spot-check the first and last entries
# (decode back to str, since bytes were stored).
with tb.open_file(hdf5_path, mode='r') as h5f:
    train_files_ds = h5f.root.train_files
    print('Check PyTables data:')
    print(train_files_ds[0].decode('utf-8'))
    print(train_files_ds[-1].decode('utf-8'))
import h5py

# Create an HDF5 file with h5py and store the file names in a dataset with
# a variable-length string dtype (avoids the fixed-size 'S10' truncation
# and the "No conversion path for dtype('<U..')" error).
hdf5_path = 'PetImages_data_2.h5'

with h5py.File(hdf5_path, mode='w') as h5f:
    # Variable-length string dtype; with h5py 2.10+ you can use
    # h5py.string_dtype() instead.
    dt = h5py.special_dtype(vlen=str)
    train_files_ds = h5f.create_dataset('/train_files', (len(train_addrs),),
                                        dtype=dt)
    # Loop over train addresses, reporting progress every 500 images.
    for i in range(len(train_addrs)):
        if i % 500 == 0 and i > 1:
            print('Train data: {}/{}'.format(i, len(train_addrs)))
        train_files_ds[i] = train_addrs[i]

# Re-open read-only and spot-check the first and last entries.
with h5py.File(hdf5_path, mode='r') as h5f:
    train_files_ds = h5f['train_files']
    print('Check h5py data:')
    print(train_files_ds[0])
    print(train_files_ds[-1])
我想知道如何将字符串或路径转换成可以输入 hdf5 的内容 table。例如,我从 Pytorch 数据加载器返回一个 numpy img 数组、标签和图像路径,其中图像路径如下所示:
('mults/train/0/5678.ndpi/40x/40x-236247-16634-80384-8704.png',)
我基本上想像这样将其输入 hdf5 table:
hdf5_file = h5py.File(path, mode='w')
hdf5_file.create_dataset(str(phase) + '_img_paths', (len(dataloaders_dict[phase]),))
我不太确定我想做的事情是否可行。也许我将这些数据输入 table 的方式是错误的。
我试过:
hdf5_file.create_dataset(str(phase) + '_img_paths', (len(dataloaders_dict[phase]),),dtype="S10")
但是得到这个错误:
hdf5_file[str(phase) + '_img_paths'][i] = str(paths40x)
File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
File "/anaconda3/lib/python3.6/site-packages/h5py/_hl/dataset.py", line 708, in __setitem__
self.id.write(mspace, fspace, val, mtype, dxpl=self._dxpl)
File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
File "h5py/h5d.pyx", line 211, in h5py.h5d.DatasetID.write
File "h5py/h5t.pyx", line 1652, in h5py.h5t.py_create
File "h5py/h5t.pyx", line 1713, in h5py.h5t.py_create
TypeError: No conversion path for dtype: dtype('<U64')
在保存字符串数据方面,您有两种选择:
- 您可以在 h5py 或 PyTables 中创建标准数据集,并使用 任意大的字符串大小。这是最简单的方法,但存在任意大字符串不够大的风险。 :)
- 或者,可以创建可变长度数据集。 PyTables 将此数据集类型称为 VLArray,它使用的对象是 Class VLStringAtom()。 h5py 使用标准数据集,但 dtype 引用 special_dtype(vlen=str)(请注意,如果您使用的是 h5py 2.10,则可以改用 string_dtype())。
我创建了一个示例来展示如何为 PyTables 和 h5py 执行此操作。它是围绕您评论中引用的程序构建的。我没有复制所有代码——只是复制了检索文件名和打乱文件名所必需的代码。此外,我发现的 kaggle 数据集具有不同的目录结构,因此我修改了 cat_dog_train_path 变量以匹配。
from random import shuffle
import glob

shuffle_data = True  # shuffle the addresses before saving

# Raw string so the backslashes are kept literally; a plain string with
# '\P' / '\*' relies on them not being valid escapes and raises a
# SyntaxWarning on newer Python versions.
cat_dog_train_path = r'.\PetImages\*\*.jpg'

# Read addresses from the image folders; the label is derived from the
# directory name ('cat' anywhere in the path => 0, otherwise 1).
addrs = glob.glob(cat_dog_train_path, recursive=True)
print(len(addrs))
labels = [0 if 'cat' in addr else 1 for addr in addrs]  # 0 = Cat, 1 = Dog

# Shuffle addresses and labels together so each path keeps its label.
if shuffle_data:
    c = list(zip(addrs, labels))
    shuffle(c)
    addrs, labels = zip(*c)

# Keep only the first 10% as a small train-only subset (no val/test split).
train_addrs = addrs[0:int(0.1 * len(addrs))]
train_labels = labels[0:int(0.1 * len(labels))]

print('Check glob list data:')
print(train_addrs[0])
print(train_addrs[-1])
import tables as tb

# Create an HDF5 file with PyTables and store the file names in a VLArray
# of variable-length strings, so no fixed maximum string size is needed.
hdf5_path = 'PetImages_data_1.h5'

# tb.open_file() is the documented PyTables entry point for creating files.
with tb.open_file(hdf5_path, mode='w') as h5f:
    train_files_ds = h5f.create_vlarray('/', 'train_files',
                                        atom=tb.VLStringAtom())
    # Loop over train addresses, reporting progress every 500 images.
    for i in range(len(train_addrs)):
        if i % 500 == 0 and i > 1:
            print('Train data: {}/{}'.format(i, len(train_addrs)))
        # VLStringAtom stores bytes, so encode the path before appending.
        train_files_ds.append(train_addrs[i].encode('utf-8'))

# Re-open read-only and spot-check the first and last entries
# (decode back to str, since bytes were stored).
with tb.open_file(hdf5_path, mode='r') as h5f:
    train_files_ds = h5f.root.train_files
    print('Check PyTables data:')
    print(train_files_ds[0].decode('utf-8'))
    print(train_files_ds[-1].decode('utf-8'))
import h5py

# Create an HDF5 file with h5py and store the file names in a dataset with
# a variable-length string dtype (avoids the fixed-size 'S10' truncation
# and the "No conversion path for dtype('<U..')" error).
hdf5_path = 'PetImages_data_2.h5'

with h5py.File(hdf5_path, mode='w') as h5f:
    # Variable-length string dtype; with h5py 2.10+ you can use
    # h5py.string_dtype() instead.
    dt = h5py.special_dtype(vlen=str)
    train_files_ds = h5f.create_dataset('/train_files', (len(train_addrs),),
                                        dtype=dt)
    # Loop over train addresses, reporting progress every 500 images.
    for i in range(len(train_addrs)):
        if i % 500 == 0 and i > 1:
            print('Train data: {}/{}'.format(i, len(train_addrs)))
        train_files_ds[i] = train_addrs[i]

# Re-open read-only and spot-check the first and last entries.
with h5py.File(hdf5_path, mode='r') as h5f:
    train_files_ds = h5f['train_files']
    print('Check h5py data:')
    print(train_files_ds[0])
    print(train_files_ds[-1])