如何从多个文件创建单个 dask 数组?
How to create a single dask array from multiple files?
我正在尝试从多个文件创建一个 dask array,并使用 dask.array.Array 类来做到这一点。考虑以下代码片段:我生成 100 个形状为 (3, 10, 10) 的随机整数数组,并将它们各自保存到单独的 npy 文件中。然后我尝试创建一个 dask array,把所有这些数组组合成一个形状为 (3, 100, 100) 的 dask array。
import os
from itertools import product

import numpy as np
from dask import array as da
from dask.base import tokenize

# np.save does not create missing directories, so make sure the target exists.
os.makedirs('data', exist_ok=True)

# Write 100 random integer arrays of shape (3, 10, 10), one per .npy file,
# and record a tokenized name derived from each file path.
names = list()
for i in range(100):
    arr = np.random.randint(0, 9, (3, 10, 10))
    fn = 'data/array_{}.npy'.format(i)
    np.save(fn, arr)
    names.append('Array-{}'.format(tokenize(fn)))

# One (row, column) block index per file: a 10 x 10 grid of chunks.
indices = list(product(range(10), range(10)))

dsk = {
    (name, 0, *index): (np.load, name)
    for name, index in zip(names, indices)
}

namex = 'Combined_Array'
dtype = int
shape = (3, 100, 100)
chunks = (3, 10, 10)
# NOTE: this positional call is kept exactly as asked; it is the line that
# raises the TypeError shown in the traceback that follows.
d = da.Array(dsk, namex, chunks, dtype, shape)
不幸的是,它在 normalize_chunks 函数中抛出了错误:
TypeError Traceback (most recent call last)
<ipython-input-4-008559464c9e> in <module>
----> 1 d = da.Array(dsk, namex, chunks, dtype, shape)
~/.conda/envs/Py3Dev/lib/python3.7/site-packages/dask/array/core.py in __new__(cls, dask, name, chunks, dtype, meta, shape)
1026 else:
1027 dt = None
-> 1028 self._chunks = normalize_chunks(chunks, shape, dtype=dt)
1029 if self._chunks is None:
1030 raise ValueError(CHUNKS_NONE_ERROR_MESSAGE)
~/.conda/envs/Py3Dev/lib/python3.7/site-packages/dask/array/core.py in normalize_chunks(chunks, shape, limit, dtype, previous_chunks)
2481 )
2482
-> 2483 return tuple(tuple(int(x) if not math.isnan(x) else x for x in c) for c in chunks)
2484
2485
~/.conda/envs/Py3Dev/lib/python3.7/site-packages/dask/array/core.py in <genexpr>(.0)
2481 )
2482
-> 2483 return tuple(tuple(int(x) if not math.isnan(x) else x for x in c) for c in chunks)
2484
2485
TypeError: 'int' object is not iterable
我是不是做错了什么?
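将参数作为关键字参数传递可以解决这个问题:从上面的 traceback 可以看到,da.Array 的参数列表是 (dask, name, chunks, dtype, meta, shape),示例中没有传入 meta,因此按位置传入的第五个参数 shape 实际上被当作了 meta。代码中还有其他错误:任务图的键必须以数组名 namex 开头,并且传给 np.load 的应当是文件路径,而不是经过 tokenize 处理的名称。正确的代码是: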
import os
from itertools import product

import numpy as np
from dask import array as da

# np.save does not create missing directories, so make sure the target exists.
os.makedirs('data', exist_ok=True)

# Write 100 random integer arrays of shape (3, 10, 10), one per .npy file,
# keeping the file paths so that each graph task can load its own file.
names = list()
for i in range(100):
    arr = np.random.randint(0, 9, (3, 10, 10))
    fn = 'data/array_{}.npy'.format(i)
    np.save(fn, arr)
    names.append(fn)

# One (row, column) block index per file: a 10 x 10 grid of chunks.
indices = list(product(range(10), range(10)))

# Every key starts with the combined array's own name, followed by the block
# index along each of the three axes; the task loads the matching file.
namex = 'Combined_Array'
dsk = {
    (namex, 0, *index): (np.load, name)
    for name, index in zip(names, indices)
}

dtype = int
shape = (3, 100, 100)
chunks = (3, 10, 10)
# Keyword arguments keep shape from being bound to the meta parameter.
d = da.Array(dask=dsk, name=namex, chunks=chunks, dtype=dtype, shape=shape)
详见相关的 GitHub issue。
我正在尝试从多个文件创建一个 dask array,并使用 dask.array.Array 类来做到这一点。考虑以下代码片段:我生成 100 个形状为 (3, 10, 10) 的随机整数数组,并将它们各自保存到单独的 npy 文件中。然后我尝试创建一个 dask array,把所有这些数组组合成一个形状为 (3, 100, 100) 的 dask array。
import os
from itertools import product

import numpy as np
from dask import array as da
from dask.base import tokenize

# np.save does not create missing directories, so make sure the target exists.
os.makedirs('data', exist_ok=True)

# Write 100 random integer arrays of shape (3, 10, 10), one per .npy file,
# and record a tokenized name derived from each file path.
names = list()
for i in range(100):
    arr = np.random.randint(0, 9, (3, 10, 10))
    fn = 'data/array_{}.npy'.format(i)
    np.save(fn, arr)
    names.append('Array-{}'.format(tokenize(fn)))

# One (row, column) block index per file: a 10 x 10 grid of chunks.
indices = list(product(range(10), range(10)))

dsk = {
    (name, 0, *index): (np.load, name)
    for name, index in zip(names, indices)
}

namex = 'Combined_Array'
dtype = int
shape = (3, 100, 100)
chunks = (3, 10, 10)
# NOTE: this positional call is kept exactly as asked; it is the line that
# raises the TypeError shown in the traceback that follows.
d = da.Array(dsk, namex, chunks, dtype, shape)
不幸的是,它在 normalize_chunks 函数中抛出了错误:
TypeError Traceback (most recent call last)
<ipython-input-4-008559464c9e> in <module>
----> 1 d = da.Array(dsk, namex, chunks, dtype, shape)
~/.conda/envs/Py3Dev/lib/python3.7/site-packages/dask/array/core.py in __new__(cls, dask, name, chunks, dtype, meta, shape)
1026 else:
1027 dt = None
-> 1028 self._chunks = normalize_chunks(chunks, shape, dtype=dt)
1029 if self._chunks is None:
1030 raise ValueError(CHUNKS_NONE_ERROR_MESSAGE)
~/.conda/envs/Py3Dev/lib/python3.7/site-packages/dask/array/core.py in normalize_chunks(chunks, shape, limit, dtype, previous_chunks)
2481 )
2482
-> 2483 return tuple(tuple(int(x) if not math.isnan(x) else x for x in c) for c in chunks)
2484
2485
~/.conda/envs/Py3Dev/lib/python3.7/site-packages/dask/array/core.py in <genexpr>(.0)
2481 )
2482
-> 2483 return tuple(tuple(int(x) if not math.isnan(x) else x for x in c) for c in chunks)
2484
2485
TypeError: 'int' object is not iterable
我是不是做错了什么?
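将参数作为关键字参数传递可以解决这个问题:从上面的 traceback 可以看到,da.Array 的参数列表是 (dask, name, chunks, dtype, meta, shape),示例中没有传入 meta,因此按位置传入的第五个参数 shape 实际上被当作了 meta。代码中还有其他错误:任务图的键必须以数组名 namex 开头,并且传给 np.load 的应当是文件路径,而不是经过 tokenize 处理的名称。正确的代码是: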
import os
from itertools import product

import numpy as np
from dask import array as da

# np.save does not create missing directories, so make sure the target exists.
os.makedirs('data', exist_ok=True)

# Write 100 random integer arrays of shape (3, 10, 10), one per .npy file,
# keeping the file paths so that each graph task can load its own file.
names = list()
for i in range(100):
    arr = np.random.randint(0, 9, (3, 10, 10))
    fn = 'data/array_{}.npy'.format(i)
    np.save(fn, arr)
    names.append(fn)

# One (row, column) block index per file: a 10 x 10 grid of chunks.
indices = list(product(range(10), range(10)))

# Every key starts with the combined array's own name, followed by the block
# index along each of the three axes; the task loads the matching file.
namex = 'Combined_Array'
dsk = {
    (namex, 0, *index): (np.load, name)
    for name, index in zip(names, indices)
}

dtype = int
shape = (3, 100, 100)
chunks = (3, 10, 10)
# Keyword arguments keep shape from being bound to the meta parameter.
d = da.Array(dask=dsk, name=namex, chunks=chunks, dtype=dtype, shape=shape)
详见相关的 GitHub issue。