使用 numpy 将巨大的 2 字节字符串数组根据固定映射转换为相应的 1 字节字符串
Use numpy to translate huge array of 2-byte strings to corresponding 1-byte strings according to a fixed mapping
我有一组 12 个不同的 2 字节字符串,根据以下翻译字典映射到一组 12 个对应的 1 字节字符串:
translation_dict = {'AC': '2', 'AG': '3', 'AT': '4',
'CA': '5', 'CG': '6', 'CT': '7',
'GA': '8', 'GC': '9', 'GT': 'a',
'TA': 'b', 'TC': 'c', 'TG': 'd'}
我需要一些方法来将巨大的 numpy.char.array
2 字节字符串转换为相应的 1 字节字符串映射,如下例所示:
>>> input_array = numpy.char.array(['CA', 'CA', 'GC', 'TC', 'AT', 'GT', 'AG', 'CT'])
>>> output_array = some_method(input_array)
>>> output_array
chararray(['5', '5', '9', 'c', '4', 'a', '3', '7'], dtype='S1')
我想知道是否有快速 numpy.char.array 方法来翻译巨大的 2 字节字符串数组;我知道我可以将 'numpy.vectorize' 与一个函数一起使用,该函数显式查找每个 2 字节键的 1 字节字典值,但这相对较慢。我不知道如何使用 numpy.chararray.translate
,尽管它似乎在任何情况下都只适用于 1 字节:1 字节映射。
对于这样的搜索操作,NumPy 有 np.searchsorted
,所以请允许我提出一个方法 -
def search_dic(dic, search_keys):
    """Translate an array of keys to their mapped values via binary search.

    Parameters
    ----------
    dic : dict
        Mapping from key strings to value strings.
    search_keys : array_like
        Keys to translate; every entry is expected to be a key of ``dic``.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``search_keys`` holding ``dic[key]`` for
        each key.

    Notes
    -----
    Keys missing from ``dic`` are not detected: ``np.searchsorted`` only
    returns insertion points, so such a key yields either another key's
    value or an ``IndexError``.
    """
    # Dict views are not sequences on Python 3, so materialise them as
    # arrays before handing them to NumPy.
    k = np.array(list(dic.keys()))
    v = np.array(list(dic.values()))

    # The dict is not necessarily sorted; `sorter` lets searchsorted
    # binary-search the keys in sorted order without reordering them.
    sidx = np.argsort(k)
    idx = np.searchsorted(k, np.asarray(search_keys), sorter=sidx)

    # Map positions in the sorted order back to positions in `v`.
    return np.take(v, sidx[idx])
样本运行-
In [46]: translation_dict = {'AC': '2', 'AG': '3', 'AT': '4',
...: 'CA': '5', 'CG': '6', 'CT': '7',
...: 'GA': '8', 'GC': '9', 'GT': 'a',
...: 'TA': 'b', 'TC': 'c', 'TG': 'd'}
In [47]: s = np.char.array(['CA', 'CA', 'GC', 'TC', 'AT', 'GT', 'AG', 'CT'])
In [48]: search_dic(translation_dict, s)
Out[48]:
array(['5', '5', '9', 'c', '4', 'a', '3', '7'],
dtype='|S1')
不妨只对去重后的唯一元素做字典查找,然后用反向索引还原整个数组:
# Collapse the input to its distinct keys; inv_idx holds, for every input
# element, the position of its key inside uniq.
uniq, inv_idx = np.unique(input_array, return_inverse=True)
# Translate each distinct key once through the dict, then expand the result
# back to the full input length by indexing with inv_idx.
np.array([translation_dict[u] for u in uniq])[inv_idx]
#array(['5', '5', '9', 'c', '4', 'a', '3', '7'],
# dtype='<U1')
基准:
import time
# Benchmark both approaches on one million keys drawn at random from the dict.
x = np.random.choice(list(translation_dict.keys()), 1000000)

# Approach 1: translate the unique keys only, then re-index.
# perf_counter is the clock intended for measuring short durations.
t = time.perf_counter()
uniq, inv_idx = np.unique(x, return_inverse=True)
res = np.array([translation_dict[u] for u in uniq])[inv_idx]
# The elapsed time is a float and must be converted before concatenation.
print("Colonel Beauvel timing is:" + str(time.perf_counter() - t))

# Approach 2: searchsorted-based lookup.
t = time.perf_counter()
res = search_dic(translation_dict, x)
print("Divakar timing is:" + str(time.perf_counter() - t))
#Colonel Beauvel timing is:0.32760000228881836
#Divakar timing is:0.10920000076293945
Divakar 轻松获胜,快了约三倍!
下面是一个使用廉价“哈希”的方法:
import numpy as np
from timeit import timeit
# Fixed mapping from each 2-character key to its 1-character code.
translation_dict = {'AC': '2', 'AG': '3', 'AT': '4',
                    'CA': '5', 'CG': '6', 'CT': '7',
                    'GA': '8', 'GC': '9', 'GT': 'a',
                    'TA': 'b', 'TC': 'c', 'TG': 'd'}
# Split the mapping into two parallel chararrays: keys and values.
keys, values = map(np.char.array, zip(*translation_dict.items()))

# Benchmark input: N keys picked at random, with repetition.
N = 1000000
# The index bound follows the number of keys instead of a hard-coded 12, so
# the sample stays valid if translation_dict gains or loses entries.
mock_data = keys[np.random.randint(0, len(keys), (N,))]
def lookup(hash_fun, td, data):
    """Translate ``data`` through ``td`` using an integer-indexed table.

    Parameters
    ----------
    hash_fun : callable
        Maps an array of keys to non-negative integer codes. It is applied
        to the keys of ``td`` and to ``data``, and must give distinct
        codes for distinct keys.
    td : dict
        Mapping from key strings to value strings.
    data : array
        Keys to translate, in whatever form ``hash_fun`` accepts.

    Returns
    -------
    numpy.chararray
        Value mapped to each entry of ``data``. A data code that falls on
        an unused table slot yields an empty string.

    Raises
    ------
    ValueError
        If ``hash_fun`` assigns the same code to two different keys.
    """
    keys, values = map(np.char.array, zip(*td.items()))
    key_codes = hash_fun(keys)
    # Checked explicitly rather than with `assert`, which is stripped when
    # Python runs with -O.
    if len(set(key_codes)) != len(keys):
        raise ValueError("hash_fun maps distinct keys to the same code")
    data_codes = hash_fun(data)

    # int() keeps the size computation out of narrow NumPy integer types,
    # where `max + 1` could overflow. Zero-filling makes unused slots read
    # back as empty strings instead of uninitialised memory.
    table = np.zeros(int(max(key_codes)) + 1, values.dtype)
    table[key_codes] = values
    return table[data_codes].view(np.chararray)
def hash_12(table):
    """Compute a cheap integer code for every string in ``table``.

    The raw buffer of ``table`` is reinterpreted as unsigned integers whose
    width is a quarter of the element size in bytes, so each element
    contributes two consecutive integers. The code of an element is
    ``(second << 1) + first``.

    Parameters
    ----------
    table : numpy.ndarray
        String array with an element size of 2, 4 or 8 bytes
        (presumably two-character strings - confirm against callers).

    Returns
    -------
    numpy.ndarray
        Unsigned integer codes, one per element of ``table``.

    Raises
    ------
    KeyError
        If the element size of ``table`` is not 2, 4 or 8 bytes.
    """
    unit_by_itemsize = {8: np.uint32, 4: np.uint16, 2: np.uint8}
    code_dtype = unit_by_itemsize[table.dtype.itemsize]
    # Drop any ndarray subclass first, then reinterpret the bytes in place.
    codes = table.view(np.ndarray).view(code_dtype)
    first_units = codes[::2]
    second_units = codes[1::2]
    return (second_units << 1) + first_units
def search_dic(dic, search_keys):
    """Translate an array of keys to their mapped values via binary search.

    Parameters
    ----------
    dic : dict
        Mapping from key strings to value strings.
    search_keys : array_like
        Keys to translate (a plain ndarray or a subclass such as
        chararray); every entry is expected to be a key of ``dic``.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``search_keys`` holding ``dic[key]`` for
        each key.

    Notes
    -----
    Keys missing from ``dic`` are not detected: ``np.searchsorted`` only
    returns insertion points, so such a key yields either another key's
    value or an ``IndexError``.
    """
    # Dict views are not sequences on Python 3, so materialise them as
    # arrays before handing them to NumPy.
    k = np.array(list(dic.keys()))
    v = np.array(list(dic.values()))

    # The dict is not necessarily sorted; `sorter` lets searchsorted
    # binary-search the keys in sorted order without reordering them.
    # np.asarray hands searchsorted a base-class ndarray, as the original
    # `.view(np.ndarray)` did, while also accepting non-array input.
    sidx = np.argsort(k)
    idx = np.searchsorted(k, np.asarray(search_keys), sorter=sidx)

    # Map positions in the sorted order back to positions in `v`.
    return np.take(v, sidx[idx])
def uniq(translation_dict, input_array):
    """Translate ``input_array`` by looking up only its distinct keys.

    Parameters
    ----------
    translation_dict : dict
        Mapping from key strings to value strings.
    input_array : array_like
        Keys to translate.

    Returns
    -------
    numpy.chararray
        Translated value for every element of ``input_array``.

    Raises
    ------
    KeyError
        If a distinct key of ``input_array`` is not in ``translation_dict``.
    """
    distinct_keys, inverse = np.unique(input_array, return_inverse=True)
    # One dict lookup per distinct key, not one per input element.
    translated = []
    for key in distinct_keys:
        translated.append(translation_dict[key])
    # `inverse` expands the per-key results back to the input length.
    return np.char.array(translated)[inverse]
# correctness: the table lookup must agree element-wise with both of the
# other implementations
reference = lookup(hash_12, translation_dict, mock_data)
for alternative in (search_dic, uniq):
    print(np.all(reference == alternative(translation_dict, mock_data)))

# performance: mean wall time per call over 10 runs of each implementation
candidates = (
    ('C_Beauvel {:9.6f} secs', lambda: uniq(translation_dict, mock_data)),
    ('Divakar {:9.6f} secs', lambda: search_dic(translation_dict, mock_data)),
    ('PP {:9.6f} secs', lambda: lookup(hash_12, translation_dict, mock_data)),
)
for template, call in candidates:
    print(template.format(timeit(call, number=10) / 10))
打印:
True
True
C_Beauvel 0.622123 secs
Divakar 0.050903 secs
PP 0.011464 secs
我有一组 12 个不同的 2 字节字符串,根据以下翻译字典映射到一组 12 个对应的 1 字节字符串:
translation_dict = {'AC': '2', 'AG': '3', 'AT': '4',
'CA': '5', 'CG': '6', 'CT': '7',
'GA': '8', 'GC': '9', 'GT': 'a',
'TA': 'b', 'TC': 'c', 'TG': 'd'}
我需要一些方法来将巨大的 numpy.char.array
2 字节字符串转换为相应的 1 字节字符串映射,如下例所示:
>>> input_array = numpy.char.array(['CA', 'CA', 'GC', 'TC', 'AT', 'GT', 'AG', 'CT'])
>>> output_array = some_method(input_array)
>>> output_array
chararray(['5', '5', '9', 'c', '4', 'a', '3', '7'], dtype='S1')
我想知道是否有快速 numpy.char.array 方法来翻译巨大的 2 字节字符串数组;我知道我可以将 'numpy.vectorize' 与一个函数一起使用,该函数显式查找每个 2 字节键的 1 字节字典值,但这相对较慢。我不知道如何使用 numpy.chararray.translate
,尽管它似乎在任何情况下都只适用于 1 字节:1 字节映射。
对于这样的搜索操作,NumPy 有 np.searchsorted
,所以请允许我提出一个方法 -
def search_dic(dic, search_keys):
    """Translate an array of keys to their mapped values via binary search.

    Parameters
    ----------
    dic : dict
        Mapping from key strings to value strings.
    search_keys : array_like
        Keys to translate; every entry is expected to be a key of ``dic``.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``search_keys`` holding ``dic[key]`` for
        each key.

    Notes
    -----
    Keys missing from ``dic`` are not detected: ``np.searchsorted`` only
    returns insertion points, so such a key yields either another key's
    value or an ``IndexError``.
    """
    # Dict views are not sequences on Python 3, so materialise them as
    # arrays before handing them to NumPy.
    k = np.array(list(dic.keys()))
    v = np.array(list(dic.values()))

    # The dict is not necessarily sorted; `sorter` lets searchsorted
    # binary-search the keys in sorted order without reordering them.
    sidx = np.argsort(k)
    idx = np.searchsorted(k, np.asarray(search_keys), sorter=sidx)

    # Map positions in the sorted order back to positions in `v`.
    return np.take(v, sidx[idx])
样本运行-
In [46]: translation_dict = {'AC': '2', 'AG': '3', 'AT': '4',
...: 'CA': '5', 'CG': '6', 'CT': '7',
...: 'GA': '8', 'GC': '9', 'GT': 'a',
...: 'TA': 'b', 'TC': 'c', 'TG': 'd'}
In [47]: s = np.char.array(['CA', 'CA', 'GC', 'TC', 'AT', 'GT', 'AG', 'CT'])
In [48]: search_dic(translation_dict, s)
Out[48]:
array(['5', '5', '9', 'c', '4', 'a', '3', '7'],
dtype='|S1')
不妨只对去重后的唯一元素做字典查找,然后用反向索引还原整个数组:
# Collapse the input to its distinct keys; inv_idx holds, for every input
# element, the position of its key inside uniq.
uniq, inv_idx = np.unique(input_array, return_inverse=True)
# Translate each distinct key once through the dict, then expand the result
# back to the full input length by indexing with inv_idx.
np.array([translation_dict[u] for u in uniq])[inv_idx]
#array(['5', '5', '9', 'c', '4', 'a', '3', '7'],
# dtype='<U1')
基准:
import time
# Benchmark both approaches on one million keys drawn at random from the dict.
x = np.random.choice(list(translation_dict.keys()), 1000000)

# Approach 1: translate the unique keys only, then re-index.
# perf_counter is the clock intended for measuring short durations.
t = time.perf_counter()
uniq, inv_idx = np.unique(x, return_inverse=True)
res = np.array([translation_dict[u] for u in uniq])[inv_idx]
# The elapsed time is a float and must be converted before concatenation.
print("Colonel Beauvel timing is:" + str(time.perf_counter() - t))

# Approach 2: searchsorted-based lookup.
t = time.perf_counter()
res = search_dic(translation_dict, x)
print("Divakar timing is:" + str(time.perf_counter() - t))
#Colonel Beauvel timing is:0.32760000228881836
#Divakar timing is:0.10920000076293945
Divakar 轻松获胜,快了约三倍!
下面是一个使用廉价“哈希”的方法:
import numpy as np
from timeit import timeit
# Fixed mapping from each 2-character key to its 1-character code.
translation_dict = {'AC': '2', 'AG': '3', 'AT': '4',
                    'CA': '5', 'CG': '6', 'CT': '7',
                    'GA': '8', 'GC': '9', 'GT': 'a',
                    'TA': 'b', 'TC': 'c', 'TG': 'd'}
# Split the mapping into two parallel chararrays: keys and values.
keys, values = map(np.char.array, zip(*translation_dict.items()))

# Benchmark input: N keys picked at random, with repetition.
N = 1000000
# The index bound follows the number of keys instead of a hard-coded 12, so
# the sample stays valid if translation_dict gains or loses entries.
mock_data = keys[np.random.randint(0, len(keys), (N,))]
def lookup(hash_fun, td, data):
    """Translate ``data`` through ``td`` using an integer-indexed table.

    Parameters
    ----------
    hash_fun : callable
        Maps an array of keys to non-negative integer codes. It is applied
        to the keys of ``td`` and to ``data``, and must give distinct
        codes for distinct keys.
    td : dict
        Mapping from key strings to value strings.
    data : array
        Keys to translate, in whatever form ``hash_fun`` accepts.

    Returns
    -------
    numpy.chararray
        Value mapped to each entry of ``data``. A data code that falls on
        an unused table slot yields an empty string.

    Raises
    ------
    ValueError
        If ``hash_fun`` assigns the same code to two different keys.
    """
    keys, values = map(np.char.array, zip(*td.items()))
    key_codes = hash_fun(keys)
    # Checked explicitly rather than with `assert`, which is stripped when
    # Python runs with -O.
    if len(set(key_codes)) != len(keys):
        raise ValueError("hash_fun maps distinct keys to the same code")
    data_codes = hash_fun(data)

    # int() keeps the size computation out of narrow NumPy integer types,
    # where `max + 1` could overflow. Zero-filling makes unused slots read
    # back as empty strings instead of uninitialised memory.
    table = np.zeros(int(max(key_codes)) + 1, values.dtype)
    table[key_codes] = values
    return table[data_codes].view(np.chararray)
def hash_12(table):
    """Compute a cheap integer code for every string in ``table``.

    The raw buffer of ``table`` is reinterpreted as unsigned integers whose
    width is a quarter of the element size in bytes, so each element
    contributes two consecutive integers. The code of an element is
    ``(second << 1) + first``.

    Parameters
    ----------
    table : numpy.ndarray
        String array with an element size of 2, 4 or 8 bytes
        (presumably two-character strings - confirm against callers).

    Returns
    -------
    numpy.ndarray
        Unsigned integer codes, one per element of ``table``.

    Raises
    ------
    KeyError
        If the element size of ``table`` is not 2, 4 or 8 bytes.
    """
    unit_by_itemsize = {8: np.uint32, 4: np.uint16, 2: np.uint8}
    code_dtype = unit_by_itemsize[table.dtype.itemsize]
    # Drop any ndarray subclass first, then reinterpret the bytes in place.
    codes = table.view(np.ndarray).view(code_dtype)
    first_units = codes[::2]
    second_units = codes[1::2]
    return (second_units << 1) + first_units
def search_dic(dic, search_keys):
    """Translate an array of keys to their mapped values via binary search.

    Parameters
    ----------
    dic : dict
        Mapping from key strings to value strings.
    search_keys : array_like
        Keys to translate (a plain ndarray or a subclass such as
        chararray); every entry is expected to be a key of ``dic``.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``search_keys`` holding ``dic[key]`` for
        each key.

    Notes
    -----
    Keys missing from ``dic`` are not detected: ``np.searchsorted`` only
    returns insertion points, so such a key yields either another key's
    value or an ``IndexError``.
    """
    # Dict views are not sequences on Python 3, so materialise them as
    # arrays before handing them to NumPy.
    k = np.array(list(dic.keys()))
    v = np.array(list(dic.values()))

    # The dict is not necessarily sorted; `sorter` lets searchsorted
    # binary-search the keys in sorted order without reordering them.
    # np.asarray hands searchsorted a base-class ndarray, as the original
    # `.view(np.ndarray)` did, while also accepting non-array input.
    sidx = np.argsort(k)
    idx = np.searchsorted(k, np.asarray(search_keys), sorter=sidx)

    # Map positions in the sorted order back to positions in `v`.
    return np.take(v, sidx[idx])
def uniq(translation_dict, input_array):
    """Translate ``input_array`` by looking up only its distinct keys.

    Parameters
    ----------
    translation_dict : dict
        Mapping from key strings to value strings.
    input_array : array_like
        Keys to translate.

    Returns
    -------
    numpy.chararray
        Translated value for every element of ``input_array``.

    Raises
    ------
    KeyError
        If a distinct key of ``input_array`` is not in ``translation_dict``.
    """
    distinct_keys, inverse = np.unique(input_array, return_inverse=True)
    # One dict lookup per distinct key, not one per input element.
    translated = []
    for key in distinct_keys:
        translated.append(translation_dict[key])
    # `inverse` expands the per-key results back to the input length.
    return np.char.array(translated)[inverse]
# correctness: the table lookup must agree element-wise with both of the
# other implementations
reference = lookup(hash_12, translation_dict, mock_data)
for alternative in (search_dic, uniq):
    print(np.all(reference == alternative(translation_dict, mock_data)))

# performance: mean wall time per call over 10 runs of each implementation
candidates = (
    ('C_Beauvel {:9.6f} secs', lambda: uniq(translation_dict, mock_data)),
    ('Divakar {:9.6f} secs', lambda: search_dic(translation_dict, mock_data)),
    ('PP {:9.6f} secs', lambda: lookup(hash_12, translation_dict, mock_data)),
)
for template, call in candidates:
    print(template.format(timeit(call, number=10) / 10))
打印:
True
True
C_Beauvel 0.622123 secs
Divakar 0.050903 secs
PP 0.011464 secs