在 pandas 中映射 DataFrame(而不是 Series)
mapping dataframes not series pandas
我是 pandas 的新手,我正在尝试映射多列而不是一列。 This page 向我展示了如何使用 pd.Series
,但我不知道如何映射多个 columns
。
这是我的两个 DataFrames
我正在尝试 map
。
data2=pd.DataFrame(np.random.randn(5,2),index=range(0,5),columns=['x','y'])
data2['Cluster']=['A','B','A','B','C']
centers2=pd.DataFrame(np.random.randint(0,10,size=(3,2)),index= ['A','B','C'],columns=['x','y'])
此处 data2
看起来像:
data2
x y Cluster
0 0.151212 -0.168855 A
1 -0.078935 1.933378 B
2 -0.388903 0.444610 A
3 0.622089 1.609730 B
4 -0.346856 1.095834 C
和 centers2
看起来像:
centers2
x y
A 6 4
B 6 0
C 4 1
我希望在 data2
中创建两个单独的列,并使用适当的 centers2
匹配。这是我的手动尝试
data2['Centers.x']=[6,6,6,6,4]
data2['Centers.y']=[4,0,4,0,1]
data2
x y Cluster Centers.x Centers.y
0 0.151212 -0.168855 A 6 4
1 -0.078935 1.933378 B 6 0
2 -0.388903 0.444610 A 6 4
3 0.622089 1.609730 B 6 0
4 -0.346856 1.095834 C 4 1
如何使用 map
函数执行此操作? (我知道如何使用循环来做到这一点,我需要一个矢量化的解决方案。)
print pd.concat([data2.x, data2.y,
data2.Cluster,
data2.Cluster.map(centers2.x.to_dict()),
data2.Cluster.map(centers2.y.to_dict())],
axis=1,
keys=['x','y','Cluster','Centers.x','Centers.y'])
x y Cluster Centers.x Centers.y
0 -0.247322 -0.699005 A 6 5
1 -0.026692 0.551841 B 1 4
2 -1.730480 -0.170510 A 6 5
3 0.814357 -0.204729 B 1 4
4 2.387925 -0.503993 C 1 0
使用 join 的解决方案:
print data2.join(centers2, on='Cluster', rsuffix ='_centers')
x y Cluster x_centers y_centers
0 -0.247322 -0.699005 A 6 5
1 -0.026692 0.551841 B 1 4
2 -1.730480 -0.170510 A 6 5
3 0.814357 -0.204729 B 1 4
4 2.387925 -0.503993 C 1 0
另一种解决方案merge
与join
相同,但增加了2
个参数:
print data2.merge(centers2,
left_on='Cluster',
right_index=True,
suffixes=['', '_centers'],
sort=False,
how='left')
时间:
len(df)=5k
:
data2 = pd.concat([data2]*1000).reset_index(drop=True)
def root(data2, centers2):
    """Row-by-row baseline that looks up each row's cluster center.

    For every row of ``data2`` the value in its ``'Cluster'`` column is used
    as a row label into ``centers2``; the ``'x'`` and ``'y'`` entries found
    there are stored in new ``'Centers.x'`` and ``'Centers.y'`` columns.

    Args:
        data2: DataFrame with a ``'Cluster'`` column. Modified in place.
        centers2: DataFrame indexed by cluster label with ``'x'`` and ``'y'``
            columns.

    Returns:
        The same ``data2`` object, with the two columns added.

    Raises:
        KeyError: if a cluster label is not present in ``centers2.index``.
    """
    # ``.at`` is the label-based scalar accessor; it replaces
    # ``DataFrame.get_value``, which was deprecated in pandas 0.21 and
    # removed in pandas 1.0.
    data2['Centers.x'] = data2.apply(lambda row: centers2.at[row['Cluster'], 'x'], axis=1)
    data2['Centers.y'] = data2.apply(lambda row: centers2.at[row['Cluster'], 'y'], axis=1)
    return data2
In [117]: %timeit root(data2, centers2)
1 loops, best of 3: 267 ms per loop
In [118]: %timeit data2.merge(centers2, left_on='Cluster', right_index=True, suffixes=['', '_centers'], sort=False, how='left')
1000 loops, best of 3: 1.71 ms per loop
In [119]: %timeit data2.join(centers2, on='Cluster', rsuffix ='_centers', sort=False, how='left')
1000 loops, best of 3: 1.71 ms per loop
In [120]: %timeit pd.concat([data2.x, data2.y, data2.Cluster, data2.Cluster.map(centers2.x.to_dict()), data2.Cluster.map(centers2.y.to_dict())], axis=1, keys=['x','y','Cluster','Centers.x','Centers.y'])
100 loops, best of 3: 2.15 ms per loop
In [121]: %timeit data2.merge(centers2, left_on='Cluster', right_index=True, suffixes=['', '_centers']).sort_index()
100 loops, best of 3: 2.68 ms per loop
对于 pd.DataFrame
,.merge()
最接近 pd.Series.map()
。您可以使用 suffixes=[]
关键字将自定义 header 添加到重叠列,例如 suffixes=['', '_centers']
。
注意 pd.Series
没有 .merge()
,并且 pd.DataFrame
没有 .map()
。
与
data2
x y Cluster
0 -1.406449 -0.244859 A
1 1.002103 0.214346 B
2 0.353894 0.353995 A
3 1.249199 -0.661904 B
4 0.623962 -1.754789 C
centers2
x y
A 0 9
B 6 9
C 0 6
你得到:
data2.merge(centers2, left_on='Cluster', right_index=True, suffixes=['', '_centers']).sort_index()
x y Cluster x_centers y_centers
0 -1.406449 -0.244859 A 0 9
1 1.002103 0.214346 B 6 9
2 0.353894 0.353995 A 0 9
3 1.249199 -0.661904 B 6 9
4 0.623962 -1.754789 C 0 6
还有 .join()
选项,这是访问 .merge()
或 pd.concat()
的另一种方式,如果 .merge()
在 index
上进行的话 - 摘自源码:
# Thin wrapper: forwards every argument unchanged (on, how, lsuffix, rsuffix,
# sort) to self._join_compat and returns whatever that call returns.
def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
sort=False):
return self._join_compat(other, on=on, how=how, lsuffix=lsuffix,
rsuffix=rsuffix, sort=sort)
def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
sort=False):
from pandas.tools.merge import merge, concat
if isinstance(other, Series):
if other.name is None:
raise ValueError('Other Series must have a name')
other = DataFrame({other.name: other})
if isinstance(other, DataFrame):
return merge(self, other, left_on=on, how=how,
left_index=on is None, right_index=True,
suffixes=(lsuffix, rsuffix), sort=sort)
else:
if on is not None:
raise ValueError('Joining multiple DataFrames only supported'
' for joining on index')
我是 pandas 的新手,我正在尝试映射多列而不是一列。 This page 向我展示了如何使用 pd.Series
,但我不知道如何映射多个 columns
。
这是我的两个 DataFrames
我正在尝试 map
。
data2=pd.DataFrame(np.random.randn(5,2),index=range(0,5),columns=['x','y'])
data2['Cluster']=['A','B','A','B','C']
centers2=pd.DataFrame(np.random.randint(0,10,size=(3,2)),index= ['A','B','C'],columns=['x','y'])
此处 data2
看起来像:
data2
x y Cluster
0 0.151212 -0.168855 A
1 -0.078935 1.933378 B
2 -0.388903 0.444610 A
3 0.622089 1.609730 B
4 -0.346856 1.095834 C
和 centers2
看起来像:
centers2
x y
A 6 4
B 6 0
C 4 1
我希望在 data2
中创建两个单独的列,并使用适当的 centers2
匹配。这是我的手动尝试
data2['Centers.x']=[6,6,6,6,4]
data2['Centers.y']=[4,0,4,0,1]
data2
x y Cluster Centers.x Centers.y
0 0.151212 -0.168855 A 6 4
1 -0.078935 1.933378 B 6 0
2 -0.388903 0.444610 A 6 4
3 0.622089 1.609730 B 6 0
4 -0.346856 1.095834 C 4 1
如何使用 map
函数执行此操作? (我知道如何使用循环来做到这一点,我需要一个矢量化的解决方案。)
print pd.concat([data2.x, data2.y,
data2.Cluster,
data2.Cluster.map(centers2.x.to_dict()),
data2.Cluster.map(centers2.y.to_dict())],
axis=1,
keys=['x','y','Cluster','Centers.x','Centers.y'])
x y Cluster Centers.x Centers.y
0 -0.247322 -0.699005 A 6 5
1 -0.026692 0.551841 B 1 4
2 -1.730480 -0.170510 A 6 5
3 0.814357 -0.204729 B 1 4
4 2.387925 -0.503993 C 1 0
使用 join 的解决方案:
print data2.join(centers2, on='Cluster', rsuffix ='_centers')
x y Cluster x_centers y_centers
0 -0.247322 -0.699005 A 6 5
1 -0.026692 0.551841 B 1 4
2 -1.730480 -0.170510 A 6 5
3 0.814357 -0.204729 B 1 4
4 2.387925 -0.503993 C 1 0
另一种解决方案merge
与join
相同,但增加了2
个参数:
print data2.merge(centers2,
left_on='Cluster',
right_index=True,
suffixes=['', '_centers'],
sort=False,
how='left')
时间:
len(df)=5k
:
data2 = pd.concat([data2]*1000).reset_index(drop=True)
def root(data2, centers2):
    """Row-by-row baseline that looks up each row's cluster center.

    For every row of ``data2`` the value in its ``'Cluster'`` column is used
    as a row label into ``centers2``; the ``'x'`` and ``'y'`` entries found
    there are stored in new ``'Centers.x'`` and ``'Centers.y'`` columns.

    Args:
        data2: DataFrame with a ``'Cluster'`` column. Modified in place.
        centers2: DataFrame indexed by cluster label with ``'x'`` and ``'y'``
            columns.

    Returns:
        The same ``data2`` object, with the two columns added.

    Raises:
        KeyError: if a cluster label is not present in ``centers2.index``.
    """
    # ``.at`` is the label-based scalar accessor; it replaces
    # ``DataFrame.get_value``, which was deprecated in pandas 0.21 and
    # removed in pandas 1.0.
    data2['Centers.x'] = data2.apply(lambda row: centers2.at[row['Cluster'], 'x'], axis=1)
    data2['Centers.y'] = data2.apply(lambda row: centers2.at[row['Cluster'], 'y'], axis=1)
    return data2
In [117]: %timeit root(data2, centers2)
1 loops, best of 3: 267 ms per loop
In [118]: %timeit data2.merge(centers2, left_on='Cluster', right_index=True, suffixes=['', '_centers'], sort=False, how='left')
1000 loops, best of 3: 1.71 ms per loop
In [119]: %timeit data2.join(centers2, on='Cluster', rsuffix ='_centers', sort=False, how='left')
1000 loops, best of 3: 1.71 ms per loop
In [120]: %timeit pd.concat([data2.x, data2.y, data2.Cluster, data2.Cluster.map(centers2.x.to_dict()), data2.Cluster.map(centers2.y.to_dict())], axis=1, keys=['x','y','Cluster','Centers.x','Centers.y'])
100 loops, best of 3: 2.15 ms per loop
In [121]: %timeit data2.merge(centers2, left_on='Cluster', right_index=True, suffixes=['', '_centers']).sort_index()
100 loops, best of 3: 2.68 ms per loop
对于 pd.DataFrame
,.merge()
最接近 pd.Series.map()
。您可以使用 suffixes=[]
关键字将自定义 header 添加到重叠列,例如 suffixes=['', '_centers']
。
注意 pd.Series
没有 .merge()
,并且 pd.DataFrame
没有 .map()
。
与
data2
x y Cluster
0 -1.406449 -0.244859 A
1 1.002103 0.214346 B
2 0.353894 0.353995 A
3 1.249199 -0.661904 B
4 0.623962 -1.754789 C
centers2
x y
A 0 9
B 6 9
C 0 6
你得到:
data2.merge(centers2, left_on='Cluster', right_index=True, suffixes=['', '_centers']).sort_index()
x y Cluster x_centers y_centers
0 -1.406449 -0.244859 A 0 9
1 1.002103 0.214346 B 6 9
2 0.353894 0.353995 A 0 9
3 1.249199 -0.661904 B 6 9
4 0.623962 -1.754789 C 0 6
还有 .join()
选项,这是访问 .merge()
或 pd.concat()
的另一种方式,如果 .merge()
在 index
上进行的话 - 摘自源码:
# Thin wrapper: forwards every argument unchanged (on, how, lsuffix, rsuffix,
# sort) to self._join_compat and returns whatever that call returns.
def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
sort=False):
return self._join_compat(other, on=on, how=how, lsuffix=lsuffix,
rsuffix=rsuffix, sort=sort)
def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
sort=False):
from pandas.tools.merge import merge, concat
if isinstance(other, Series):
if other.name is None:
raise ValueError('Other Series must have a name')
other = DataFrame({other.name: other})
if isinstance(other, DataFrame):
return merge(self, other, left_on=on, how=how,
left_index=on is None, right_index=True,
suffixes=(lsuffix, rsuffix), sort=sort)
else:
if on is not None:
raise ValueError('Joining multiple DataFrames only supported'
' for joining on index')