多字符串列映射和清理
Multiple string column mapping and cleaning
我有这样一个数据列:
df['zone'].unique()
out[4]:
array(['BROOKLYN', 'BRONX', '07 BRONX', 'Unspecified', '05 BRONX',
'QUEENS', 'MANHATTAN', '07 MANHATTAN', 'STATEN ISLAND',
'17 BROOKLYN', '0 Unspecified', 'Unspecified MANHATTAN',
'12 BROOKLYN', '07 BROOKLYN', '09 MANHATTAN', '01 STATEN ISLAND',
'12 MANHATTAN', '04 QUEENS', '06 BROOKLYN',
'01/04/2016 01:45:00 PM', '01/02/2016 05:43:34 AM', '07 QUEENS',
'11 BRONX', '01/04/2016 03:45:00 PM', '10 MANHATTAN', '03 BRONX',
'04 BRONX', ' or 311 Online."', '01/13/2016 12:00:00 AM',
'04 BROOKLYN', '03 BROOKLYN', '01 QUEENS',
'01/04/2016 03:34:55 PM', '08 MANHATTAN', '14 BROOKLYN',
'10 QUEENS', 'Unspecified STATEN ISLAND', '02 BRONX', '09 BRONX',
'08 QUEENS', '10 BRONX', '03 MANHATTAN', '12 QUEENS',
' please call (212) NEW-YORK (212-639-9675)."',
'Unspecified BROOKLYN', '01/11/2016 04:45:00 PM', '04 MANHATTAN',
'01 BRONX', '09 BROOKLYN', '01/05/2016 07:00:00 AM', '18 BROOKLYN',
'01/08/2016 09:00:00 AM', '01 BROOKLYN', '06 BRONX',
'01 MANHATTAN', '01/06/2016 12:15:00 PM', '02/04/2016 08:45:00 PM',
'01/05/2016 12:45:00 PM', ' no action was taken."', '05 BROOKLYN',
'08 BROOKLYN', 'Unspecified QUEENS', '01/08/2016 03:00:00 PM',
'08/22/2016 12:00:00 AM', '13 BROOKLYN', '02 QUEENS', '14 QUEENS',
'01/05/2016 08:45:00 AM', '11 QUEENS', '02 MANHATTAN',
'01/08/2016 10:05:00 AM', '01/05/2016 01:05:00 PM',
'Unspecified BRONX', '06 QUEENS', '09 QUEENS', '15 BROOKLYN',
'01/07/2016 09:25:00 AM', '02 STATEN ISLAND',
'01/02/2016 12:00:00 PM', '01/06/2016 08:45:00 PM',
'04/04/2016 12:00:00 AM', '01/06/2016 08:30:00 AM'])
如您所见,这一列里混杂了多种类型的值,而 pandas 把它们全部当作字符串对象。我已经在 pd.read_csv 中尝试过一些参数,例如 low_memory=False、chunksize 等,但都没有成功。
我在这里真正需要做的是将此列映射为以下格式:
(Manhattan -> 1, Brooklyn -> 2, Queens -> 3, Staten Island -> 4, Bronx -> 5, Other -> 0)
我还需要把类似“07 BRONX”这样的字符串归为 Bronx,而不是归为“其他”或“未知”。
我考虑过使用 .map() 方法,但由于该列混杂了各种类型的值、非常混乱,我不确定有哪些可行的做法。
如果有任何建议,我将不胜感激。
非常感谢
先创建一个用于映射的字典;用 |(正则表达式中的“或”)把字典的键连接成模式,通过 str.extract 提取出匹配的区名;再用 map 把区名转换为编码;最后用 fillna 把所有未匹配的值映射为 0:
# Sample data: the distinct raw values found in the messy 'zone' column
# (borough names, prefixed borough names, timestamps and stray text).
a = np.array([
    'BROOKLYN', 'BRONX', '07 BRONX', 'Unspecified', '05 BRONX',
    'QUEENS', 'MANHATTAN', '07 MANHATTAN', 'STATEN ISLAND',
    '17 BROOKLYN', '0 Unspecified', 'Unspecified MANHATTAN',
    '12 BROOKLYN', '07 BROOKLYN', '09 MANHATTAN', '01 STATEN ISLAND',
    '12 MANHATTAN', '04 QUEENS', '06 BROOKLYN',
    '01/04/2016 01:45:00 PM', '01/02/2016 05:43:34 AM', '07 QUEENS',
    '11 BRONX', '01/04/2016 03:45:00 PM', '10 MANHATTAN', '03 BRONX',
    '04 BRONX', ' or 311 Online."', '01/13/2016 12:00:00 AM',
    '04 BROOKLYN', '03 BROOKLYN', '01 QUEENS',
    '01/04/2016 03:34:55 PM', '08 MANHATTAN', '14 BROOKLYN',
    '10 QUEENS', 'Unspecified STATEN ISLAND', '02 BRONX', '09 BRONX',
    '08 QUEENS', '10 BRONX', '03 MANHATTAN', '12 QUEENS',
    ' please call (212) NEW-YORK (212-639-9675)."',
    'Unspecified BROOKLYN', '01/11/2016 04:45:00 PM', '04 MANHATTAN',
    '01 BRONX', '09 BROOKLYN', '01/05/2016 07:00:00 AM', '18 BROOKLYN',
    '01/08/2016 09:00:00 AM', '01 BROOKLYN', '06 BRONX',
    '01 MANHATTAN', '01/06/2016 12:15:00 PM', '02/04/2016 08:45:00 PM',
    '01/05/2016 12:45:00 PM', ' no action was taken."', '05 BROOKLYN',
    '08 BROOKLYN', 'Unspecified QUEENS', '01/08/2016 03:00:00 PM',
    '08/22/2016 12:00:00 AM', '13 BROOKLYN', '02 QUEENS', '14 QUEENS',
    '01/05/2016 08:45:00 AM', '11 QUEENS', '02 MANHATTAN',
    '01/08/2016 10:05:00 AM', '01/05/2016 01:05:00 PM',
    'Unspecified BRONX', '06 QUEENS', '09 QUEENS', '15 BROOKLYN',
    '01/07/2016 09:25:00 AM', '02 STATEN ISLAND',
    '01/02/2016 12:00:00 PM', '01/06/2016 08:45:00 PM',
    '04/04/2016 12:00:00 AM', '01/06/2016 08:30:00 AM',
])
df = pd.DataFrame({'zone': a})

# Borough name -> numeric code. Values naming no borough end up as 0 below.
d = {'MANHATTAN': 1, 'BROOKLYN': 2, 'QUEENS': 3, 'STATEN ISLAND': 4, 'BRONX': 5}

# A single capture group alternating over the borough names, so that
# str.extract returns the borough found anywhere in the value (e.g. the
# 'BRONX' in '07 BRONX') or NaN when none is present.
pat = '(' + '|'.join(d) + ')'

# extract -> borough name or NaN; map -> code or NaN; fillna -> 0 for the rest.
# The explicit astype(int) restores an integer column after fillna; it replaces
# the `downcast=` keyword of fillna, which is deprecated in recent pandas.
df['code'] = (
    df['zone']
    .str.extract(pat, expand=False)
    .map(d)
    .fillna(0)
    .astype(int)
)
print(df.head(10))
zone code
0 BROOKLYN 2
1 BRONX 5
2 07 BRONX 5
3 Unspecified 0
4 05 BRONX 5
5 QUEENS 3
6 MANHATTAN 1
7 07 MANHATTAN 1
8 STATEN ISLAND 4
9 17 BROOKLYN 2
我有这样一个数据列:
df['zone'].unique()
out[4]:
array(['BROOKLYN', 'BRONX', '07 BRONX', 'Unspecified', '05 BRONX',
'QUEENS', 'MANHATTAN', '07 MANHATTAN', 'STATEN ISLAND',
'17 BROOKLYN', '0 Unspecified', 'Unspecified MANHATTAN',
'12 BROOKLYN', '07 BROOKLYN', '09 MANHATTAN', '01 STATEN ISLAND',
'12 MANHATTAN', '04 QUEENS', '06 BROOKLYN',
'01/04/2016 01:45:00 PM', '01/02/2016 05:43:34 AM', '07 QUEENS',
'11 BRONX', '01/04/2016 03:45:00 PM', '10 MANHATTAN', '03 BRONX',
'04 BRONX', ' or 311 Online."', '01/13/2016 12:00:00 AM',
'04 BROOKLYN', '03 BROOKLYN', '01 QUEENS',
'01/04/2016 03:34:55 PM', '08 MANHATTAN', '14 BROOKLYN',
'10 QUEENS', 'Unspecified STATEN ISLAND', '02 BRONX', '09 BRONX',
'08 QUEENS', '10 BRONX', '03 MANHATTAN', '12 QUEENS',
' please call (212) NEW-YORK (212-639-9675)."',
'Unspecified BROOKLYN', '01/11/2016 04:45:00 PM', '04 MANHATTAN',
'01 BRONX', '09 BROOKLYN', '01/05/2016 07:00:00 AM', '18 BROOKLYN',
'01/08/2016 09:00:00 AM', '01 BROOKLYN', '06 BRONX',
'01 MANHATTAN', '01/06/2016 12:15:00 PM', '02/04/2016 08:45:00 PM',
'01/05/2016 12:45:00 PM', ' no action was taken."', '05 BROOKLYN',
'08 BROOKLYN', 'Unspecified QUEENS', '01/08/2016 03:00:00 PM',
'08/22/2016 12:00:00 AM', '13 BROOKLYN', '02 QUEENS', '14 QUEENS',
'01/05/2016 08:45:00 AM', '11 QUEENS', '02 MANHATTAN',
'01/08/2016 10:05:00 AM', '01/05/2016 01:05:00 PM',
'Unspecified BRONX', '06 QUEENS', '09 QUEENS', '15 BROOKLYN',
'01/07/2016 09:25:00 AM', '02 STATEN ISLAND',
'01/02/2016 12:00:00 PM', '01/06/2016 08:45:00 PM',
'04/04/2016 12:00:00 AM', '01/06/2016 08:30:00 AM'])
如您所见,这一列里混杂了多种类型的值,而 pandas 把它们全部当作字符串对象。我已经在 pd.read_csv 中尝试过一些参数,例如 low_memory=False、chunksize 等,但都没有成功。
我在这里真正需要做的是将此列映射为以下格式:
(Manhattan -> 1, Brooklyn -> 2, Queens -> 3, Staten Island -> 4, Bronx -> 5, Other -> 0)
我还需要把类似“07 BRONX”这样的字符串归为 Bronx,而不是归为“其他”或“未知”。
我考虑过使用 .map() 方法,但由于该列混杂了各种类型的值、非常混乱,我不确定有哪些可行的做法。
如果有任何建议,我将不胜感激。
非常感谢
先创建一个用于映射的字典;用 |(正则表达式中的“或”)把字典的键连接成模式,通过 str.extract 提取出匹配的区名;再用 map 把区名转换为编码;最后用 fillna 把所有未匹配的值映射为 0:
# Sample data: the distinct raw values found in the messy 'zone' column
# (borough names, prefixed borough names, timestamps and stray text).
a = np.array([
    'BROOKLYN', 'BRONX', '07 BRONX', 'Unspecified', '05 BRONX',
    'QUEENS', 'MANHATTAN', '07 MANHATTAN', 'STATEN ISLAND',
    '17 BROOKLYN', '0 Unspecified', 'Unspecified MANHATTAN',
    '12 BROOKLYN', '07 BROOKLYN', '09 MANHATTAN', '01 STATEN ISLAND',
    '12 MANHATTAN', '04 QUEENS', '06 BROOKLYN',
    '01/04/2016 01:45:00 PM', '01/02/2016 05:43:34 AM', '07 QUEENS',
    '11 BRONX', '01/04/2016 03:45:00 PM', '10 MANHATTAN', '03 BRONX',
    '04 BRONX', ' or 311 Online."', '01/13/2016 12:00:00 AM',
    '04 BROOKLYN', '03 BROOKLYN', '01 QUEENS',
    '01/04/2016 03:34:55 PM', '08 MANHATTAN', '14 BROOKLYN',
    '10 QUEENS', 'Unspecified STATEN ISLAND', '02 BRONX', '09 BRONX',
    '08 QUEENS', '10 BRONX', '03 MANHATTAN', '12 QUEENS',
    ' please call (212) NEW-YORK (212-639-9675)."',
    'Unspecified BROOKLYN', '01/11/2016 04:45:00 PM', '04 MANHATTAN',
    '01 BRONX', '09 BROOKLYN', '01/05/2016 07:00:00 AM', '18 BROOKLYN',
    '01/08/2016 09:00:00 AM', '01 BROOKLYN', '06 BRONX',
    '01 MANHATTAN', '01/06/2016 12:15:00 PM', '02/04/2016 08:45:00 PM',
    '01/05/2016 12:45:00 PM', ' no action was taken."', '05 BROOKLYN',
    '08 BROOKLYN', 'Unspecified QUEENS', '01/08/2016 03:00:00 PM',
    '08/22/2016 12:00:00 AM', '13 BROOKLYN', '02 QUEENS', '14 QUEENS',
    '01/05/2016 08:45:00 AM', '11 QUEENS', '02 MANHATTAN',
    '01/08/2016 10:05:00 AM', '01/05/2016 01:05:00 PM',
    'Unspecified BRONX', '06 QUEENS', '09 QUEENS', '15 BROOKLYN',
    '01/07/2016 09:25:00 AM', '02 STATEN ISLAND',
    '01/02/2016 12:00:00 PM', '01/06/2016 08:45:00 PM',
    '04/04/2016 12:00:00 AM', '01/06/2016 08:30:00 AM',
])
df = pd.DataFrame({'zone': a})

# Borough name -> numeric code. Values naming no borough end up as 0 below.
d = {'MANHATTAN': 1, 'BROOKLYN': 2, 'QUEENS': 3, 'STATEN ISLAND': 4, 'BRONX': 5}

# A single capture group alternating over the borough names, so that
# str.extract returns the borough found anywhere in the value (e.g. the
# 'BRONX' in '07 BRONX') or NaN when none is present.
pat = '(' + '|'.join(d) + ')'

# extract -> borough name or NaN; map -> code or NaN; fillna -> 0 for the rest.
# The explicit astype(int) restores an integer column after fillna; it replaces
# the `downcast=` keyword of fillna, which is deprecated in recent pandas.
df['code'] = (
    df['zone']
    .str.extract(pat, expand=False)
    .map(d)
    .fillna(0)
    .astype(int)
)
print(df.head(10))
zone code
0 BROOKLYN 2
1 BRONX 5
2 07 BRONX 5
3 Unspecified 0
4 05 BRONX 5
5 QUEENS 3
6 MANHATTAN 1
7 07 MANHATTAN 1
8 STATEN ISLAND 4
9 17 BROOKLYN 2