我该如何处理这个乱七八糟的 df?
How do i wrangle this messy df?
如何把这个由字符串读入的杂乱数据帧,整理成下文末尾所示的目标数据帧?
模块
import pandas as pad
import io
字符串数据
# Raw tab/newline-delimited text: one header line, then three lines per
# record (start time, sport name, stop time), followed by trailing blank lines.
test = '\tStart\t \t \tStop\n\t12/12/20 1400\t\t\nrugby \n\t12/16/20 1359\n\t12/12/20 1300\t\t\nsoccer \n\t12/19/20 0859\n\t12/12/20 1300\t\t\nbasketball \n\t12/19/20 0659\n\n\n\n'
变成pandas数据帧
# Wrap the raw string in a file-like buffer so that read_csv parses its
# contents; a plain str argument is interpreted as a file path / URL.
test_raw = io.StringIO(test)
# Pass the buffer (not the raw string) to the tab-separated parser.
test_df = pad.read_csv(test_raw, sep='\t')
test_df
    Unnamed: 0          Start        .1  Stop
0          NaN  12/12/20 1400  NaN  NaN   NaN
1       rugby             NaN  NaN  NaN   NaN
2          NaN  12/16/20 1359  NaN  NaN   NaN
3          NaN  12/12/20 1300  NaN  NaN   NaN
4      soccer             NaN  NaN  NaN   NaN
5          NaN  12/19/20 0859  NaN  NaN   NaN
6          NaN  12/12/20 1300  NaN  NaN   NaN
7  basketball             NaN  NaN  NaN   NaN
8          NaN  12/19/20 0659  NaN  NaN   NaN
如何把它变成:
           Start       Sport           Stop
0  12/12/20 1400       rugby  12/16/20 1359
1  12/12/20 1300      soccer  12/19/20 0859
2  12/12/20 1300  basketball  12/19/20 0659
提前谢谢@
import re
import pandas as pd
import numpy as np
# Raw tab/newline-delimited text: one header line, then three lines per
# record (start time, sport name, stop time), followed by trailing blank lines.
test = '\tStart\t \t \tStop\n\t12/12/20 1400\t\t\nrugby \n\t12/16/20 1359\n\t12/12/20 1300\t\t\nsoccer \n\t12/19/20 0859\n\t12/12/20 1300\t\t\nbasketball \n\t12/19/20 0659\n\n\n\n'

# step1 split test by \n
alist = test.split('\n')
# ['\tStart\t \t \tStop',
#  '\t12/12/20 1400\t\t',
#  'rugby ',
#  '\t12/16/20 1359',
#  '\t12/12/20 1300\t\t',
#  'soccer ',
#  '\t12/19/20 0859',
#  '\t12/12/20 1300\t\t',
#  'basketball ',
#  '\t12/19/20 0659',
#  '',
#  '',
#  '',
#  '']

# step2 every record occupies three consecutive lines after the header
# (index 0), so each field follows a fixed index rule:
# 1. Start with index of 1, 4, ..., 1+3n
# 2. Sport with index of 2, 5, ..., 2+3n
# 3. Stop with index of 3, 6, ..., 3+3n
Start_col1 = alist[1::3]  # -> ['\t12/12/20 1400\t\t', '\t12/12/20 1300\t\t', '\t12/12/20 1300\t\t', '', '']
Sport_col2 = alist[2::3]  # -> ['rugby ', 'soccer ', 'basketball ', '']
Stop_col3 = alist[3::3]   # -> ['\t12/16/20 1359', '\t12/19/20 0859', '\t12/19/20 0659', '']

# step3 use zip to combine the same location index in a tuple
# (zip stops at the shortest slice, so the extra '' in Start_col1 is ignored)
blist = list(zip(Start_col1, Sport_col2, Stop_col3))
# [('\t12/12/20 1400\t\t', 'rugby ', '\t12/16/20 1359'),
#  ('\t12/12/20 1300\t\t', 'soccer ', '\t12/19/20 0859'),
#  ('\t12/12/20 1300\t\t', 'basketball ', '\t12/19/20 0659'),
#  ('', '', '')]

# step4 convert to dataframe
dfn = pd.DataFrame(blist)
print(dfn)
#                      0            1                2
# 0  \t12/12/20 1400\t\t       rugby   \t12/16/20 1359
# 1  \t12/12/20 1300\t\t      soccer   \t12/19/20 0859
# 2  \t12/12/20 1300\t\t  basketball   \t12/19/20 0659
# 3

# step5 strip surrounding tabs/spaces from every cell.
# A column-wise `.str.strip()` is used instead of `DataFrame.applymap(str.strip)`
# because `applymap` is deprecated since pandas 2.1.
dfn = dfn.apply(lambda column: column.str.strip())
print(dfn)
#                0           1              2
# 0  12/12/20 1400       rugby  12/16/20 1359
# 1  12/12/20 1300      soccer  12/19/20 0859
# 2  12/12/20 1300  basketball  12/19/20 0659
# 3

# step6 delete null rows (the all-empty row produced by the trailing blank lines)
cond = dfn[1] == ''
dfn = dfn[~cond]
print(dfn)
#                0           1              2
# 0  12/12/20 1400       rugby  12/16/20 1359
# 1  12/12/20 1300      soccer  12/19/20 0859
# 2  12/12/20 1300  basketball  12/19/20 0659

# step7 set columns of the DataFrame
dfn.columns = ['Start', 'Sport', 'Stop']
如何把这个由字符串读入的杂乱数据帧,整理成下文末尾所示的目标数据帧?
模块
import pandas as pad
import io
字符串数据
# Raw tab/newline-delimited text: one header line, then three lines per
# record (start time, sport name, stop time), followed by trailing blank lines.
test = '\tStart\t \t \tStop\n\t12/12/20 1400\t\t\nrugby \n\t12/16/20 1359\n\t12/12/20 1300\t\t\nsoccer \n\t12/19/20 0859\n\t12/12/20 1300\t\t\nbasketball \n\t12/19/20 0659\n\n\n\n'
变成pandas数据帧
# Wrap the raw string in a file-like buffer so that read_csv parses its
# contents; a plain str argument is interpreted as a file path / URL.
test_raw = io.StringIO(test)
# Pass the buffer (not the raw string) to the tab-separated parser.
test_df = pad.read_csv(test_raw, sep='\t')
test_df
    Unnamed: 0          Start        .1  Stop
0          NaN  12/12/20 1400  NaN  NaN   NaN
1       rugby             NaN  NaN  NaN   NaN
2          NaN  12/16/20 1359  NaN  NaN   NaN
3          NaN  12/12/20 1300  NaN  NaN   NaN
4      soccer             NaN  NaN  NaN   NaN
5          NaN  12/19/20 0859  NaN  NaN   NaN
6          NaN  12/12/20 1300  NaN  NaN   NaN
7  basketball             NaN  NaN  NaN   NaN
8          NaN  12/19/20 0659  NaN  NaN   NaN
如何把它变成:
           Start       Sport           Stop
0  12/12/20 1400       rugby  12/16/20 1359
1  12/12/20 1300      soccer  12/19/20 0859
2  12/12/20 1300  basketball  12/19/20 0659
提前谢谢@
import re
import pandas as pd
import numpy as np
# Raw tab/newline-delimited text: one header line, then three lines per
# record (start time, sport name, stop time), followed by trailing blank lines.
test = '\tStart\t \t \tStop\n\t12/12/20 1400\t\t\nrugby \n\t12/16/20 1359\n\t12/12/20 1300\t\t\nsoccer \n\t12/19/20 0859\n\t12/12/20 1300\t\t\nbasketball \n\t12/19/20 0659\n\n\n\n'

# step1 split test by \n
alist = test.split('\n')
# ['\tStart\t \t \tStop',
#  '\t12/12/20 1400\t\t',
#  'rugby ',
#  '\t12/16/20 1359',
#  '\t12/12/20 1300\t\t',
#  'soccer ',
#  '\t12/19/20 0859',
#  '\t12/12/20 1300\t\t',
#  'basketball ',
#  '\t12/19/20 0659',
#  '',
#  '',
#  '',
#  '']

# step2 every record occupies three consecutive lines after the header
# (index 0), so each field follows a fixed index rule:
# 1. Start with index of 1, 4, ..., 1+3n
# 2. Sport with index of 2, 5, ..., 2+3n
# 3. Stop with index of 3, 6, ..., 3+3n
Start_col1 = alist[1::3]  # -> ['\t12/12/20 1400\t\t', '\t12/12/20 1300\t\t', '\t12/12/20 1300\t\t', '', '']
Sport_col2 = alist[2::3]  # -> ['rugby ', 'soccer ', 'basketball ', '']
Stop_col3 = alist[3::3]   # -> ['\t12/16/20 1359', '\t12/19/20 0859', '\t12/19/20 0659', '']

# step3 use zip to combine the same location index in a tuple
# (zip stops at the shortest slice, so the extra '' in Start_col1 is ignored)
blist = list(zip(Start_col1, Sport_col2, Stop_col3))
# [('\t12/12/20 1400\t\t', 'rugby ', '\t12/16/20 1359'),
#  ('\t12/12/20 1300\t\t', 'soccer ', '\t12/19/20 0859'),
#  ('\t12/12/20 1300\t\t', 'basketball ', '\t12/19/20 0659'),
#  ('', '', '')]

# step4 convert to dataframe
dfn = pd.DataFrame(blist)
print(dfn)
#                      0            1                2
# 0  \t12/12/20 1400\t\t       rugby   \t12/16/20 1359
# 1  \t12/12/20 1300\t\t      soccer   \t12/19/20 0859
# 2  \t12/12/20 1300\t\t  basketball   \t12/19/20 0659
# 3

# step5 strip surrounding tabs/spaces from every cell.
# A column-wise `.str.strip()` is used instead of `DataFrame.applymap(str.strip)`
# because `applymap` is deprecated since pandas 2.1.
dfn = dfn.apply(lambda column: column.str.strip())
print(dfn)
#                0           1              2
# 0  12/12/20 1400       rugby  12/16/20 1359
# 1  12/12/20 1300      soccer  12/19/20 0859
# 2  12/12/20 1300  basketball  12/19/20 0659
# 3

# step6 delete null rows (the all-empty row produced by the trailing blank lines)
cond = dfn[1] == ''
dfn = dfn[~cond]
print(dfn)
#                0           1              2
# 0  12/12/20 1400       rugby  12/16/20 1359
# 1  12/12/20 1300      soccer  12/19/20 0859
# 2  12/12/20 1300  basketball  12/19/20 0659

# step7 set columns of the DataFrame
dfn.columns = ['Start', 'Sport', 'Stop']