使用 pd.Series 向 Pandas DataFrame 添加列时,第一次迭代返回 NaN 而不是字符串
Add column using pd.Series returns NaN to Pandas DataFrame for the first iteration instead of string
我正在尝试用来自多个 excel 文件的数据填充 pandas DataFrame。我想在 DataFrame 中添加一列 ('dr_nr'),其中包含所有 excel 文件的名称。我接近了,但这并不完全符合我的需要。有人可以帮我吗?
代码
# Question code: read every Excel file found in dir_path, append its rows to the
# accumulator `df`, and try to label each row with its source file name in 'dr_nr'.
#Files are read
dir_path = os.path.dirname(os.path.realpath("pythonfile"))
onlyfiles = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
#filenames are added to empty dataframe
# NOTE(review): if this runs at module level, the `global` statement has no effect.
global df
df = pd.DataFrame()
data = pd.DataFrame()
count = len(onlyfiles)
dr_nr = pd.DataFrame()
for x in range(0, count):
# `== True` binds only to the second endswith() call and changes nothing, since
# endswith() already returns a bool.
if onlyfiles[x].endswith("xlsx") or onlyfiles[x].endswith("xls") == True:
# NOTE(review): as written, '\' is an unterminated string literal; the separator was
# presumably '\\' before the snippet was copied - confirm.
data = pd.DataFrame(pe.get_array(file_name=dir_path + '\' + onlyfiles[x]))
#this is where something goes wrong (Result 1)
# The Series is built on the index of `df` (the accumulator), not of `data` (the frame
# being filled). On the first pass `df` is still empty, so the Series is empty and
# index alignment leaves every row of the first file as NaN. Later passes only get
# values because df's index already contains data's row labels.
data['dr_nr']= pd.Series(str(onlyfiles[x]), index=df.index)
#i tried replacing the line above with: (Result 2)
# With index=None the scalar becomes a one-element Series labelled 0, so index
# alignment fills only row 0 of each file and leaves the remaining rows NaN.
data['dr_nr']= pd.Series(str(onlyfiles[x]), index=None)
# NOTE(review): DataFrame.append was removed in pandas 2.0; this line needs an older pandas.
df = df.append(data, ignore_index=True)
结果 1:
0 1 2 dr_nr
0 A B C NaN
1 A B C NaN
2 A B C NaN
3 A B C NaN
4 A B C NaN
5 A B C File2.xlsx
6 A B C File2.xlsx
7 A B C File2.xlsx
8 A B C File2.xlsx
9 A B C File2.xlsx
10 A B C File3.xlsx
11 A B C File3.xlsx
12 A B C File3.xlsx
13 A B C File3.xlsx
14 A B C File3.xlsx
结果 2:
0 1 2 dr_nr
0 A B C File1.xlsx
1 A B C NaN
2 A B C NaN
3 A B C NaN
4 A B C NaN
5 A B C File2.xlsx
6 A B C NaN
7 A B C NaN
8 A B C NaN
9 A B C NaN
10 A B C File3.xlsx
11 A B C NaN
12 A B C NaN
13 A B C NaN
14 A B C NaN
期望的结果:
0 1 2 dr_nr
0 A B C File1.xlsx
1 A B C File1.xlsx
2 A B C File1.xlsx
3 A B C File1.xlsx
4 A B C File1.xlsx
5 A B C File2.xlsx
6 A B C File2.xlsx
7 A B C File2.xlsx
8 A B C File2.xlsx
9 A B C File2.xlsx
10 A B C File3.xlsx
11 A B C File3.xlsx
12 A B C File3.xlsx
13 A B C File3.xlsx
14 A B C File3.xlsx
其他尝试
# Further attempts from the question. Each line is an independent fragment that uses
# `x`, `data`, `df` and `onlyfiles` from the question's loop.
# NOTE(review): passes a bare string to DataFrame.append - presumably not a supported input; verify.
dr_nr = dr_nr.append(onlyfiles[x], ignore_index=True)
# Still built on the accumulator's index (df.index). fillna runs on the Series before it
# is aligned to `data`, so any NaN introduced by the alignment is not filled.
data['dr_nr']= pd.Series(str(onlyfiles[x][0:19]), index=df.index).fillna(value='Test')
# Rebinds `data` to a new one-column frame: 'dr_nr' holds len(data) (evaluated before the
# rebinding) on rows 0..98.
data = pd.DataFrame({'dr_nr' : len(data)}, index=pd.RangeIndex(start=0, stop=99, step=1))
# NOTE(review): the string literal 'test is missing its closing quote.
data['dr_nr']= data['dr_nr'].fillna(value='test)
# NOTE(review): ignore_index does not appear to be a pd.Series constructor argument - confirm.
data['dr_nr']= pd.Series(str(onlyfiles[x][0:19]), ignore_index=True).fillna("test")
# The return value of fillna is not assigned, so `data` is left unchanged by this line.
data['dr_nr'].fillna("test")
所以我似乎无法重现该问题。不幸的是,结果正是我想要的。
最小可重复样本
import pandas as pd
# Minimal reproducible sample: label the rows of each dataset with the name of
# its source, then stack all datasets into a single frame (container2).
filenames = ['file1', 'file2', 'file3']
dataset1 = [['plastic', 5], ['metal', 3], ['liquid', 8]]
dataset2 = [['Dust', 2], ['Rubber', 1], ['Fibres', 9], ['test', 10]]
dataset3 = [['spam', 2], ['eggs', 1], ['pickles', 9]]
dataset4 = [dataset1, dataset2, dataset3]
# Not used by the loop below; kept so the module-level name `df` from the
# original sample still exists with the same final value.
df = pd.DataFrame(dataset3, columns=['material', 'quantity'])

# Collect the labelled frames in a list and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copied the whole accumulator on every iteration.
frames = []
container = pd.DataFrame()
for name, rows in zip(filenames, dataset4):
    container = pd.DataFrame(rows)
    # Build the Series on the index of the frame being filled (container.index).
    # With index=None the Series has a single row labelled 0, so only the
    # first row of each dataset would receive the name after alignment.
    container['filenames'] = pd.Series(name, index=container.index)
    frames.append(container)

container2 = pd.concat(frames, ignore_index=True)
print(container2)
输出
0 1 filenames
0 plastic 5 file1
1 metal 3 file1
2 liquid 8 file1
3 Dust 2 file2
4 Rubber 1 file2
5 Fibres 9 file2
6 test 10 file2
7 spam 2 file3
8 eggs 1 file3
9 pickles 9 file3
先把文件名放入一个列表,然后遍历这些文件名,逐个读取文件并收集得到的 DataFrame,最后把收集到的所有 DataFrame 连接起来。
import pandas as pd
# Read each file into its own DataFrame, label every row with the file it came
# from, and concatenate all frames once at the end.
files_list = ['data1.csv', 'data2.csv']
df_list = []
for file in files_list:
    df = pd.read_csv(file)
    # Assigning a scalar broadcasts it to every row. This replaces a row-wise
    # apply() that called a lambda per row just to return the same constant,
    # and it also works when the file has no data rows.
    df['dr_nr'] = file
    df_list.append(df)
df_complete = pd.concat(df_list, ignore_index=True)
输出:
A B C dr_nr
0 1 1 1 data1.csv
1 2 2 2 data1.csv
2 3 3 3 data1.csv
3 4 4 4 data2.csv
4 5 5 5 data2.csv
5 6 6 6 data2.csv
我正在尝试用来自多个 excel 文件的数据填充 pandas DataFrame。我想在 DataFrame 中添加一列 ('dr_nr'),其中包含所有 excel 文件的名称。我接近了,但这并不完全符合我的需要。有人可以帮我吗?
代码
# Question code: read every Excel file found in dir_path, append its rows to the
# accumulator `df`, and try to label each row with its source file name in 'dr_nr'.
#Files are read
dir_path = os.path.dirname(os.path.realpath("pythonfile"))
onlyfiles = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
#filenames are added to empty dataframe
# NOTE(review): if this runs at module level, the `global` statement has no effect.
global df
df = pd.DataFrame()
data = pd.DataFrame()
count = len(onlyfiles)
dr_nr = pd.DataFrame()
for x in range(0, count):
# `== True` binds only to the second endswith() call and changes nothing, since
# endswith() already returns a bool.
if onlyfiles[x].endswith("xlsx") or onlyfiles[x].endswith("xls") == True:
# NOTE(review): as written, '\' is an unterminated string literal; the separator was
# presumably '\\' before the snippet was copied - confirm.
data = pd.DataFrame(pe.get_array(file_name=dir_path + '\' + onlyfiles[x]))
#this is where something goes wrong (Result 1)
# The Series is built on the index of `df` (the accumulator), not of `data` (the frame
# being filled). On the first pass `df` is still empty, so the Series is empty and
# index alignment leaves every row of the first file as NaN. Later passes only get
# values because df's index already contains data's row labels.
data['dr_nr']= pd.Series(str(onlyfiles[x]), index=df.index)
#i tried replacing the line above with: (Result 2)
# With index=None the scalar becomes a one-element Series labelled 0, so index
# alignment fills only row 0 of each file and leaves the remaining rows NaN.
data['dr_nr']= pd.Series(str(onlyfiles[x]), index=None)
# NOTE(review): DataFrame.append was removed in pandas 2.0; this line needs an older pandas.
df = df.append(data, ignore_index=True)
结果 1:
0 1 2 dr_nr
0 A B C NaN
1 A B C NaN
2 A B C NaN
3 A B C NaN
4 A B C NaN
5 A B C File2.xlsx
6 A B C File2.xlsx
7 A B C File2.xlsx
8 A B C File2.xlsx
9 A B C File2.xlsx
10 A B C File3.xlsx
11 A B C File3.xlsx
12 A B C File3.xlsx
13 A B C File3.xlsx
14 A B C File3.xlsx
结果 2:
0 1 2 dr_nr
0 A B C File1.xlsx
1 A B C NaN
2 A B C NaN
3 A B C NaN
4 A B C NaN
5 A B C File2.xlsx
6 A B C NaN
7 A B C NaN
8 A B C NaN
9 A B C NaN
10 A B C File3.xlsx
11 A B C NaN
12 A B C NaN
13 A B C NaN
14 A B C NaN
期望的结果:
0 1 2 dr_nr
0 A B C File1.xlsx
1 A B C File1.xlsx
2 A B C File1.xlsx
3 A B C File1.xlsx
4 A B C File1.xlsx
5 A B C File2.xlsx
6 A B C File2.xlsx
7 A B C File2.xlsx
8 A B C File2.xlsx
9 A B C File2.xlsx
10 A B C File3.xlsx
11 A B C File3.xlsx
12 A B C File3.xlsx
13 A B C File3.xlsx
14 A B C File3.xlsx
其他尝试
# Further attempts from the question. Each line is an independent fragment that uses
# `x`, `data`, `df` and `onlyfiles` from the question's loop.
# NOTE(review): passes a bare string to DataFrame.append - presumably not a supported input; verify.
dr_nr = dr_nr.append(onlyfiles[x], ignore_index=True)
# Still built on the accumulator's index (df.index). fillna runs on the Series before it
# is aligned to `data`, so any NaN introduced by the alignment is not filled.
data['dr_nr']= pd.Series(str(onlyfiles[x][0:19]), index=df.index).fillna(value='Test')
# Rebinds `data` to a new one-column frame: 'dr_nr' holds len(data) (evaluated before the
# rebinding) on rows 0..98.
data = pd.DataFrame({'dr_nr' : len(data)}, index=pd.RangeIndex(start=0, stop=99, step=1))
# NOTE(review): the string literal 'test is missing its closing quote.
data['dr_nr']= data['dr_nr'].fillna(value='test)
# NOTE(review): ignore_index does not appear to be a pd.Series constructor argument - confirm.
data['dr_nr']= pd.Series(str(onlyfiles[x][0:19]), ignore_index=True).fillna("test")
# The return value of fillna is not assigned, so `data` is left unchanged by this line.
data['dr_nr'].fillna("test")
所以我似乎无法重现该问题。不幸的是,结果正是我想要的。
最小可重复样本
import pandas as pd
# Minimal reproducible sample: label the rows of each dataset with the name of
# its source, then stack all datasets into a single frame (container2).
filenames = ['file1', 'file2', 'file3']
dataset1 = [['plastic', 5], ['metal', 3], ['liquid', 8]]
dataset2 = [['Dust', 2], ['Rubber', 1], ['Fibres', 9], ['test', 10]]
dataset3 = [['spam', 2], ['eggs', 1], ['pickles', 9]]
dataset4 = [dataset1, dataset2, dataset3]
# Not used by the loop below; kept so the module-level name `df` from the
# original sample still exists with the same final value.
df = pd.DataFrame(dataset3, columns=['material', 'quantity'])

# Collect the labelled frames in a list and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copied the whole accumulator on every iteration.
frames = []
container = pd.DataFrame()
for name, rows in zip(filenames, dataset4):
    container = pd.DataFrame(rows)
    # Build the Series on the index of the frame being filled (container.index).
    # With index=None the Series has a single row labelled 0, so only the
    # first row of each dataset would receive the name after alignment.
    container['filenames'] = pd.Series(name, index=container.index)
    frames.append(container)

container2 = pd.concat(frames, ignore_index=True)
print(container2)
输出
0 1 filenames
0 plastic 5 file1
1 metal 3 file1
2 liquid 8 file1
3 Dust 2 file2
4 Rubber 1 file2
5 Fibres 9 file2
6 test 10 file2
7 spam 2 file3
8 eggs 1 file3
9 pickles 9 file3
先把文件名放入一个列表,然后遍历这些文件名,逐个读取文件并收集得到的 DataFrame,最后把收集到的所有 DataFrame 连接起来。
import pandas as pd
# Read each file into its own DataFrame, label every row with the file it came
# from, and concatenate all frames once at the end.
files_list = ['data1.csv', 'data2.csv']
df_list = []
for file in files_list:
    df = pd.read_csv(file)
    # Assigning a scalar broadcasts it to every row. This replaces a row-wise
    # apply() that called a lambda per row just to return the same constant,
    # and it also works when the file has no data rows.
    df['dr_nr'] = file
    df_list.append(df)
df_complete = pd.concat(df_list, ignore_index=True)
输出:
A B C dr_nr
0 1 1 1 data1.csv
1 2 2 2 data1.csv
2 3 3 3 data1.csv
3 4 4 4 data2.csv
4 5 5 5 data2.csv
5 6 6 6 data2.csv