使用 pd.Series 向 Pandas DataFrame 添加列时,第一次迭代得到的是 NaN 而不是字符串

Add column using pd.Series returns NaN to Pandas DataFrame for the first iteration instead of string

我正在尝试用来自多个 Excel 文件的数据填充一个 pandas DataFrame。我想在 DataFrame 中添加一列 ('dr_nr'),用来保存每一行数据所来自的 Excel 文件的名称。我已经接近目标了,但结果还不完全符合我的需要。有人可以帮我吗?

代码

# Collect the names of the regular files located in the directory of the
# resolved path of the literal name "pythonfile".
dir_path = os.path.dirname(os.path.realpath("pythonfile"))
onlyfiles = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]

# Start from empty frames; df accumulates the rows of every workbook read below.
global df
df = pd.DataFrame()
data = pd.DataFrame()
count = len(onlyfiles)
dr_nr = pd.DataFrame()

for x in range(0, count):
    # Only Excel workbooks are processed. The trailing "== True" applies to the
    # second endswith() call only and does not change the outcome.
    if onlyfiles[x].endswith("xlsx") or onlyfiles[x].endswith("xls") == True:
        # NOTE(review): '\' is not a valid string literal as written (the
        # backslash escapes the closing quote); presumably '\\' was intended.
        data = pd.DataFrame(pe.get_array(file_name=dir_path + '\' + onlyfiles[x]))

# This is where something goes wrong (Result 1): the Series is built on
# df.index, the index of the accumulated frame, not on data.index. On the first
# pass df is still empty, so the Series is empty and index alignment leaves
# every row of the first file as NaN.
        data['dr_nr']= pd.Series(str(onlyfiles[x]), index=df.index)
# Replacing the line above with this one gives Result 2: with index=None the
# scalar becomes a one-element Series labelled 0, so only row 0 of each file
# receives the file name and the remaining rows are NaN.
        data['dr_nr']= pd.Series(str(onlyfiles[x]), index=None)


        df = df.append(data, ignore_index=True)

结果 1:

   0  1  2       dr_nr
0   A  B  C         NaN
1   A  B  C         NaN
2   A  B  C         NaN
3   A  B  C         NaN
4   A  B  C         NaN
5   A  B  C  File2.xlsx
6   A  B  C  File2.xlsx
7   A  B  C  File2.xlsx
8   A  B  C  File2.xlsx
9   A  B  C  File2.xlsx
10  A  B  C  File3.xlsx
11  A  B  C  File3.xlsx
12  A  B  C  File3.xlsx
13  A  B  C  File3.xlsx
14  A  B  C  File3.xlsx 

结果 2:

   0  1  2       dr_nr
0   A  B  C  File1.xlsx
1   A  B  C         NaN
2   A  B  C         NaN
3   A  B  C         NaN
4   A  B  C         NaN
5   A  B  C  File2.xlsx
6   A  B  C         NaN
7   A  B  C         NaN
8   A  B  C         NaN
9   A  B  C         NaN
10  A  B  C  File3.xlsx
11  A  B  C         NaN
12  A  B  C         NaN
13  A  B  C         NaN
14  A  B  C         NaN

期望的结果:

    0  1  2       dr_nr
0   A  B  C  File1.xlsx
1   A  B  C  File1.xlsx      
2   A  B  C  File1.xlsx       
3   A  B  C  File1.xlsx       
4   A  B  C  File1.xlsx       
5   A  B  C  File2.xlsx
6   A  B  C  File2.xlsx
7   A  B  C  File2.xlsx
8   A  B  C  File2.xlsx
9   A  B  C  File2.xlsx
10  A  B  C  File3.xlsx
11  A  B  C  File3.xlsx
12  A  B  C  File3.xlsx
13  A  B  C  File3.xlsx
14  A  B  C  File3.xlsx

其他尝试

# Attempt: collect the file names in the separate dr_nr frame.
dr_nr = dr_nr.append(onlyfiles[x], ignore_index=True)
# Attempt: same df.index-based Series as Result 1 (file name cut to its first
# 19 characters), with the resulting NaN values replaced by a placeholder.
data['dr_nr']= pd.Series(str(onlyfiles[x][0:19]), index=df.index).fillna(value='Test')
# Attempt: this rebinds data to a new frame indexed 0..98 whose only column
# holds len(data), so the columns read from the workbook are discarded.
data = pd.DataFrame({'dr_nr' : len(data)}, index=pd.RangeIndex(start=0, stop=99, step=1))            
# NOTE(review): the string literal 'test is missing its closing quote.
data['dr_nr']= data['dr_nr'].fillna(value='test)
# NOTE(review): ignore_index does not appear to be a pd.Series constructor
# argument -- confirm against the pandas documentation.
data['dr_nr']= pd.Series(str(onlyfiles[x][0:19]), ignore_index=True).fillna("test")
# The result of fillna() is not assigned here, so data is left unchanged.
data['dr_nr'].fillna("test")

我尝试用下面的最小示例来重现这个问题,但没有成功:这个示例的输出恰好就是我想要的结果。

最小可重复样本

import pandas as pd
# Minimal reproducible sample: build one frame per dataset, tag every row with
# the name of the "file" it came from, and stack the frames into container2.
filenames = ['file1', 'file2', 'file3']
dataset1 = [['plastic', 5], ['metal', 3], ['liquid', 8]]
dataset2 = [['Dust', 2], ['Rubber', 1], ['Fibres', 9], ['test', 10]]
dataset3 = [['spam', 2], ['eggs', 1], ['pickles', 9]]

dataset4 = [dataset1, dataset2, dataset3]

# Frames are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copies the accumulated frame on every pass.
frames = []
for name, rows in zip(filenames, dataset4):
    container = pd.DataFrame(rows)
    # The Series is indexed by the frame being filled (container.index), so
    # every row receives the name. Using index=None instead would create a
    # one-element Series and leave all rows but the first as NaN.
    container['filenames'] = pd.Series(name, index=container.index)
    frames.append(container)

# ignore_index=True renumbers the rows 0..n-1 across all datasets.
container2 = pd.concat(frames, ignore_index=True)

print(container2)

输出

         0   1 filenames
0  plastic   5     file1
1    metal   3     file1
2   liquid   8     file1
3     Dust   2     file2
4   Rubber   1     file2
5   Fibres   9     file2
6     test  10     file2
7     spam   2     file3
8     eggs   1     file3
9  pickles   9     file3

先把文件名放入一个列表,然后遍历这些文件名,逐个读取并收集对应的 DataFrame,最后把收集到的所有 DataFrame 连接起来。

import pandas as pd

# Files whose rows are combined into a single frame.
files_list = ['data1.csv', 'data2.csv']

# Read each file into its own frame, tag every row with the name of the file
# it came from, and concatenate all frames once at the end.
df_list = []
for file in files_list:
    df = pd.read_csv(file)
    # Assigning a scalar broadcasts it to every row of the column; this avoids
    # calling a Python lambda once per row just to return a constant.
    df['dr_nr'] = file
    df_list.append(df)

# ignore_index=True renumbers the rows 0..n-1 across all files.
df_complete = pd.concat(df_list, ignore_index=True)

输出:

   A  B  C      dr_nr
0  1  1  1  data1.csv
1  2  2  2  data1.csv
2  3  3  3  data1.csv
3  4  4  4  data2.csv
4  5  5  5  data2.csv
5  6  6  6  data2.csv