替换嵌套列表中的“NA”
Replacing 'NA's in a nested list
我正在尝试执行以下操作:确定嵌套列表中是否存在 'NA' 值,如果存在,则将其替换为列表中其他元素之和的平均值.列表的元素应该是浮点数。例如:
[["1.2","3.1","0.2"],["44.0","NA","90.0"]]
应该return
[[1.2, 3.1, 0.2], [44.0, 67.0, 90.0]]
下面的代码虽然又长又冗余,但有效:
def convert_data(data):
first = []
second = []
third = []
fourth = []
count = 0
for i in data:
for y in i:
if 'NA' not in i:
y = float(y)
first.append(y)
elif 'NA' in i:
a = i.index('NA')
second.append(y)
second[a] = 0
for q in second:
q = float(q)
third.append(q)
count+= q
length = len(third)
count = count/(length-1)
third[a] = count
fourth.extend([first,third])
return fourth
data = [["1.2","3.1","0.2"],["44.0","NA","90.0"]]
convert_data(data)
例如:
data = [["1.2","3.1","0.2"],["44.0","NA","90.0"]]
convert_data(data)
returns 所需的输出:
[[1.2, 3.1, 0.2], [44.0, 67.0, 90.0]]
但是如果 'NA' 在第一个列表中,例如
data = [["1.2","NA","0.2"],["44.0","67.00","90.0"]]
那就不是了。有人可以解释一下如何解决这个问题吗?
我建议使用 pandas 功能,因为这些类型的操作正是 pandas 的开发目的。只需几行代码就可以简单地实现你想要的:
import pandas as pd
data = [["1.2","NA","0.2"],["44.0","67.00","90.0"]]
df = pd.DataFrame(data).T.replace("NA", pd.np.nan).astype('<f8')
res = df.fillna(df.mean()).T.values.tolist()
哪个 returns 想要的输出:
[[1.2, 0.7, 0.2], [44.0, 67.0, 90.0]]
顺便说一句,在这个简单的例子中,你的代码对我来说工作得很好:
convert_data(data)
> [[44.0, 67.0, 90.0], [1.2, 0.7, 0.2]]
在更复杂的情况下,它肯定会开始失败或给出错误的结果,f.e。如果嵌套列表中的值超过 1 个 "NA"
,则会出现 ValueError 异常(您将尝试将字符串转换为浮点数)。
您的代码最终变得有点过于复杂的一个原因是您试图从解决 "nested list." 的问题开始,但实际上,您所需要的只是一个处理数字字符串列表的函数,其中包含一些"NA" 个值,然后您可以将该函数应用于列表中的每个项目。
def float_or_average(list_of_num_strings):
# First, convert every item that you can to a number. You need to do this
# before you can handle even ONE "NA" value, because the "NA" values need
# to be replaced with the average of all the numbers in the collection.
# So for now, convert ["1.2", "NA", "2.0"] to [1.2, "NA", 2.0]
parsed = []
# While we're at it, let's record the sum of the floats and their count,
# so that we can compute that average.
numeric_sum = 0.0
numeric_count = 0
for item in list_of_num_strings:
if item == "NA":
parsed.append(item)
else:
floating_point_value = float(item)
parsed.append(floating_point_value)
numeric_sum += floating_point_value
numeric_count += 1
# Now we can calculate the average:
average = numeric_sum / numeric_count
# And replace the "NA" values with them.
for i, item in enumerate(parsed):
if item == "NA":
parsed[i] == average
return parsed
# Or, with a list comprehension (replacing the previous four lines of
# code):
return [number if number != "NA" else average for number in parsed]
# Using this function on a nested list is as easy as
example_data = [["1.2", "3.1", "0.2"], ["44.0", "NA", "90.0"]]
parsed_nested_list = []
for sublist in example_data:
parsed_nested_list.append(float_or_average(sublist))
# Or, using a list comprehension (replacing the previous three lines of code):
parsed_nested_list = [float_or_average(sublist) for sublist in example_data]
def convert_data(data):
for lst in data:
sum = 0
index_na = list()
for elem in range(len(lst)):
if lst[elem] != 'NA':
sum += float(lst[elem])
lst[elem] = float(lst[elem])
else:
index_na.append(elem)
if len(index_na) > 0:
len_values = sum / (len(lst)-len(index_na))
for i in index_na:
lst[i] = float("{0:.2f}".format(len_values))
return data
这应该可以解决问题,使用 numpy
:
import numpy as np
x=[["1.2","3.1","0.2"],["44.0","NA","90.0"]]
#convert to float
x=np.char.replace(np.array(x), "NA", "nan").astype(np.float)
#replace nan-s with mean
mask=x.astype(str)=="nan"
x[mask]=np.nanmean(x, axis=1)[mask.any(axis=1)]
输出:
[[ 1.2 3.1 0.2]
[44. 67. 90. ]]
data_var = [["1.2", "3.1", "0.2"], ["44.0", "NA", "90.0"]]
def replace_na_with_mean(list_entry):
for i in range(len(list_entry)):
index_list = []
m = 0
while 'NA' in list_entry[i]:
index_list.append(list_entry[i].index('NA') + m)
del list_entry[i][list_entry[i].index('NA')]
if list_entry[i]:
for n in range(len(list_entry[i])):
list_entry[i][n] = float(list_entry[i][n])
if index_list:
if list_entry[i]:
avg = sum(list_entry[i]) / len(list_entry[i])
else:
avg = 0
for l in index_list:
list_entry[i].insert(l, avg)
return list_entry
print(replace_na_with_mean(data_var))
我正在尝试执行以下操作:确定嵌套列表中是否存在 'NA' 值,如果存在,则将其替换为列表中其他元素之和的平均值.列表的元素应该是浮点数。例如:
[["1.2","3.1","0.2"],["44.0","NA","90.0"]]
应该return
[[1.2, 3.1, 0.2], [44.0, 67.0, 90.0]]
下面的代码虽然又长又冗余,但有效:
def convert_data(data):
first = []
second = []
third = []
fourth = []
count = 0
for i in data:
for y in i:
if 'NA' not in i:
y = float(y)
first.append(y)
elif 'NA' in i:
a = i.index('NA')
second.append(y)
second[a] = 0
for q in second:
q = float(q)
third.append(q)
count+= q
length = len(third)
count = count/(length-1)
third[a] = count
fourth.extend([first,third])
return fourth
data = [["1.2","3.1","0.2"],["44.0","NA","90.0"]]
convert_data(data)
例如:
data = [["1.2","3.1","0.2"],["44.0","NA","90.0"]]
convert_data(data)
returns 所需的输出:
[[1.2, 3.1, 0.2], [44.0, 67.0, 90.0]]
但是如果 'NA' 在第一个列表中,例如
data = [["1.2","NA","0.2"],["44.0","67.00","90.0"]]
那就不是了。有人可以解释一下如何解决这个问题吗?
我建议使用 pandas 功能,因为这些类型的操作正是 pandas 的开发目的。只需几行代码就可以简单地实现你想要的:
import pandas as pd
data = [["1.2","NA","0.2"],["44.0","67.00","90.0"]]
df = pd.DataFrame(data).T.replace("NA", pd.np.nan).astype('<f8')
res = df.fillna(df.mean()).T.values.tolist()
哪个 returns 想要的输出:
[[1.2, 0.7, 0.2], [44.0, 67.0, 90.0]]
顺便说一句,在这个简单的例子中,你的代码对我来说工作得很好:
convert_data(data)
> [[44.0, 67.0, 90.0], [1.2, 0.7, 0.2]]
在更复杂的情况下,它肯定会开始失败或给出错误的结果,f.e。如果嵌套列表中的值超过 1 个 "NA"
,则会出现 ValueError 异常(您将尝试将字符串转换为浮点数)。
您的代码最终变得有点过于复杂的一个原因是您试图从解决 "nested list." 的问题开始,但实际上,您所需要的只是一个处理数字字符串列表的函数,其中包含一些"NA" 个值,然后您可以将该函数应用于列表中的每个项目。
def float_or_average(list_of_num_strings):
# First, convert every item that you can to a number. You need to do this
# before you can handle even ONE "NA" value, because the "NA" values need
# to be replaced with the average of all the numbers in the collection.
# So for now, convert ["1.2", "NA", "2.0"] to [1.2, "NA", 2.0]
parsed = []
# While we're at it, let's record the sum of the floats and their count,
# so that we can compute that average.
numeric_sum = 0.0
numeric_count = 0
for item in list_of_num_strings:
if item == "NA":
parsed.append(item)
else:
floating_point_value = float(item)
parsed.append(floating_point_value)
numeric_sum += floating_point_value
numeric_count += 1
# Now we can calculate the average:
average = numeric_sum / numeric_count
# And replace the "NA" values with them.
for i, item in enumerate(parsed):
if item == "NA":
parsed[i] == average
return parsed
# Or, with a list comprehension (replacing the previous four lines of
# code):
return [number if number != "NA" else average for number in parsed]
# Using this function on a nested list is as easy as
example_data = [["1.2", "3.1", "0.2"], ["44.0", "NA", "90.0"]]
parsed_nested_list = []
for sublist in example_data:
parsed_nested_list.append(float_or_average(sublist))
# Or, using a list comprehension (replacing the previous three lines of code):
parsed_nested_list = [float_or_average(sublist) for sublist in example_data]
def convert_data(data):
for lst in data:
sum = 0
index_na = list()
for elem in range(len(lst)):
if lst[elem] != 'NA':
sum += float(lst[elem])
lst[elem] = float(lst[elem])
else:
index_na.append(elem)
if len(index_na) > 0:
len_values = sum / (len(lst)-len(index_na))
for i in index_na:
lst[i] = float("{0:.2f}".format(len_values))
return data
这应该可以解决问题,使用 numpy
:
import numpy as np
x=[["1.2","3.1","0.2"],["44.0","NA","90.0"]]
#convert to float
x=np.char.replace(np.array(x), "NA", "nan").astype(np.float)
#replace nan-s with mean
mask=x.astype(str)=="nan"
x[mask]=np.nanmean(x, axis=1)[mask.any(axis=1)]
输出:
[[ 1.2 3.1 0.2]
[44. 67. 90. ]]
data_var = [["1.2", "3.1", "0.2"], ["44.0", "NA", "90.0"]]
def replace_na_with_mean(list_entry):
for i in range(len(list_entry)):
index_list = []
m = 0
while 'NA' in list_entry[i]:
index_list.append(list_entry[i].index('NA') + m)
del list_entry[i][list_entry[i].index('NA')]
if list_entry[i]:
for n in range(len(list_entry[i])):
list_entry[i][n] = float(list_entry[i][n])
if index_list:
if list_entry[i]:
avg = sum(list_entry[i]) / len(list_entry[i])
else:
avg = 0
for l in index_list:
list_entry[i].insert(l, avg)
return list_entry
print(replace_na_with_mean(data_var))