PySpark: calculate the percentage of 'missing' values in every column
I'm using PySpark and trying to calculate the percentage of records that have missing ('null') values for each column.
The dataframe we'll be working with: df
(and many more columns)
| id | fb     | linkedin | snapchat | ... |
|----|--------|----------|----------|-----|
| 1  | aa     | (null)   | (null)   | ... |
| 2  | (null) | aaa      | (null)   | ... |
| 3  | (null) | (null)   | a        | ... |
| 4  | (null) | (null)   | (null)   | ... |
Using the script below, I'm able to get the 'null' ratio for every column:
from pyspark.sql.functions import col, count, isnan, lit, round, when

df.select([round((count(when(isnan(c) | col(c).isNull(), c)) / count(lit(1))), 6).alias(c) for c in df.columns])
Just wondering how we could get the percentage of 'null' values for each column? (Assume there are lots of columns and we don't want to specify each column name.)
Thanks!
Assuming there are some columns you don't want to consider in the missing-value count (here I assume your id column should never contain missing values), you can use the code below.
import pyspark.sql.functions as F
# select columns in which you want to check for missing values
relevant_columns = [c for c in df.columns if c != 'id']
# number of total records
n_records = df.count()
# percentage of rows with all missings in relevant_columns
my_perc = df \
    .select((F.lit(len(relevant_columns)) - (sum(df[c].isNull().cast('int') for c in relevant_columns))).alias('n')) \
    .filter(F.col('n') == 0) \
    .count() / n_records * 100
print(my_perc)
# 25.0
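As a side note, if what you are actually after is the per-column null percentage (without listing column names), a minimal sketch along the same lines might look like the following; it assumes df is the dataframe from the question and only checks isNull, not NaN:

import pyspark.sql.functions as F

# per-column percentage of null values, computed in a single aggregation:
# the average of a 0/1 null indicator is the null ratio, times 100 gives the percentage
null_perc = df.select([
    (F.avg(F.col(c).isNull().cast('double')) * 100).alias(c)
    for c in df.columns
])
null_perc.show()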
Another approach is to create a custom function, calc_null_percent, that uses the best of both Spark and Pandas: it returns the total_count and null_count for each column.
Data preparation
input_str = """
1,0,null,
1,null,0,
null,1,0,
1,0,0,
1,0,0,
null,0,1,
1,1,0,
1,1,null,
null,1,0
""".split(',')
input_values = list(map(lambda x: x.strip() if x.strip() != 'null' else None, input_str))
cols = list(map(lambda x: x.strip() if x.strip() != 'null' else None, "col1,col2,col3".split(',')))
n = len(input_values)
n_col = 3
input_list = [tuple(input_values[i:i+n_col]) for i in range(0,n,n_col)]
sparkDF = sql.createDataFrame(input_list, cols)
sparkDF.show()
+----+----+----+
|col1|col2|col3|
+----+----+----+
| 1| 0|null|
| 1|null| 0|
|null| 1| 0|
| 1| 0| 0|
| 1| 0| 0|
|null| 0| 1|
| 1| 1| 0|
| 1| 1|null|
|null| 1| 0|
+----+----+----+
Custom function
import pandas as pd
import pyspark.sql.functions as F

def calc_null_percent(spark_df, sort=True):
    # non-null count per column (F.count ignores nulls)
    pd_col_count = spark_df.select(
        [F.count(F.col(c)).alias(c) for (c, c_type) in spark_df.dtypes]
    ).toPandas().T.reset_index().rename(columns={0: 'total_count', 'index': 'column'})

    # null / NaN count per column
    pd_col_null_count = spark_df.select(
        [F.sum(F.when(F.isnan(c) | F.isnull(c), 1).otherwise(0)).alias(c)
         for (c, c_type) in spark_df.dtypes]
    ).toPandas().T.reset_index().rename(columns={0: 'null_count', 'index': 'column'})

    final_df = pd.merge(pd_col_count, pd_col_null_count, on=['column'])
    final_df['null_percentage'] = final_df['null_count'] * 100 / final_df['total_count']

    if len(final_df) == 0:
        print("There are no missing values!")
        return None

    return final_df
nullStatsDF = sql.createDataFrame(calc_null_percent(sparkDF))
nullStatsDF.show()
+------+-----------+----------+------------------+
|column|total_count|null_count| null_percentage|
+------+-----------+----------+------------------+
| col1| 6| 3| 50.0|
| col2| 8| 1| 12.5|
| col3| 7| 2|28.571428571428573|
+------+-----------+----------+------------------+
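Since calc_null_percent returns a plain pandas DataFrame, you can post-process the stats before turning them back into a Spark dataframe. The sort flag in the signature above is never used in the body, so a minimal sketch of sorting after the fact (variable names here are just illustrative) could be:

# hypothetical follow-up: order columns by how many nulls they contain
stats_pd = calc_null_percent(sparkDF)
stats_pd = stats_pd.sort_values('null_percentage', ascending=False)
nullStatsSortedDF = sql.createDataFrame(stats_pd)
nullStatsSortedDF.show()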