在 spark 中使用 Complex JSON 创建 spark Dataframe
Create spark Dataframe with Complex JSON in spark
我将以下 json 作为数据框加载:
root
|-- transactions_analytics_data: struct (nullable = true)
| |-- cumulative_fraud_transactions: long (nullable = true)
| |-- cumulative_fraud_transactions_percent: double (nullable = true)
| |-- cumulative_transactions: long (nullable = true)
| |-- cumulative_transactions_percent: double (nullable = true)
| |-- false_predictions: struct (nullable = true)
| | |-- no_of_false_negatives: long (nullable = true)
| | |-- no_of_false_positives: long (nullable = true)
| |-- false_predictions_amount: struct (nullable = true)
| | |-- predicted_false_negative: double (nullable = true)
| | |-- predicted_false_positive: double (nullable = true)
| |-- fraud_transactions_barline: struct (nullable = true)
| | |-- data: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- falseNegatives: long (nullable = true)
| | | | |-- time: string (nullable = true)
| | | | |-- totalFrauds: long (nullable = true)
| |-- fraud_transactions_map: struct (nullable = true)
| | |-- California: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Connecticut: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Delaware: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Florida: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Georgia: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Hawaii: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Idaho: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Illinois: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Indiana: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Iowa: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Kansas: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Kentucky: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Louisiana: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Maine: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Maryland: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Massachusetts: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Michigan: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Minnesota: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Mississippi: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Missouri: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Montana: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Nebraska: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Nevada: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- New Hampshire: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- New Jersey: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- New Mexico: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- New York: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- North Carolina: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- North Dakota: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Ohio: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Oklahoma: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Oregon: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Pennsylvania: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Rhode Island: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- South Carolina: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- South Dakota: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Tennessee: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Texas: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Utah: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Vermont: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Virginia: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Washington: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Wisconsin: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| |-- top10_affected_merchants: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- count: long (nullable = true)
| | | |-- date_affected: string (nullable = true)
| | | |-- merchange_category: string (nullable = true)
我想得到一个把每个属性都作为列的 DataFrame。我对 top10_affected_merchants 尝试了下面的做法,它可以正常工作,因为该字段是数组;但这种做法不适用于结构(struct)字段,请帮帮我。
// Loads the sample JSON file and flattens the `top10_affected_merchants` array
// into one row per merchant with one column per attribute.
import org.apache.spark.sql.functions.explode // explicit, so the script also works outside spark-shell

val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)
import sqlContext.implicits._

// `read.json` replaces `jsonFile`, which is deprecated since Spark 1.4.
val jsonDF = sqlContext.read.json("file:///root/sample.json")
jsonDF.printSchema()

// Top 10 affected merchants data set.
// explode() emits one row per array element; the alias gives the generated
// struct column a stable name instead of relying on the default name `col`.
val top10_affected_merchantsDF = jsonDF.select(
  explode(jsonDF("transactions_analytics_data.top10_affected_merchants")).alias("merchant"))
top10_affected_merchantsDF.printSchema()
top10_affected_merchantsDF.registerTempTable("top10affectedmerchants")

// Pull each field of the exploded struct out into its own top-level column.
val topaffectedmerchantsDF = sqlContext.sql(
  "SELECT merchant.count, merchant.date_affected, merchant.merchange_category FROM top10affectedmerchants")
topaffectedmerchantsDF.printSchema()
topaffectedmerchantsDF.show()
这应该可以解决您的问题。
// Flattens the nested JSON step by step: struct fields are reachable with a
// plain dot path, while array fields have to be exploded into one row per element.
import org.apache.spark.sql.functions.explode
import sqlContext.implicits._ // provides the $"..." column syntax

// `read.json` replaces `jsonFile`, which is deprecated since Spark 1.4.
val data = sqlContext.read.json("path/to/file")

// Step 1: pick the nested fields of interest and give the ambiguous ones
// unique names (many nested structs share field names such as falseNegatives).
val df = data.select(
  $"transactions_analytics_data.fraud_transactions_barline.data".alias("trsns_anlytcs_data_barline_data"),
  $"transactions_analytics_data.fraud_transactions_map.California.falseNegatives".alias("cali_falseNegatives"),
  $"transactions_analytics_data.false_predictions.no_of_false_negatives",
  $"transactions_analytics_data.false_predictions.no_of_false_positives",
  $"transactions_analytics_data.top10_affected_merchants".alias("top10"),
  $"transactions_analytics_data.cumulative_fraud_transactions")

// Step 2: explode the `top10` array (one row per merchant), then lift the
// struct fields of each element into top-level columns.
val df1 = df
  .select(
    $"trsns_anlytcs_data_barline_data",
    $"cali_falseNegatives",
    $"no_of_false_negatives", $"no_of_false_positives",
    $"cumulative_fraud_transactions",
    explode($"top10").alias("top10"))
  .select(
    $"trsns_anlytcs_data_barline_data",
    $"top10.count".alias("top10_count"),
    $"top10.date_affected".alias("top10_date_affected"),
    $"cali_falseNegatives", $"no_of_false_negatives",
    $"no_of_false_positives", $"cumulative_fraud_transactions")

// Step 3: explode the barline array as well. The array column is consumed by
// explode(), so only `barline_data` and the scalar columns of df1 are selected;
// referencing any other column name fails analysis because df1 does not have it.
// Note that exploding a second array multiplies the rows again
// (every merchant row is repeated for every barline element).
val df2 = df1
  .select(
    explode($"trsns_anlytcs_data_barline_data").alias("barline_data"),
    $"top10_count", $"top10_date_affected",
    $"cali_falseNegatives", $"no_of_false_negatives",
    $"no_of_false_positives", $"cumulative_fraud_transactions")
  .select(
    $"barline_data.falseNegatives".alias("barline_falseNegatives"),
    $"barline_data.totalFrauds".alias("barline_totalFrauds"),
    $"top10_count", $"top10_date_affected",
    $"cali_falseNegatives", $"no_of_false_negatives",
    $"no_of_false_positives", $"cumulative_fraud_transactions")
因此,如果您的嵌套字段只是结构(struct)而不包含数组,就不需要(也无法)用 explode 展开它们;直接用点号路径选取其中的字段即可。
root
|-- col: struct (nullable = true)
| |-- col1: long (nullable = true)
| |-- col2: struct (nullable = true)
| | |-- col_1_1: long
| |-- col3: struct (nullable = true)
| | |-- col3_1: array (nullable = true)
| | | |-- col3_2: struct (containsNull = true)
| | | | |-- col3_3: long (nullable = true)
| | | | |-- col3_4: string (nullable = true)
根据上面的数据,要读取 col_1_1(嵌套结构字段),您只需按以下格式编写即可
"col.col2.col_1_1"
如果要读取 col3_3 或 col3_4,您的 select 语句将类似于这样
// col3_1 is an array of structs, so a dot path alone cannot reach its fields:
// select the array, explode it into one row per element, then read the
// struct fields of each element.
val DF = data.select($"col.col3.col3_1") // resulting column is named col3_1
  .select(explode($"col3_1").alias("col3_1")) // one row per array element
  .select($"col3_1.col3_3", $"col3_1.col3_4") // fields of the exploded struct
请注意,由于您在嵌套属性中有很多相似的命名字段,请务必正确重命名您的属性。
我将以下 json 作为数据框加载:
root
|-- transactions_analytics_data: struct (nullable = true)
| |-- cumulative_fraud_transactions: long (nullable = true)
| |-- cumulative_fraud_transactions_percent: double (nullable = true)
| |-- cumulative_transactions: long (nullable = true)
| |-- cumulative_transactions_percent: double (nullable = true)
| |-- false_predictions: struct (nullable = true)
| | |-- no_of_false_negatives: long (nullable = true)
| | |-- no_of_false_positives: long (nullable = true)
| |-- false_predictions_amount: struct (nullable = true)
| | |-- predicted_false_negative: double (nullable = true)
| | |-- predicted_false_positive: double (nullable = true)
| |-- fraud_transactions_barline: struct (nullable = true)
| | |-- data: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- falseNegatives: long (nullable = true)
| | | | |-- time: string (nullable = true)
| | | | |-- totalFrauds: long (nullable = true)
| |-- fraud_transactions_map: struct (nullable = true)
| | |-- California: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Connecticut: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Delaware: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Florida: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Georgia: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Hawaii: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Idaho: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Illinois: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Indiana: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Iowa: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Kansas: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Kentucky: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Louisiana: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Maine: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Maryland: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Massachusetts: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Michigan: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Minnesota: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Mississippi: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Missouri: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Montana: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Nebraska: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Nevada: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- New Hampshire: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- New Jersey: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- New Mexico: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- New York: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- North Carolina: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- North Dakota: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Ohio: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Oklahoma: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Oregon: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Pennsylvania: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Rhode Island: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- South Carolina: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- South Dakota: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Tennessee: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Texas: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Utah: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Vermont: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Virginia: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Washington: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| | |-- Wisconsin: struct (nullable = true)
| | | |-- falseNegatives: long (nullable = true)
| | | |-- falsePositives: long (nullable = true)
| | | |-- totalFrauds: long (nullable = true)
| |-- top10_affected_merchants: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- count: long (nullable = true)
| | | |-- date_affected: string (nullable = true)
| | | |-- merchange_category: string (nullable = true)
我想得到一个把每个属性都作为列的 DataFrame。我对 top10_affected_merchants 尝试了下面的做法,它可以正常工作,因为该字段是数组;但这种做法不适用于结构(struct)字段,请帮帮我。
// Loads the sample JSON file and flattens the `top10_affected_merchants` array
// into one row per merchant with one column per attribute.
import org.apache.spark.sql.functions.explode // explicit, so the script also works outside spark-shell

val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)
import sqlContext.implicits._

// `read.json` replaces `jsonFile`, which is deprecated since Spark 1.4.
val jsonDF = sqlContext.read.json("file:///root/sample.json")
jsonDF.printSchema()

// Top 10 affected merchants data set.
// explode() emits one row per array element; the alias gives the generated
// struct column a stable name instead of relying on the default name `col`.
val top10_affected_merchantsDF = jsonDF.select(
  explode(jsonDF("transactions_analytics_data.top10_affected_merchants")).alias("merchant"))
top10_affected_merchantsDF.printSchema()
top10_affected_merchantsDF.registerTempTable("top10affectedmerchants")

// Pull each field of the exploded struct out into its own top-level column.
val topaffectedmerchantsDF = sqlContext.sql(
  "SELECT merchant.count, merchant.date_affected, merchant.merchange_category FROM top10affectedmerchants")
topaffectedmerchantsDF.printSchema()
topaffectedmerchantsDF.show()
这应该可以解决您的问题。
// Flattens the nested JSON step by step: struct fields are reachable with a
// plain dot path, while array fields have to be exploded into one row per element.
import org.apache.spark.sql.functions.explode
import sqlContext.implicits._ // provides the $"..." column syntax

// `read.json` replaces `jsonFile`, which is deprecated since Spark 1.4.
val data = sqlContext.read.json("path/to/file")

// Step 1: pick the nested fields of interest and give the ambiguous ones
// unique names (many nested structs share field names such as falseNegatives).
val df = data.select(
  $"transactions_analytics_data.fraud_transactions_barline.data".alias("trsns_anlytcs_data_barline_data"),
  $"transactions_analytics_data.fraud_transactions_map.California.falseNegatives".alias("cali_falseNegatives"),
  $"transactions_analytics_data.false_predictions.no_of_false_negatives",
  $"transactions_analytics_data.false_predictions.no_of_false_positives",
  $"transactions_analytics_data.top10_affected_merchants".alias("top10"),
  $"transactions_analytics_data.cumulative_fraud_transactions")

// Step 2: explode the `top10` array (one row per merchant), then lift the
// struct fields of each element into top-level columns.
val df1 = df
  .select(
    $"trsns_anlytcs_data_barline_data",
    $"cali_falseNegatives",
    $"no_of_false_negatives", $"no_of_false_positives",
    $"cumulative_fraud_transactions",
    explode($"top10").alias("top10"))
  .select(
    $"trsns_anlytcs_data_barline_data",
    $"top10.count".alias("top10_count"),
    $"top10.date_affected".alias("top10_date_affected"),
    $"cali_falseNegatives", $"no_of_false_negatives",
    $"no_of_false_positives", $"cumulative_fraud_transactions")

// Step 3: explode the barline array as well. The array column is consumed by
// explode(), so only `barline_data` and the scalar columns of df1 are selected;
// referencing any other column name fails analysis because df1 does not have it.
// Note that exploding a second array multiplies the rows again
// (every merchant row is repeated for every barline element).
val df2 = df1
  .select(
    explode($"trsns_anlytcs_data_barline_data").alias("barline_data"),
    $"top10_count", $"top10_date_affected",
    $"cali_falseNegatives", $"no_of_false_negatives",
    $"no_of_false_positives", $"cumulative_fraud_transactions")
  .select(
    $"barline_data.falseNegatives".alias("barline_falseNegatives"),
    $"barline_data.totalFrauds".alias("barline_totalFrauds"),
    $"top10_count", $"top10_date_affected",
    $"cali_falseNegatives", $"no_of_false_negatives",
    $"no_of_false_positives", $"cumulative_fraud_transactions")
因此,如果您的嵌套字段只是结构(struct)而不包含数组,就不需要(也无法)用 explode 展开它们;直接用点号路径选取其中的字段即可。
root
|-- col: struct (nullable = true)
| |-- col1: long (nullable = true)
| |-- col2: struct (nullable = true)
| | |-- col_1_1: long
| |-- col3: struct (nullable = true)
| | |-- col3_1: array (nullable = true)
| | | |-- col3_2: struct (containsNull = true)
| | | | |-- col3_3: long (nullable = true)
| | | | |-- col3_4: string (nullable = true)
根据上面的数据,要读取 col_1_1(嵌套结构字段),您只需按以下格式编写即可
"col.col2.col_1_1"
如果要读取 col3_3 或 col3_4,您的 select 语句将类似于这样
// col3_1 is an array of structs, so a dot path alone cannot reach its fields:
// select the array, explode it into one row per element, then read the
// struct fields of each element.
val DF = data.select($"col.col3.col3_1") // resulting column is named col3_1
  .select(explode($"col3_1").alias("col3_1")) // one row per array element
  .select($"col3_1.col3_3", $"col3_1.col3_4") // fields of the exploded struct
请注意,由于您在嵌套属性中有很多相似的命名字段,请务必正确重命名您的属性。