Spark Couldn't Find Window Function
Using a previously provided solution, I tried to recreate the same query, but with programmatic (SQL) syntax instead of the DataFrame API, as follows:
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

object HiveContextTest {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("HiveContextTest")
    val sc = new SparkContext(conf)
    val sqlContext = new HiveContext(sc)
    import sqlContext.implicits._

    val df = sc.parallelize(
      ("foo", 1) :: ("foo", 2) :: ("bar", 1) :: ("bar", 2) :: Nil
    ).toDF("k", "v")

    // using the DataFrame API works fine
    val w = Window.partitionBy($"k").orderBy($"v")
    df.select($"k", $"v", rowNumber().over(w).alias("rn")).show

    // using programmatic (SQL) syntax doesn't work
    df.registerTempTable("df")
    val w2 = sqlContext.sql(
      "select k, v, rowNumber() over (partition by k order by v) as rn from df"
    )
    w2.show()
  }
}
The first call, df.select($"k", $"v", rowNumber().over(w).alias("rn")).show, works fine, but w2.show() fails with:
Exception in thread "main" org.apache.spark.sql.AnalysisException: Couldn't find window function rowNumber;
Does anyone know how to make this work with the programmatic syntax? Thanks in advance.
The SQL equivalent of rowNumber is row_number:
SELECT k, v, row_number() OVER (PARTITION BY k ORDER BY v) AS rn FROM df
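For reference, a minimal sketch of the corrected call, reusing the sqlContext and the df temp table registered in the question's code:

// row_number (with an underscore) is the name SQL/Hive knows;
// rowNumber() exists only in the Spark 1.x DataFrame API
val w2 = sqlContext.sql(
  "SELECT k, v, row_number() OVER (PARTITION BY k ORDER BY v) AS rn FROM df"
)
w2.show()

The two names are not interchangeable across the two syntaxes: in Spark 1.x, SQL queries resolve window functions through Hive, which registers row_number, not rowNumber. (Since Spark 1.6, org.apache.spark.sql.functions also provides row_number() and deprecates rowNumber(), so the SQL-style name works on both sides.)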