为什么我在尝试 Apache Spark 的 Streaming KMeans 聚类官方示例时,model.predictOnValues 会出现类型错误?

Why do I get a type error in model.predictOnValues when I try the official example of Streaming Kmeans Clustering of Apache Spark?

我正在尝试官方指南末尾的流式聚类示例代码,但出现了类型错误。这是我的代码:

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.clustering.StreamingKMeans

/** Streaming k-means example: trains a StreamingKMeans model on one text stream
  * and prints predictions for a second stream.
  *
  * NOTE: as written, this object does not compile; see the comment on the
  * `predictOnValues` call for the type mismatch.
  */
object Kmeans {
  /** Entry point: builds the streaming context, wires up the model, and runs
    * until the streaming context terminates.
    *
    * @param args command-line arguments (not used by this example)
    */
  def main(args: Array[String]) {
    // Local master with 2 threads; Seconds(3) is passed as the streaming batch duration.
    val conf = new SparkConf().setMaster("local[2]").setAppName("kmeans")
    val ssc = new StreamingContext(conf, Seconds(3))

    // Training input: each text line read from "training" is parsed with Vectors.parse.
    val trainingData = ssc.textFileStream("training").map(Vectors.parse)
    // Test input: each text line read from "test" is parsed with LabeledPoint.parse,
    // so testData is a DStream[LabeledPoint].
    val testData = ssc.textFileStream("test").map(LabeledPoint.parse)

    val numDimensions = 3
    val numClusters = 2
    // Model with numClusters clusters and random initial centers of numDimensions dimensions.
    val model = new StreamingKMeans()
      .setK(numClusters)
      .setDecayFactor(1.0)
      .setRandomCenters(numDimensions, 0.0)

    model.trainOn(trainingData)
    // Compile error here: predictOnValues requires a DStream[(K, Vector)] of
    // key/vector pairs, but testData is a DStream[LabeledPoint]. Each point has to
    // be mapped to a (key, features) tuple before it can be passed in.
    model.predictOnValues(testData).print()

    // Start the streaming computation and block until it is terminated.
    ssc.start()
    ssc.awaitTermination()
  }
}

但是当我运行

sbt package

我收到以下错误:

[error]  found   : org.apache.spark.streaming.dstream.DStream[org.apache.spark.mllib.regression.LabeledPoint]
[error]  required: org.apache.spark.streaming.dstream.DStream[(?, org.apache.spark.mllib.linalg.Vector)]
[error]     model.predictOnValues(testData).print()
[error]                           ^
[error] one error found
[error] (compile:compile) Compilation failed

您需要将 testData: DStream[LabeledPoint] 映射到 DStream[(K, Vector)]:

// Turn each LabeledPoint into a (label, features) pair so the stream has the
// DStream[(K, Vector)] shape that predictOnValues requires.
model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

您可以在此处找到完整的示例:StreamingKMeansExample.scala