为什么我在使用 spark 的 mllib 的 MulticlassClassificationEvaluator 时得到相同的准确率和召回率值?
Why am I getting the same value for the accuracy and recall when using spark's mllib's MulticlassClassificationEvaluator?
我正在尝试使用 Spark MLlib 中的一些基于树的算法。我的代码如下:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier, DecisionTreeClassifier)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Build the Spark configuration and start (or reuse) the session.
conf = SparkConf()
conf.set('spark.logConf', 'true').set("spark.ui.port", "4060")
spark = SparkSession.builder.config(conf=conf).appName("Gradient Boosted Tree").getOrCreate()
# Load the input dataset from Parquet. The path literal was missing its
# closing quote, which made the whole script a syntax error.
data = spark.read.parquet('/mydata/location')
def yt_func(x):
    """Map a raw count to a binary class label.

    Returns 0 when ``x`` is less than or equal to 10, and 1 otherwise.
    """
    return 0 if x <= 10 else 1
# Wrap yt_func as a Spark UDF and derive the binary label column 'yt_1'
# from the raw 'count' column.
yt_udf = udf(yt_func, IntegerType())
data = data.withColumn('yt_1', yt_udf(data['count']))
# Keep the thirteen feature columns AND the label. The label was previously
# dropped here, so the later select of 'yt_1' could not be resolved.
datasub = data.select(
    'feature1', 'feature2',
    'feature3', 'feature4',
    'feature5', 'feature6',
    'feature7', 'feature8',
    'feature9', 'feature10',
    'feature11', 'feature12',
    'feature13',
    'yt_1',
)
# Replace nulls in numeric columns with 0.
datasub = datasub.na.fill(0)
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# Names of the input columns that are packed into the single 'features' vector.
feature_names = [
    'feature1',
    'feature2',
    'feature3',
    'feature4',
    'feature5',
    'feature6',
    'feature7',
    'feature8',
    'feature9',
    'feature10',
    'feature11',
    'feature12',
    'feature13',
]
assembler = VectorAssembler(outputCol='features', inputCols=feature_names)
output = assembler.transform(datasub)
# NOTE: 'yt_1' must be a column of datasub for this select to resolve.
finaldata = output.select('features', 'yt_1')
# Random 70/30 split into training and test sets (no fixed seed).
splits = finaldata.randomSplit([0.7, 0.3])
train_data, test_data = splits
finaldata.show(20)
# Column wiring shared by all three tree-based estimators.
_column_args = dict(featuresCol='features', labelCol='yt_1')
dtc = DecisionTreeClassifier(**_column_args)
rfc = RandomForestClassifier(numTrees=70, **_column_args)
gbt = GBTClassifier(**_column_args)

# Fit every estimator on the training split, in the order dtc, rfc, gbt.
dtc_model, rfc_model, gbt_model = (
    dtc.fit(train_data),
    rfc.fit(train_data),
    gbt.fit(train_data),
)

# Score the held-out split with each fitted model.
dtc_preds, rfc_preds, gbt_preds = (
    dtc_model.transform(test_data),
    rfc_model.transform(test_data),
    gbt_model.transform(test_data),
)

# Display a sample of each model's predictions.
for _predictions in (dtc_preds, rfc_preds, gbt_preds):
    _predictions.show()
# Evaluators for the test-set predictions, both scored against label 'yt_1'.
accuracy_eval = MulticlassClassificationEvaluator(metricName='accuracy', labelCol='yt_1')
# NOTE: 'weightedRecall' is the per-class recall averaged with class-frequency
# weights: sum_c (n_c / N) * (TP_c / n_c) = sum_c TP_c / N, which is exactly
# the accuracy. The two metrics therefore always print the same value; this
# is expected, not a bug. To inspect recall of a single class, a per-label
# metric is needed instead (e.g. metricName='recallByLabel' with metricLabel
# in newer Spark releases -- TODO confirm availability for the Spark version
# in use).
recall_eval = MulticlassClassificationEvaluator(metricName='weightedRecall', labelCol='yt_1')

# Single-argument print(...) with %-formatting produces the same text on
# Python 2 and Python 3. The recall label now carries the same trailing colon
# as the accuracy label.
for _model_name, _predictions in (('dtc', dtc_preds), ('rfc', rfc_preds), ('gbt', gbt_preds)):
    print('%s accuracy: %s' % (_model_name, accuracy_eval.evaluate(_predictions)))
    print('%s recall: %s' % (_model_name, recall_eval.evaluate(_predictions)))
当我运行这段代码时,得到以下输出:
dtc accuracy: 0.98596755767033761
dtc recall: 0.98596755767033761
rfc accuracy: 0.98551077243825225
rfc recall: 0.98551077243825225
gbt accuracy: 0.98624595624862965
gbt recall: 0.98624595624862965
让我感到困惑的是为什么我得到相同的准确率和召回率值……它们完全相同。这肯定是不正确的....??
有什么想法吗?
我在 Data Science Stack Exchange 上也发布了同样的问题,可以在那里找到这个问题的答案。
我正在尝试使用 Spark MLlib 中的一些基于树的算法。我的代码如下:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier, DecisionTreeClassifier)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Build the Spark configuration and start (or reuse) the session.
conf = SparkConf()
conf.set('spark.logConf', 'true').set("spark.ui.port", "4060")
spark = SparkSession.builder.config(conf=conf).appName("Gradient Boosted Tree").getOrCreate()
# Load the input dataset from Parquet. The path literal was missing its
# closing quote, which made the whole script a syntax error.
data = spark.read.parquet('/mydata/location')
def yt_func(x):
    """Map a raw count to a binary class label.

    Returns 0 when ``x`` is less than or equal to 10, and 1 otherwise.
    """
    return 0 if x <= 10 else 1
# Wrap yt_func as a Spark UDF and derive the binary label column 'yt_1'
# from the raw 'count' column.
yt_udf = udf(yt_func, IntegerType())
data = data.withColumn('yt_1', yt_udf(data['count']))
# Keep the thirteen feature columns AND the label. The label was previously
# dropped here, so the later select of 'yt_1' could not be resolved.
datasub = data.select(
    'feature1', 'feature2',
    'feature3', 'feature4',
    'feature5', 'feature6',
    'feature7', 'feature8',
    'feature9', 'feature10',
    'feature11', 'feature12',
    'feature13',
    'yt_1',
)
# Replace nulls in numeric columns with 0.
datasub = datasub.na.fill(0)
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# Names of the input columns that are packed into the single 'features' vector.
feature_names = [
    'feature1',
    'feature2',
    'feature3',
    'feature4',
    'feature5',
    'feature6',
    'feature7',
    'feature8',
    'feature9',
    'feature10',
    'feature11',
    'feature12',
    'feature13',
]
assembler = VectorAssembler(outputCol='features', inputCols=feature_names)
output = assembler.transform(datasub)
# NOTE: 'yt_1' must be a column of datasub for this select to resolve.
finaldata = output.select('features', 'yt_1')
# Random 70/30 split into training and test sets (no fixed seed).
splits = finaldata.randomSplit([0.7, 0.3])
train_data, test_data = splits
finaldata.show(20)
# Column wiring shared by all three tree-based estimators.
_column_args = dict(featuresCol='features', labelCol='yt_1')
dtc = DecisionTreeClassifier(**_column_args)
rfc = RandomForestClassifier(numTrees=70, **_column_args)
gbt = GBTClassifier(**_column_args)

# Fit every estimator on the training split, in the order dtc, rfc, gbt.
dtc_model, rfc_model, gbt_model = (
    dtc.fit(train_data),
    rfc.fit(train_data),
    gbt.fit(train_data),
)

# Score the held-out split with each fitted model.
dtc_preds, rfc_preds, gbt_preds = (
    dtc_model.transform(test_data),
    rfc_model.transform(test_data),
    gbt_model.transform(test_data),
)

# Display a sample of each model's predictions.
for _predictions in (dtc_preds, rfc_preds, gbt_preds):
    _predictions.show()
# Evaluators for the test-set predictions, both scored against label 'yt_1'.
accuracy_eval = MulticlassClassificationEvaluator(metricName='accuracy', labelCol='yt_1')
# NOTE: 'weightedRecall' is the per-class recall averaged with class-frequency
# weights: sum_c (n_c / N) * (TP_c / n_c) = sum_c TP_c / N, which is exactly
# the accuracy. The two metrics therefore always print the same value; this
# is expected, not a bug. To inspect recall of a single class, a per-label
# metric is needed instead (e.g. metricName='recallByLabel' with metricLabel
# in newer Spark releases -- TODO confirm availability for the Spark version
# in use).
recall_eval = MulticlassClassificationEvaluator(metricName='weightedRecall', labelCol='yt_1')

# Single-argument print(...) with %-formatting produces the same text on
# Python 2 and Python 3. The recall label now carries the same trailing colon
# as the accuracy label.
for _model_name, _predictions in (('dtc', dtc_preds), ('rfc', rfc_preds), ('gbt', gbt_preds)):
    print('%s accuracy: %s' % (_model_name, accuracy_eval.evaluate(_predictions)))
    print('%s recall: %s' % (_model_name, recall_eval.evaluate(_predictions)))
当我运行这段代码时,得到以下输出:
dtc accuracy: 0.98596755767033761
dtc recall: 0.98596755767033761
rfc accuracy: 0.98551077243825225
rfc recall: 0.98551077243825225
gbt accuracy: 0.98624595624862965
gbt recall: 0.98624595624862965
让我感到困惑的是为什么我得到相同的准确率和召回率值……它们完全相同。这肯定是不正确的....??
有什么想法吗?
我在 Data Science Stack Exchange 上也发布了同样的问题,可以在那里找到这个问题的答案。