2022-09-02
Binary Classification Model Evaluation Metrics in Scala
This post computes the evaluation metrics of a binary classification model in Scala, covering:
precision, recall, F1 score, accuracy, AUC, KS
The metric calculations are then wrapped into a single function for convenient reuse; the only input is a DataFrame of predictions.
Constructing the Data
Construct a simple dataset and produce a prediction DataFrame containing the predicted probability, the predicted label, and the true label.
import org.apache.spark.ml.{Model, Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

val builder = SparkSession
  .builder()
  .appName("LR")
  .config("spark.executor.heartbeatInterval", "60s")
  .config("spark.network.timeout", "120s")
  .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .config("spark.kryoserializer.buffer.max", "512m")
  .config("spark.dynamicAllocation.enabled", false)
  .config("spark.sql.inMemoryColumnarStorage.compressed", true)
  .config("spark.sql.inMemoryColumnarStorage.batchSize", 10000)
  .config("spark.sql.broadcastTimeout", 600)
  .config("spark.sql.autoBroadcastJoinThreshold", -1)
  .config("spark.sql.crossJoin.enabled", true)
  .master("local[*]")

val spark = builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
import spark.implicits._
import org.apache.spark.ml.{Model, Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
builder: org.apache.spark.sql.SparkSession.Builder = org.apache.spark.sql.SparkSession$Builder@20fd3d0a
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@58767892
import spark.implicits._
var dfTrain = Seq(
  (1, 5.1, 3.5, 1.4, 0.2, 0),
  (2, 4.9, 3.0, 1.4, 0.2, 1),
  (3, 4.7, 3.2, 1.3, 0.2, 0),
  (4, 4.6, 3.1, 1.5, 0.2, 1),
  (5, 5.0, 3.6, 1.4, 0.2, 0),
  (56, 5.7, 2.8, 4.5, 1.3, 1),
  (57, 5.3, 3.3, 4.7, 1.6, 0),
  (58, 4.9, 2.4, 3.3, 1.0, 1),
  (59, 6.6, 3.9, 4.6, 1.3, 1),
  (60, 5.2, 2.7, 3.9, 1.4, 0)
).toDF("id", "x1", "x2", "x3", "x4", "label")

// For demonstration purposes, simply copy the training set as the test set
var dfTest = dfTrain
dfTrain.show()
+---+---+---+---+---+-----+
| id| x1| x2| x3| x4|label|
+---+---+---+---+---+-----+
|  1|5.1|3.5|1.4|0.2|    0|
|  2|4.9|3.0|1.4|0.2|    1|
|  3|4.7|3.2|1.3|0.2|    0|
|  4|4.6|3.1|1.5|0.2|    1|
|  5|5.0|3.6|1.4|0.2|    0|
| 56|5.7|2.8|4.5|1.3|    1|
| 57|5.3|3.3|4.7|1.6|    0|
| 58|4.9|2.4|3.3|1.0|    1|
| 59|6.6|3.9|4.6|1.3|    1|
| 60|5.2|2.7|3.9|1.4|    0|
+---+---+---+---+---+-----+

dfTrain: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 4 more fields]
dfTest: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 4 more fields]
// Feature assembly
val assemble = new VectorAssembler()
  .setInputCols(Array("x1", "x2", "x3", "x4"))
  .setOutputCol("features")

// Model
val lr = new LogisticRegression()
  .setMaxIter(10)
  .setRegParam(0.01)
  .setLabelCol("label")          // true-label column in the training set; defaults to "label"
  .setFeaturesCol("features")    // feature column in the training set; defaults to "features"
  .setPredictionCol("preLabel")  // predicted-label column in the output; defaults to "prediction"
  .setProbabilityCol("prob")     // probability column in the output; defaults to "probability"

// Pipeline
val pipeline = new Pipeline().setStages(Array(assemble, lr))
val Model = pipeline.fit(dfTrain)
assemble: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_2bfbc5f2ff24
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_6fd6be208198
pipeline: org.apache.spark.ml.Pipeline = pipeline_5b2f95daec89
Model: org.apache.spark.ml.PipelineModel = pipeline_5b2f95daec89
val preResult = Model.transform(dfTest)
preResult.show()
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+
| id| x1| x2| x3| x4|label|         features|       rawPrediction|                prob|preLabel|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+
|  1|5.1|3.5|1.4|0.2|    0|[5.1,3.5,1.4,0.2]|[0.51973594549227...|[0.62708601946512...|     0.0|
|  2|4.9|3.0|1.4|0.2|    1|[4.9,3.0,1.4,0.2]|[-1.0899529556434...|[0.25162713725554...|     1.0|
|  3|4.7|3.2|1.3|0.2|    0|[4.7,3.2,1.3,0.2]|[0.41832944562126...|[0.60308343184906...|     0.0|
|  4|4.6|3.1|1.5|0.2|    1|[4.6,3.1,1.5,0.2]|[0.24687940631850...|[0.56140826798745...|     0.0|
|  5|5.0|3.6|1.4|0.2|    0|[5.0,3.6,1.4,0.2]|[1.26603211145541...|[0.78006275423495...|     0.0|
| 56|5.7|2.8|4.5|1.3|    1|[5.7,2.8,4.5,1.3]|[-2.0093807897371...|[0.11822151224039...|     1.0|
| 57|5.3|3.3|4.7|1.6|    0|[5.3,3.3,4.7,1.6]|[2.17258007146063...|[0.89776002662622...|     0.0|
| 58|4.9|2.4|3.3|1.0|    1|[4.9,2.4,3.3,1.0]|[-1.9539003322336...|[0.12412868907566...|     1.0|
| 59|6.6|3.9|4.6|1.3|    1|[6.6,3.9,4.6,1.3]|[0.12192431314750...|[0.53044337453190...|     0.0|
| 60|5.2|2.7|3.9|1.4|    0|[5.2,2.7,3.9,1.4]|[-0.5811999760827...|[0.35865652594949...|     1.0|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+

preResult: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 8 more fields]
As shown, the prediction result is a DataFrame containing the predicted probability and predicted label (the probability and prediction columns, renamed here to prob and preLabel), as well as the true label.
Confusion Matrix
import spark.implicits._
val preLabel = "preLabel"
val preProb = "prob"
val trueLabel = "label"
val PredictDf = preResult
preResult.filter(s"$preLabel == 1").show()
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+
| id| x1| x2| x3| x4|label|         features|       rawPrediction|                prob|preLabel|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+
|  2|4.9|3.0|1.4|0.2|    1|[4.9,3.0,1.4,0.2]|[-1.0899529556434...|[0.25162713725554...|     1.0|
| 56|5.7|2.8|4.5|1.3|    1|[5.7,2.8,4.5,1.3]|[-2.0093807897371...|[0.11822151224039...|     1.0|
| 58|4.9|2.4|3.3|1.0|    1|[4.9,2.4,3.3,1.0]|[-1.9539003322336...|[0.12412868907566...|     1.0|
| 60|5.2|2.7|3.9|1.4|    0|[5.2,2.7,3.9,1.4]|[-0.5811999760827...|[0.35865652594949...|     1.0|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+--------+

import spark.implicits._
preLabel: String = preLabel
preProb: String = prob
trueLabel: String = label
PredictDf: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 8 more fields]
// import spark.implicits._
val preLabel = "preLabel"
val preProb = "prob"
val trueLabel = "label"
val PredictDf = preResult

// --- Count TP, FP, FN, TN
// actual positive, predicted positive
val TP = PredictDf.filter(s"$preLabel == 1 and $trueLabel == 1").count().toDouble
// actual negative, predicted positive
val FP = PredictDf.filter(s"$preLabel == 1 and $trueLabel == 0").count().toDouble
// actual negative, predicted negative
val TN = PredictDf.filter(s"$preLabel == 0 and $trueLabel == 0").count().toDouble
// actual positive, predicted negative
val FN = PredictDf.filter(s"$preLabel == 0 and $trueLabel == 1").count().toDouble

// Hand-assembled confusion matrix (kept in plain ASCII; Chinese labels render poorly in this Jupyter setup)
println("\t " + "Pre Neg " + "Pre Pos " + "\n"
  + "True Neg " + TN + " " + FP + "\n"
  + "True Pos " + FN + " " + TP + "\n")

// Or simply groupBy + pivot
PredictDf.groupBy(s"$trueLabel")
  .pivot(s"$preLabel", (0 to 1)).count().na.fill(0.0)
  .orderBy(asc(s"$trueLabel"))
  .withColumnRenamed(s"$trueLabel", "True-Predict")
  .show(truncate = true)
	 Pre Neg Pre Pos 
True Neg 4.0 1.0
True Pos 2.0 3.0

+------------+---+---+
|True-Predict|  0|  1|
+------------+---+---+
|           0|  4|  1|
|           1|  2|  3|
+------------+---+---+

preLabel: String = preLabel
preProb: String = prob
trueLabel: String = label
PredictDf: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 8 more fields]
TP: Double = 3.0
FP: Double = 1.0
TN: Double = 4.0
FN: Double = 2.0
Accuracy & Precision & Recall & F1
With the confusion matrix in hand, precision, recall, and the F1 score are all straightforward to compute.
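For reference, the standard definitions read off the confusion matrix are:

precision = TP / (TP + FP)
recall    = TP / (TP + FN)
F1        = 2 * precision * recall / (precision + recall) = 2 * TP / (2 * TP + FP + FN)
accuracy  = (TP + TN) / (TP + TN + FP + FN)

The same formulas apply to the negative class with TP and FP swapped for TN and FN.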
println("Accuracy: " + (TP + TN) / (TP + TN + FP + FN))
// Negative class
println("Neg precision: " + TN / (TN + FN))
println("Neg recall: " + TN / (FP + TN))
println("Neg F1Score: " + 2 * TN / (2 * TN + FP + FN))
// Positive class
println("Pos precision: " + TP / (TP + FP))
println("Pos recall: " + TP / (TP + FN))
println("Pos F1Score: " + 2 * TP / (2 * TP + FP + FN))
Accuracy: 0.7
Neg precision: 0.6666666666666666
Neg recall: 0.8
Neg F1Score: 0.7272727272727273
Pos precision: 0.75
Pos recall: 0.6
Pos F1Score: 0.6666666666666666
Alternatively, the same metrics can be computed directly with Spark's built-in MulticlassMetrics:
import org.apache.spark.mllib.evaluation.MulticlassMetrics

val predictionRDD = PredictDf.select(preLabel, trueLabel).as[(Double, Double)].rdd
val multiclassMetrics = new MulticlassMetrics(predictionRDD)
println("Accuracy: " + multiclassMetrics.accuracy)

val labels = multiclassMetrics.labels
labels.foreach { l =>
  println(s"Precision($l) = " + multiclassMetrics.precision(l))
  println(s"Recall($l) = " + multiclassMetrics.recall(l))
  println(s"F1Score($l) = " + multiclassMetrics.fMeasure(l))
}
Accuracy: 0.7
Precision(0.0) = 0.6666666666666666
Recall(0.0) = 0.8
F1Score(0.0) = 0.7272727272727272
Precision(1.0) = 0.75
Recall(1.0) = 0.6
F1Score(1.0) = 0.6666666666666665
import org.apache.spark.mllib.evaluation.MulticlassMetrics
predictionRDD: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[1160] at rdd at
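If you would rather not assemble the confusion matrix by hand at all, MulticlassMetrics also exposes it directly; a minimal sketch reusing the multiclassMetrics object above:

// Rows are true labels, columns are predicted labels,
// both ordered ascending as in multiclassMetrics.labels
println(multiclassMetrics.confusionMatrix)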
AUC and PRC
Spark provides methods for computing AUC and PRC (area under the precision-recall curve), but they require the data as an RDD. The probability field also has to be split apart, since the original column holds a Vector. First, the splitting code:
val aucDf = PredictDf.select(preProb, trueLabel).map(x => (
    x(0).asInstanceOf[Vector].toArray,
    x(0).asInstanceOf[Vector].toArray(0),
    x(0).asInstanceOf[Vector].toArray(1),
    x(1).toString.toDouble))
  .toDF("probVector", "prob0", "prob1", trueLabel)
aucDf.show()
+--------------------+-------------------+-------------------+-----+
|          probVector|              prob0|              prob1|label|
+--------------------+-------------------+-------------------+-----+
|[0.62708601946512...| 0.6270860194651235| 0.3729139805348764|  0.0|
|[0.25162713725554...| 0.2516271372555436| 0.7483728627444565|  1.0|
|[0.60308343184906...| 0.6030834318490638| 0.3969165681509361|  0.0|
|[0.56140826798745...|  0.561408267987451| 0.4385917320125489|  1.0|
|[0.78006275423495...|  0.780062754234951|0.21993724576504908|  0.0|
|[0.11822151224039...|0.11822151224039172| 0.8817784877596083|  1.0|
|[0.89776002662622...| 0.8977600266262256|0.10223997337377445|  0.0|
|[0.12412868907566...|0.12412868907566986| 0.8758713109243301|  1.0|
|[0.53044337453190...| 0.5304433745319013|0.46955662546809873|  1.0|
|[0.35865652594949...| 0.3586565259494905| 0.6413434740505095|  0.0|
+--------------------+-------------------+-------------------+-----+

aucDf: org.apache.spark.sql.DataFrame = [probVector: array
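On Spark 3.0 and later there is a simpler route that avoids the asInstanceOf[Vector] cast: org.apache.spark.ml.functions.vector_to_array converts the probability Vector into a plain array column. A sketch of that variant (assuming Spark 3.0+; aucDfAlt is an illustrative name, and earlier versions still need the manual split above):

import org.apache.spark.ml.functions.vector_to_array

// Convert the Vector column to array<double>, then index out the positive-class probability
val aucDfAlt = PredictDf
  .withColumn("probArr", vector_to_array(col(preProb)))
  .select(col("probArr").getItem(1).as("prob1"), col(trueLabel).cast("double").as(trueLabel))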
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

val predictionRDD1 = aucDf.select("prob1", trueLabel).as[(Double, Double)].rdd
val BinaryMetrics = new BinaryClassificationMetrics(predictionRDD1)
println("AUC: " + BinaryMetrics.areaUnderROC)
println("PRC: " + BinaryMetrics.areaUnderPR)
AUC: 0.92
PRC: 0.9183333333333333
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
predictionRDD1: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[1127] at rdd at
KS
A brief outline of the KS computation:
- Step the threshold from 0 to 1 in increments of 0.1
- At each threshold, compute TPR and FPR
- KS = max(TPR - FPR)
(0.0 to 1.0 by 0.1).toArray
res102: Array[Double] = Array(0.0, 0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6000000000000001, 0.7000000000000001, 0.8, 0.9, 1.0)
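Note the floating-point drift in the thresholds (0.30000000000000004 and so on). It is harmless here, since the values are only used in inequality filters, but a BigDecimal range sidesteps it if exact thresholds matter; a small sketch:

// NumericRange over BigDecimal keeps each step exact
(BigDecimal(0.0) to BigDecimal(1.0) by BigDecimal(0.1)).map(_.toDouble).toArray
// Array(0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)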
aucDf.filter(s"prob1 >= 0.1 and $trueLabel == 1").count().toDouble
res104: Double = 5.0
import scala.collection.mutable.ArrayBuffer

val Tpr_Fpr = ArrayBuffer[Double]()
var tp = 0.0
var fp = 0.0
var tn = 0.0
var fn = 0.0
for (threshold <- 0.0 to 1.0 by 0.1) {
  // predicted positive, actually positive
  tp = aucDf.filter(s"prob1 >= $threshold and $trueLabel == 1").count().toDouble
  // predicted positive, actually negative
  fp = aucDf.filter(s"prob1 >= $threshold and $trueLabel == 0").count().toDouble
  // predicted negative, actually negative
  tn = aucDf.filter(s"prob1 < $threshold and $trueLabel == 0").count().toDouble
  // predicted negative, actually positive
  fn = aucDf.filter(s"prob1 < $threshold and $trueLabel == 1").count().toDouble
  Tpr_Fpr.append(tp / (tp + fn) - fp / (fp + tn))
}
println("KS Value: " + Tpr_Fpr.max)
KS Value: 0.8
import scala.collection.mutable.ArrayBuffer
Tpr_Fpr: scala.collection.mutable.ArrayBuffer[Double] = ArrayBuffer(0.0, 0.0, 0.19999999999999996, 0.4, 0.8, 0.39999999999999997, 0.39999999999999997, 0.6, 0.4, 0.0, 0.0)
tp: Double = 0.0
fp: Double = 0.0
tn: Double = 5.0
fn: Double = 5.0
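The loop above fires four Spark jobs per threshold, which gets expensive on real data. Since BinaryClassificationMetrics already materializes the ROC curve, KS can instead be read off it in a single pass; a sketch reusing the BinaryMetrics object from the AUC section (it scans every distinct score as a threshold rather than the 0.1 grid, so the value can differ slightly):

// roc() yields (FPR, TPR) points along the curve; KS is the largest vertical gap
val ksFromRoc = BinaryMetrics.roc().map { case (fpr, tpr) => tpr - fpr }.max()
println("KS via roc(): " + ksFromRoc)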
Putting It All Together
Finally, wrap all of the metrics above into a single function for convenient reuse. There are not that many metrics, so the function simply prints all of them when called.
/**
  * Print all binary classification metrics for a prediction DataFrame.
  *
  * @param spark     the active SparkSession
  * @param PredictDf the raw prediction DataFrame; no need to split probVector beforehand
  * @param probName  name of the probability (Vector) column, default "probability"
  * @param preLabel  name of the predicted-label column, default "prediction"
  * @param trueLabel name of the true-label column, default "label"
  */
def BinaryClassificationModelMetrics3(spark: SparkSession, PredictDf: DataFrame, probName: String = "probability",
                                      preLabel: String = "prediction", trueLabel: String = "label"): Unit = {
  import spark.implicits._
  println("--------------------------------------- Confusion Matrix ------------------------------------------------")
  // --- Count TP, FP, FN, TN
  // actual positive, predicted positive
  val TP = PredictDf.filter(s"$preLabel == 1 and $trueLabel == 1").count().toDouble
  // actual negative, predicted positive
  val FP = PredictDf.filter(s"$preLabel == 1 and $trueLabel == 0").count().toDouble
  // actual negative, predicted negative
  val TN = PredictDf.filter(s"$preLabel == 0 and $trueLabel == 0").count().toDouble
  // actual positive, predicted negative
  val FN = PredictDf.filter(s"$preLabel == 0 and $trueLabel == 1").count().toDouble
  // Hand-assembled confusion matrix, plain ASCII
  println("\t " + "Pre Neg " + "Pre Pos " + "\n"
    + "True Neg " + TN + " " + FP + "\n"
    + "True Pos " + FN + " " + TP + "\n")
  // Or simply groupBy + pivot
  PredictDf.groupBy(s"$trueLabel")
    .pivot(s"$preLabel", (0 to 1)).count().na.fill(0.0)
    .orderBy(asc(s"$trueLabel"))
    .withColumnRenamed(s"$trueLabel", "True-Predict")
    .show(truncate = true)

  // ------ accuracy, recall, precision, F1 score
  println("---------------------------------- Accuracy&Precision&Recall&F1Score ------------------------------------")
  println("---------------------- Using MLlib")
  val predictionRDD = PredictDf.select(preLabel, trueLabel).as[(Double, Double)].rdd
  val multiclassMetrics = new MulticlassMetrics(predictionRDD)
  println("Accuracy: " + multiclassMetrics.accuracy)
  val labels = multiclassMetrics.labels
  labels.foreach { l =>
    println(s"Precision($l) = " + multiclassMetrics.precision(l))
    println(s"Recall($l) = " + multiclassMetrics.recall(l))
    println(s"F1Score($l) = " + multiclassMetrics.fMeasure(l))
  }
  println("---------------------- Without MLlib")
  println("Accuracy: " + (TP + TN) / (TP + TN + FP + FN))
  // negative class
  println("Neg precision: " + TN / (TN + FN))
  println("Neg recall: " + TN / (FP + TN))
  println("Neg F1Score: " + 2 * TN / (2 * TN + FP + FN))
  // positive class
  println("Pos precision: " + TP / (TP + FP))
  println("Pos recall: " + TP / (TP + FN))
  println("Pos F1Score: " + 2 * TP / (2 * TP + FP + FN))

  println("------------------------------------------ Auc&Prc&Ks ----------------------------------------------------")
  // ---- AUC and PRC
  val aucDf = PredictDf.select(probName, trueLabel).map(x => (
      x(0).asInstanceOf[Vector].toArray,
      x(0).asInstanceOf[Vector].toArray(0),
      x(0).asInstanceOf[Vector].toArray(1),
      x(1).toString.toDouble))
    .toDF("probVector", "prob0", "prob1", trueLabel)
  val predictionRDD1 = aucDf.select("prob1", trueLabel).as[(Double, Double)].rdd
  val BinaryMetrics = new BinaryClassificationMetrics(predictionRDD1)
  println("AUC: " + BinaryMetrics.areaUnderROC)
  println("PRC: " + BinaryMetrics.areaUnderPR)

  // --- KS
  val Tpr_Fpr = ArrayBuffer[Double]()
  var tp = 0.0
  var fp = 0.0
  var tn = 0.0
  var fn = 0.0
  for (threshold <- 0.0 to 1.0 by 0.1) {
    // predicted positive, actually positive
    tp = aucDf.filter(s"prob1 >= $threshold and $trueLabel == 1").count().toDouble
    // predicted positive, actually negative
    fp = aucDf.filter(s"prob1 >= $threshold and $trueLabel == 0").count().toDouble
    // predicted negative, actually negative
    tn = aucDf.filter(s"prob1 < $threshold and $trueLabel == 0").count().toDouble
    // predicted negative, actually positive
    fn = aucDf.filter(s"prob1 < $threshold and $trueLabel == 1").count().toDouble
    Tpr_Fpr.append(tp / (tp + fn) - fp / (fp + tn))
  }
  println("KS Value: " + Tpr_Fpr.max)
}
BinaryClassificationModelMetrics3: (spark: org.apache.spark.sql.SparkSession, PredictDf: org.apache.spark.sql.DataFrame, probName: String, preLabel: String, trueLabel: String)Unit
BinaryClassificationModelMetrics3(spark = spark, PredictDf = PredictDf, probName = "prob", preLabel = "preLabel", trueLabel = "label")
--------------------------------------- Confusion Matrix ------------------------------------------------
	 Pre Neg Pre Pos 
True Neg 4.0 1.0
True Pos 2.0 3.0

+------------+---+---+
|True-Predict|  0|  1|
+------------+---+---+
|           0|  4|  1|
|           1|  2|  3|
+------------+---+---+

---------------------------------- Accuracy&Precision&Recall&F1Score ------------------------------------
---------------------- Using MLlib
Accuracy: 0.7
Precision(0.0) = 0.6666666666666666
Recall(0.0) = 0.8
F1Score(0.0) = 0.7272727272727272
Precision(1.0) = 0.75
Recall(1.0) = 0.6
F1Score(1.0) = 0.6666666666666665
---------------------- Without MLlib
Accuracy: 0.7
Neg precision: 0.6666666666666666
Neg recall: 0.8
Neg F1Score: 0.7272727272727273
Pos precision: 0.75
Pos recall: 0.6
Pos F1Score: 0.6666666666666666
------------------------------------------ Auc&Prc&Ks ----------------------------------------------------
AUC: 0.92
PRC: 0.9183333333333332
KS Value: 0.8
2020-03-25, written at Jiulonghu, Jiangning District, Nanjing