我们直接使用 Spark MLlib 的包对 GBDT 和 LR 进行融合。
首先,我们需要导入的包如下所示:
import org.apache.spark.sql. Row
import scala.collection.mutable
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, FeatureType, Strategy}
import org.apache.spark.mllib.tree.model.Node
import com.suning.aps.util.handle_data.deleteHDFS
import com.suning.aps.utils.StringUtil.getSign
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.types._
// Obtain (or create) a Hive-enabled Spark session for this job.
val spark = SparkSession
  .builder()
  .appName("wap_cpc_searchApp_v")
  .enableHiveSupport()
  .getOrCreate()
import spark.implicits._

// Job parameters: `instance` and `statis_day` are combined into an HDFS path further down.
val instance = "yuntu/cpc_ctr/hotmai_02&20"
val statis_day = "20191207"
// Not referenced anywhere in the visible part of this file.
val statis_day_before = "20191205"
// Presumably a parallelism / partition count; it is not referenced in the visible code.
val parallels = 50

// Project helper called with this run's path; presumably it removes that HDFS location so the
// job can be re-run -- confirm against the deleteHDFS implementation.
deleteHDFS(instance + "/wap_cpc_searchApp_v/" + statis_day, spark.sparkContext)

// Load the raw click log and turn every selected column into a Double.
//
// NOTE(review): the SQL text below ends after the `terminal` column with a trailing comma and has
// no FROM clause, while the projection that follows expects columns such as is_click, userid and
// idea_brand_name. The rest of the query is missing from this file and must be restored before
// this statement can run.
val pv_click_searchApp1 = spark.sql(
  s"""select
     |case when length(ta.query)=0 then '-' else ta.query end as query,
     |case when length(ta.ideaid)=0 then '00000000000000000-'
     |when ta.ideaid is NULL then '00000000000000000-'
     |else lpad(ta.ideaid,18,'0') end as ideaid,
     |case when length(ta.terminal)=0 then '-' else ta.terminal end as terminal,""".stripMargin)
  .select(
    "is_click", "query", "terminal", "query_brand_name", "query_third_categ", "userid",
    "ideaid", "idea_first_categ", "idea_second_categ",
    "idea_third_categ", "idea_brand_name")
  .rdd
  .map { l =>
    // Encode one column through the project's getSign helper and parse the result as a Double.
    def hashed(i: Int): Double = getSign(l(i).toString).toString.toDouble

    // Column 0 (is_click) is the label and is parsed as-is. Column 2 (terminal) is selected
    // above but deliberately not part of the output row, exactly as in the original mapping.
    Row(
      l(0).toString.toDouble,
      hashed(1),
      hashed(3),
      hashed(4),
      hashed(5),
      hashed(6),
      hashed(7),
      hashed(8),
      hashed(9),
      hashed(10))
  }
// The data was transformed with a plain RDD map (many of the original columns are discrete and
// had to be hash-encoded), so it has to be converted back into a DataFrame with an explicit schema.
//
// Layout: label column "y" followed by nine feature columns "x1".."x9", all non-nullable doubles.
// An immutable Seq is used because StructType.apply takes a Seq[StructField]; a mutable.ArraySeq
// is not accepted for that parameter on Scala 2.13 builds, where Seq means immutable.Seq.
val ScoreSchema = StructType(
  StructField("y", DoubleType, nullable = false) +:
    (1 to 9).map(i => StructField(s"x$i", DoubleType, nullable = false))
)
val df = spark.createDataFrame(pv_click_searchApp1, ScoreSchema)
// Columns that must not be used as features.
val ignored = List("y")
// Positions of the feature columns inside df.
val featInd = df.columns.diff(ignored).map(df.columns.indexOf(_))
// Get index of target
val targetInd = df.columns.indexOf("y")

// Convert every row into an MLlib LabeledPoint: the label comes from the target column and the
// dense feature vector from the remaining columns, in schema order.
val ds = df.rdd.map { r =>
  LabeledPoint(
    r.getDouble(targetInd), // Get target value
    // Map feature indices to values
    Vectors.dense(featInd.map(r.getDouble(_)))
  )
}
// Train the GBDT
// Number of boosting iterations; the ensemble contains one tree per iteration.
val numTrees = 2
// Train a GradientBoostedTrees model.
val boostingStrategy = BoostingStrategy.defaultParams("Classification")
boostingStrategy.treeStrategy.numClasses = 2
boostingStrategy.treeStrategy.maxDepth = 3
boostingStrategy.learningRate = 0.3
// numIterations is set exactly once, from numTrees, so the number of trained trees cannot drift
// away from the value the rest of the script uses to index model.trees.
boostingStrategy.setNumIterations(numTrees)
// Empty categoricalFeaturesInfo indicates all features are continuous.
boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()
val model = GradientBoostedTrees.train(ds, boostingStrategy)
// treeLeafArray(i) holds the leaf node ids of tree i, in the order returned by getLeafNodes.
// The array is allocated first and filled in a separate loop so that the val definition itself
// does not reference getLeafNodes, which is defined further down in this file.
val treeLeafArray = new Array[Array[Int]](numTrees)
for (i <- 0.until(numTrees)) {
  treeLeafArray(i) = getLeafNodes(model.trees(i).topNode)
}

// Debug output: list the leaf node ids of every tree.
for (i <- 0.until(numTrees)) {
  // The format string is applied explicitly; println(fmt, i) would print the (String, Int)
  // tuple and leave %d unexpanded.
  println("正在打印第%d 棵树的 topnode 叶子节点".format(i))
  // Print the leaf ids themselves rather than their positions in the array.
  treeLeafArray(i).foreach(println)
}
/**
 * Collects the ids of all leaf nodes in the subtree rooted at `node`.
 *
 * Leaves of the left child are listed before leaves of the right child.
 *
 * @param node root of the (sub)tree to walk
 * @return ids of the leaf nodes; a single-element array when `node` itself is a leaf
 */
def getLeafNodes(node: Node): Array[Int] =
  if (node.isLeaf) Array(node.id)
  else getLeafNodes(node.leftNode.get) ++ getLeafNodes(node.rightNode.get)
// predict decision tree leaf's node value
/**
 * Walks the decision tree from `node` and returns the id of the leaf that `features` ends up in.
 *
 * @param node     node to start from (normally the tree's top node)
 * @param features feature vector of the sample being routed
 * @return id of the leaf node reached
 */
def predictModify(node: Node, features: DenseVector): Int = {
  if (node.isLeaf) {
    node.id
  } else {
    val currentSplit = node.split.get
    val featureValue = features(currentSplit.feature)
    // Continuous splits send values <= threshold to the left; any other split type sends
    // values contained in the split's category list to the left.
    val goesLeft = currentSplit.featureType match {
      case FeatureType.Continuous => featureValue <= currentSplit.threshold
      case _ => currentSplit.categories.contains(featureValue)
    }
    val next = if (goesLeft) node.leftNode.get else node.rightNode.get
    predictModify(next, features)
  }
}
// Build new features from the GBDT: for every sample, each tree contributes a one-hot block
// marking the leaf the sample falls into; the blocks of all trees are concatenated.
val newFeatureDataSet = df.rdd.map { x =>
  // Column 0 is the label, columns 1..9 are the original features.
  val label = x(0).toString().toDouble
  val features = new DenseVector((1 to 9).map(i => x(i).toString().toDouble).toArray)

  val newFeature = (0 until numTrees).flatMap { i =>
    val tree = model.trees(i)
    val treePredict = predictModify(tree.topNode, features)
    // gbdt tree is binary tree, so the block is sized as (number of nodes + 1) / 2, the leaf count
    val treeArray = new Array[Double]((tree.numNodes + 1) / 2)
    // Position of the reached leaf within this tree's leaf list selects the hot slot.
    treeArray(treeLeafArray(i).indexOf(treePredict)) = 1
    treeArray
  }.toArray

  (label, newFeature)
}
newFeatureDataSet.take(2).foreach(println)

// Wrap the transformed samples as LabeledPoints for the logistic regression stage.
val newData = newFeatureDataSet.map(x => LabeledPoint(x._1, new DenseVector(x._2)))
newData.take(2).foreach(println)
// 80/20 random split of the GBDT-transformed data for the logistic regression stage.
val splits2 = newData.randomSplit(Array(0.8, 0.2))
val train2 = splits2(0)
val test2 = splits2(1)

// Accuracy of the GBDT model alone, measured on ds (the same data it was trained on).
val predictions = ds.map(lp => model.predict(lp.features))
predictions.take(10).foreach(println)
val predictionAndLabel = predictions.zip(ds.map(_.label))
// Multiplying by 1.0 first forces floating-point division of the two counts.
val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / ds.count()
println("GBTR accuracy " + accuracy)

// Binary logistic regression on the leaf-indicator features; predictions use a 0.01 threshold.
val model1 = new LogisticRegressionWithLBFGS().setNumClasses(2).run(train2).setThreshold(0.01)
// Bare expression: evaluates the learned weights (only displayed when run interactively).
model1.weights

// Score the held-out split and pair every prediction with its true label.
val predictionAndLabels = test2.map(point => (model1.predict(point.features), point.label))

// The reported figure is MulticlassMetrics.accuracy, printed under the label "Precision".
val metrics = new MulticlassMetrics(predictionAndLabels)
val precision = metrics.accuracy
println("Precision = " + precision)
上一篇:分别判断了a、b两数的0-1
下一篇:闪闪发光的少年