Kaggle笔记

xiaoxiao2021-11-30 20

kaggle比赛： https://www.kaggle.com/competitions 在这里可以看到一些机器学习中数据分析的问题，同时在kernels中可以看到哪些技术能够比较好的解决对应的问题。
经验贴： https://zhuanlan.zhihu.com/p/22266330
预测titanic上的人是否会幸存： https://www.kaggle.com/c/titanic/
使用spark的Random Forest:

package com.jd

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import java.lang.Math
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.rdd.RDD
import scala.util.Random

/**
 * Feature extraction shared by the training and test pipelines.
 *
 * Kept in its own object (rather than inside `testTitanic`) so that closures
 * shipped to executors do not trigger initialisation of `testTitanic`, which
 * would create a second SparkContext.
 */
private object TitanicFeatures {

  /**
   * Converts one CSV row into the 8 model features:
   * passengerId, pclass, sex, age, sibSp, parch, fare, embarked.
   *
   * Column positions are expressed relative to the Pclass column because
   * test.csv has the same layout as train.csv minus the `Survived` column.
   *
   * @param row       one CSV line already split on ","
   * @param pclassIdx index of the Pclass column (2 for train.csv, 1 for test.csv)
   * @param sexFlag   value used when the sex column is neither "male" nor "female"
   * @param fareFlag  value used when the fare column is empty
   * @return the feature values in the order listed above
   */
  def build(row: Array[String], pclassIdx: Int, sexFlag: Double, fareFlag: Double): Array[Double] = {
    // Parses column `idx`, or evaluates `fallback` when the cell is empty.
    def numberOr(idx: Int, fallback: => Double): Double = {
      val raw = row(idx)
      if (raw.isEmpty) fallback else raw.toDouble
    }

    val embarkedIdx = pclassIdx + 9

    val passengerId = row(0).toDouble
    val pclass = numberOr(pclassIdx, Math.random() * 3)
    val sex = row(pclassIdx + 2) match {
      case "male"   => 1.0
      case "female" => 0.0
      case _        => sexFlag // fall back to the least frequent sex
    }
    val age = numberOr(pclassIdx + 3, Math.random() * 100) // random age when missing
    val sibSp = numberOr(pclassIdx + 4, Math.random() * 8)
    val parch = numberOr(pclassIdx + 5, Math.random() * 6)
    // The ticket column (pclassIdx + 6) and cabin column (pclassIdx + 8) are not used.
    val fare = numberOr(pclassIdx + 7, fareFlag)
    // String.split(",") drops trailing empty fields, so a row with an empty
    // Embarked (and Cabin) column is shorter than the full column count.
    val embarked =
      if (row.length <= embarkedIdx) Math.random() * 2
      else row(embarkedIdx) match {
        case "C" => 0.0
        case "S" => 1.0
        case _   => 2.0
      }

    Array(passengerId, pclass, sex, age, sibSp, parch, fare, embarked)
  }
}

/**
 * Titanic survival prediction with Spark MLlib's RandomForest.
 *
 * Created by zhangwj on 16-9-7.
 */
object testTitanic {
  val conf = new SparkConf().setAppName("testTitanic").setMaster("local[2]")
  val sc = new SparkContext(conf)

  /** Directory the trained model is saved to and loaded from. */
  private val ModelPath = "target/tmp/myRandomForestClassificationModel"

  /**
   * Turns split rows of train.csv into labeled points.
   *
   * @param data     rows of train.csv split on "," (header removed)
   * @param sexFlag  value used when the sex column is neither "male" nor "female"
   * @param fareFlag value used when the fare column is empty
   * @return one LabeledPoint per row; the label is the Survived column (index 1)
   */
  def getTrainData(data: RDD[Array[String]], sexFlag: Double, fareFlag: Double): RDD[LabeledPoint] =
    data.map { row =>
      val features = TitanicFeatures.build(row, 2, sexFlag, fareFlag)
      LabeledPoint(row(1).toDouble, Vectors.dense(features))
    }

  /**
   * Turns split rows of test.csv into feature vectors.
   *
   * @param data     rows of test.csv split on "," (header removed)
   * @param sexFlag  value used when the sex column is neither "male" nor "female"
   * @param fareFlag value used when the fare column is empty
   * @return one dense vector per row; element 0 is the passenger id
   */
  def getTestData(data: RDD[Array[String]], sexFlag: Double, fareFlag: Double): RDD[org.apache.spark.mllib.linalg.Vector] =
    data.map(row => Vectors.dense(TitanicFeatures.build(row, 1, sexFlag, fareFlag)))

  /** Loads the previously saved forest and prints its debug string. */
  def loadModel(): Unit = {
    val sameModel = RandomForestModel.load(sc, ModelPath)
    println("Learned classification forest model:\n" + sameModel.toDebugString)
  }

  /**
   * Trains a random forest on train.csv, saves it, and writes predictions for
   * test.csv to "target/test.csv" as "passengerId,prediction" lines.
   *
   * Both CSV files must be prepared beforehand: commas inside the name field
   * have to be replaced, and the header line has to be removed.
   */
  def trainAndPredict(): Unit = {
    import org.apache.spark.mllib.tree.RandomForest

    // Read three times below (sex counts, fare mean, training set), so cache it.
    val orgData = sc.textFile("src/main/resources/train.csv").map(line => line.split(",")).cache()

    // Most frequent value of the sex column; the flag is the encoding of the
    // *other* sex, i.e. the least frequent one (male = 1.0, female = 0.0).
    val sexCount = orgData
      .map(row => (row(4), 1))
      .reduceByKey(_ + _)
      .sortBy(_._2, ascending = false)
      .first()
      ._1
    val sexFlag =
      if (sexCount.contentEquals("male")) 0.0
      else if (sexCount.contentEquals("female")) 1.0
      else Math.random() * 1

    // Mean over the fares that are actually present; counting missing fares
    // as 0 would drag the imputed value down.
    val fareFlag = orgData.map(row => row(9)).filter(_.nonEmpty).map(_.toDouble).mean()

    // Missing values are imputed with Math.random(), and training reads the
    // data more than once; caching keeps the imputed values stable across passes.
    val trainingData = getTrainData(orgData, sexFlag, fareFlag).cache()

    // Train a RandomForest model.
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    val numClasses = 2
    val categoricalFeaturesInfo = Map[Int, Int]()
    val numTrees = 10 // Use more in practice. Going from 10 to 100 trees actually lowered the score.
    val featureSubsetStrategy = "auto" // Let the algorithm choose.
    val impurity = "entropy"
    val maxDepth = 4
    val maxBins = 32

    val model = RandomForest.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo,
      numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)

    // Save the model so loadModel() can read it back later.
    model.save(sc, ModelPath)

    val testRows = sc.textFile("src/main/resources/test.csv").map(line => line.split(","))
    val testData = getTestData(testRows, sexFlag, fareFlag)

    // Element 0 of every feature vector is the passenger id.
    val labelAndPreds = testData.map(point => (point(0), model.predict(point)))

    labelAndPreds
      .coalesce(1)
      .map { case (id, prediction) => id.toInt + "," + prediction.toInt }
      .saveAsTextFile("target/test.csv")
  }

  def main(args: Array[String]): Unit = {
    try {
      // trainAndPredict()
      loadModel()
    } finally {
      // Release the SparkContext even when loading or training fails.
      sc.stop()
    }
  }
}

前期需要自己去读train.csv和test.csv文件，两个里面都包含缺失值（代码中包含了处理缺失值），并且name字段中有逗号出现（事先将逗号替换为:，这种只适合数据量小，如果数据量大使用专门csv包）。第一行是标题也要删除。
第一次提交使用的是10棵树，得分 0.80383。第二次将10棵树增加到100棵，结果准确率降低了。
使用GBDT模型，结果没有得到改善（ 0.75120）。
使用SVM模型，结果是 0.37321。
使用DNN：基于tensorflow的skflow，准确率都是0.74左右，其中采取的策略，使用的tanh激活函数，并且尝试了把每个特征都取log作为一个新的特征加入到原来的训练集中（相当于特征增加一倍），结果并没有显著提高，尝试把passengerId加入到特征中，准确率大大降低。之所以比不过RF的原因可能是训练集的数据量太小，不足以训练出较好的模型，样本数要远大于参数个数，比如一个3*10*2的全连接网络，参数个数就为3*10+10*2＝50个，如果样本的个数50个，那么远远不足以训练模型。
下一次尝试将 log增加的特征加入到RandomForest中，加入新的特征后，成绩反而也降低了。
使用sklearn的Random Forest，entropy要优于gini。

转载请注明原文地址: https://ju.6miu.com/read-678935.html

专利

最新回复(0)