好的,我跳过了数据帧,创建了一个标签点数组,可以很容易地将其转换为RDD。剩下的很简单。
val mDF: DataFrame = spark.read.option("header", "true").option("inferSchema", "true").csv("src/test/resources/knimeMergedTRimmedVariables.csv")
val mDFTyped = castAllTypedColumnsTo(mDF, IntegerType, DoubleType)
val indexer = new StringIndexer()
.setInputCol("Majors_Final")
.setOutputCol("Majors_Final_Indexed")
val mDFTypedIndexed = indexer.fit(mDFTyped).transform(mDFTyped)
val mDFFinal = castColumnTo(mDFTypedIndexed,"Majors_Final_Indexed", IntegerType)
mDFFinal.show()
//only doubles accepted by sparse vector, so that's what we filter for
val fieldSeq: scala.collection.Seq[StructField] = mDFFinal.schema.fields.toSeq.filter(f => f.dataType == DoubleType)
val fieldNameSeq: Seq[String] = fieldSeq.map(f => f.name)
var positionsArray: ArrayBuffer[LabeledPoint] = ArrayBuffer[LabeledPoint]()
mDFFinal.collect().foreach
{
row => positionsArray+=convertRowToLabeledPoint(row,fieldNameSeq,row.getAs("Majors_Final_Indexed"));
}
val mRdd:RDD[LabeledPoint]= spark.sparkContext.parallelize(positionsArray.toSeq)
MLUtils.saveAsLibSVMFile(mRdd, "./output/libsvm")