import fregata.spark.data.LibSvmReader
import fregata.spark.metrics.classification.{AreaUnderRoc, Accuracy}
import fregata.spark.model.classification.LogisticRegression
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Created by ALL on 2016/12/8.
 *
 * Minimal end-to-end Fregata example: reads two LibSVM-formatted data sets,
 * trains a logistic regression model on the first, predicts on the second,
 * and prints the area under the ROC curve, the accuracy and the log loss.
 */
object FregataFirstTest {

  /**
   * Runs the example on a local Spark master and prints the three metrics
   * to standard output.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("test").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // Read the data through Fregata's API; the first element of the returned
      // pair is not needed here.
      // NOTE(review): 123 is presumably the feature dimension of the a9a data
      // set -- confirm against LibSvmReader.read.
      val (_, trainData) = LibSvmReader.read(sc, "/fregataData/a9a", 123)
      val (_, testData) = LibSvmReader.read(sc, "/fregataData/a9a.t", 123)

      // Build the logistic regression model from the training data.
      val model = LogisticRegression.run(trainData)

      // Predict classes for the test data. The result is consumed by three
      // separate evaluations below, so it is cached to avoid running the
      // prediction once per metric.
      val pd = model.classPredict(testData).cache()

      // Evaluate the model with accuracy, AUC and log loss.
      // Each record has the shape ((features, label), (p, c)).
      // NOTE(review): p looks like the predicted score/probability and c the
      // predicted class -- confirm against classPredict.
      val acc = Accuracy.of(pd.map {
        case ((_, l), (_, c)) => c -> l
      })
      val auc = AreaUnderRoc.of(pd.map {
        case ((_, l), (p, _)) => p -> l
      })
      // For rows labelled 1 the value p is passed through; for all other rows
      // 1 - p is passed instead.
      // NOTE(review): presumably so the third element is the probability
      // assigned to the true label -- confirm against fregata.spark.loss.log.
      val loss = fregata.spark.loss.log(pd.map {
        case ((_, l), (p, c)) =>
          if (l == 1d) (l, c, p) else (l, c, 1 - p)
      })

      println( s"AreaUnderRoc = $auc ")
      println( s"Accuracy = $acc ")
      println( s"LogLoss = $loss ")
    } finally {
      // Always release the SparkContext, even when the job fails part-way.
      sc.stop()
    }
  }
}
Fregata 的训练 API 要求输入数据的类型为 RDD[(fregata.Vector, fregata.Num)];预测 API 要求输入数据与训练数据类型相同,或者为 RDD[fregata.Vector] 格式的数据。