// (Article note, converted to a comment so the file compiles.)
// The document classifier categorizes text into predefined categories.
// It is based on the maximum-entropy framework.
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
public class DocumentCategoriesPredit {

    /**
     * Loads a previously trained document-categorizer model and predicts the
     * category of a sample token sequence, printing the best category, the
     * full result string, and the per-category probabilities.
     *
     * @param args unused
     * @throws IOException if the model file cannot be read or parsed
     */
    public static void main(String[] args) throws IOException {
        String rootDir = System.getProperty("user.dir") + File.separator;
        String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
        // Path of the model produced by DocumentCategorizerTrain.
        String modelPath = modelResourcesDir + "en-documentCategorizer-my.bin";

        // try-with-resources: the original leaked the model InputStream.
        try (InputStream modelIn = new FileInputStream(modelPath)) {
            // Load the serialized model.
            DoccatModel model = new DoccatModel(modelIn);
            // Wrap it in a maximum-entropy categorizer.
            DocumentCategorizerME docCategorizer = new DocumentCategorizerME(model);
            // Categorize one tokenized document; returns one probability per category.
            double[] probs = docCategorizer.categorize(new String[]{"x", "y", "z"});
            System.out.println("最合适的分类:" + docCategorizer.getBestCategory(probs));
            System.out.println("所有可能的分类:" + docCategorizer.getAllResults(probs));
            for (int i = 0; i < probs.length; i++) {
                System.out.println("分类:" + docCategorizer.getCategory(i) + ";概率:" + probs[i]);
            }
        }
    }
}
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import opennlp.tools.cmdline.doccat.DoccatFineGrainedReportListener;
import opennlp.tools.doccat.DoccatFactory;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerEvaluator;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ObjectStreamUtils;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
public class DocumentCategorizerTrain {

    /**
     * Trains a maximum-entropy document categorizer on a small in-memory
     * sample set, serializes the model to disk, then evaluates it on the
     * same samples and prints the accuracy.
     *
     * @param args unused
     * @throws IOException if training data cannot be read or the model
     *                     cannot be written
     */
    public static void main(String[] args) throws IOException {
        String rootDir = System.getProperty("user.dir") + File.separator;
        String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
        // Where the trained model is written.
        String modelPath = modelResourcesDir + "en-documentCategorizer-my.bin";
        // NOTE(review): the original also opened resources/tokenizer.txt into a
        // PlainTextByLineStream that was never used (and never closed); that
        // dead, leaking code has been removed.

        // In-memory training samples: category label + tokenized document.
        ObjectStream<DocumentSample> sampleStream = ObjectStreamUtils.createObjectStream(
                new DocumentSample("c1", new String[]{"a", "b", "c"}),
                new DocumentSample("c1", new String[]{"a", "b", "c", "gg", "rr"}),
                new DocumentSample("c1", new String[]{"a", "b", "c", "ee", "rr"}),
                new DocumentSample("c2", new String[]{"x", "y", "z"}),
                new DocumentSample("c2", new String[]{"x", "y", "z"}),
                new DocumentSample("c2", new String[]{"x", "y", "z"}),
                new DocumentSample("c2", new String[]{"x", "y", "z"}),
                new DocumentSample("c2", new String[]{"x", "y", "z"}),
                new DocumentSample("c2", new String[]{"x", "y", "z"}),
                new DocumentSample("c2", new String[]{"x", "y", "z"}),
                new DocumentSample("c2", new String[]{"x", "y", "z", "nn", "kk"}),
                new DocumentSample("c2", new String[]{"x", "y", "z", "ff", "cc"}));

        TrainingParameters params = new TrainingParameters();
        params.put(TrainingParameters.ITERATIONS_PARAM, 200);
        // Cutoff 0 keeps every feature — needed for such a tiny data set.
        params.put(TrainingParameters.CUTOFF_PARAM, 0);
        DoccatFactory factory = new DoccatFactory();

        // BUG FIX: the original built `params` but trained with
        // TrainingParameters.defaultParams(), silently discarding the
        // iteration/cutoff settings above. Pass `params` instead.
        DoccatModel model = DocumentCategorizerME.train("en", sampleStream, params, factory);

        // Save the model. try-with-resources flushes and closes the streams;
        // the original never closed them, risking a truncated model file.
        try (OutputStream modelOut =
                new BufferedOutputStream(new FileOutputStream(new File(modelPath)))) {
            model.serialize(modelOut);
        }

        // BUG FIX: training consumed the stream; rewind it before evaluating,
        // otherwise the evaluator iterates over zero samples.
        sampleStream.reset();
        DocumentCategorizerEvaluator evaluator = new DocumentCategorizerEvaluator(
                new DocumentCategorizerME(model), new DoccatFineGrainedReportListener());
        evaluator.evaluate(sampleStream);
        System.out.println("正确率:" + evaluator.getAccuracy());
    }
}