
[NLP] Using the OpenNLP Document Categorizer

姚臻
2023-12-01

Contents

 

Document Categorizer

Model Training

Document Categorization


Document Categorizer

The document categorizer classifies text into predefined categories. It is based on the maximum entropy framework: a model is first trained on labeled sample documents and is then used to assign categories to new text.
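As a side note (not part of the original listings): OpenNLP's standard training format for the document categorizer is plain text with one document per line, where each line starts with the category label followed by the document text. A hypothetical file in that format, mirroring the hard-coded samples used later in this article, might look like this:

    c1 a b c
    c1 a b c gg rr
    c2 x y z
    c2 x y z nn kk

This is the format that opennlp.tools.doccat.DocumentSampleStream parses into DocumentSample objects; a sketch that trains from such a file follows the training listing below.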

Model Training


import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

import opennlp.tools.cmdline.doccat.DoccatFineGrainedReportListener;
import opennlp.tools.doccat.DoccatFactory;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerEvaluator;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ObjectStreamUtils;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

public class DocumentCategorizerTrain {

    public static void main(String[] args) throws IOException {
        String rootDir = System.getProperty("user.dir") + File.separator;

        String fileResourcesDir = rootDir + "resources" + File.separator;
        String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;

        // Path of the training data file
        String filePath = fileResourcesDir + "tokenizer.txt";
        // Path where the trained model is saved
        String modelPath = modelResourcesDir + "en-documentCategorizer-my.bin";

        // Read the training data file line by line
        // (not actually used below; the samples are hard-coded instead)
        InputStreamFactory inputStreamFactory = new MarkableFileInputStreamFactory(new File(filePath));
        ObjectStream<String> lineStream = new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8);

        // Build the training samples: each DocumentSample is a category label plus the document tokens
        ObjectStream<DocumentSample> sampleStream = ObjectStreamUtils.createObjectStream(
                new DocumentSample("c1", new String[]{"a", "b", "c"}),
                new DocumentSample("c1", new String[]{"a", "b", "c", "gg", "rr"}),
                new DocumentSample("c1", new String[]{"a", "b", "c", "ee", "rr"}),
                new DocumentSample("c2", new String[]{"x", "y", "z"}),
                new DocumentSample("c2", new String[]{"x", "y", "z"}),
                new DocumentSample("c2", new String[]{"x", "y", "z"}),
                new DocumentSample("c2", new String[]{"x", "y", "z"}),
                new DocumentSample("c2", new String[]{"x", "y", "z"}),
                new DocumentSample("c2", new String[]{"x", "y", "z"}),
                new DocumentSample("c2", new String[]{"x", "y", "z"}),
                new DocumentSample("c2", new String[]{"x", "y", "z", "nn", "kk"}),
                new DocumentSample("c2", new String[]{"x", "y", "z", "ff", "cc"}));

        // Training parameters: 200 iterations, no feature cutoff
        TrainingParameters params = new TrainingParameters();
        params.put(TrainingParameters.ITERATIONS_PARAM, 200);
        params.put(TrainingParameters.CUTOFF_PARAM, 0);

        DoccatFactory factory = new DoccatFactory();

        // Train the model
        DoccatModel model = DocumentCategorizerME.train("en", sampleStream, params, factory);

        // Save the model
        FileOutputStream fos = new FileOutputStream(new File(modelPath));
        OutputStream modelOut = new BufferedOutputStream(fos);
        model.serialize(modelOut);
        modelOut.close();

        // Evaluate the model on the training samples
        // (reset the stream first, since training has already consumed it)
        sampleStream.reset();
        DocumentCategorizerEvaluator evaluator = new DocumentCategorizerEvaluator(
                new DocumentCategorizerME(model), new DoccatFineGrainedReportListener());
        evaluator.evaluate(sampleStream);
        System.out.println("Accuracy: " + evaluator.getAccuracy());
    }
}
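
The listing above builds lineStream from tokenizer.txt but then trains on hard-coded DocumentSample objects, so the file is never actually used. Below is a minimal sketch (not from the original article) of how one could train directly from a file in the one-document-per-line format shown earlier, using opennlp.tools.doccat.DocumentSampleStream; the file name doccat-train.txt is a placeholder.

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import opennlp.tools.doccat.DoccatFactory;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.doccat.DocumentSampleStream;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

public class DocumentCategorizerTrainFromFile {

    public static void main(String[] args) throws IOException {
        // Placeholder path; the file is assumed to contain "category<space>document text" per line
        String filePath = System.getProperty("user.dir") + File.separator
                + "resources" + File.separator + "doccat-train.txt";

        InputStreamFactory inputStreamFactory = new MarkableFileInputStreamFactory(new File(filePath));
        ObjectStream<String> lineStream = new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8);

        // DocumentSampleStream parses each line into a DocumentSample (category label + tokens)
        ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);

        TrainingParameters params = new TrainingParameters();
        params.put(TrainingParameters.ITERATIONS_PARAM, 200);
        params.put(TrainingParameters.CUTOFF_PARAM, 0);

        DoccatModel model = DocumentCategorizerME.train("en", sampleStream, params, new DoccatFactory());
        sampleStream.close();

        // The model can now be serialized exactly as in the listing above
        System.out.println("Categories in the model: " + new DocumentCategorizerME(model).getNumberOfCategories());
    }
}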

 

Document Categorization

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;

public class DocumentCategorizerPredict {

    public static void main(String[] args) throws IOException {
        String rootDir = System.getProperty("user.dir") + File.separator;

        String fileResourcesDir = rootDir + "resources" + File.separator;
        String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;

        //String filePath = fileResourcesDir + "sentenceDetector.txt";
        String modelPath = modelResourcesDir + "en-documentCategorizer-my.bin";

        // Load the model trained in the previous section
        InputStream modelIn = new FileInputStream(modelPath);
        DoccatModel model = new DoccatModel(modelIn);
        modelIn.close();

        // Instantiate the categorizer
        DocumentCategorizerME docCategorizer = new DocumentCategorizerME(model);

        // Categorize a tokenized document; the result is one probability per category
        double[] bProbs = docCategorizer.categorize(new String[]{"x", "y", "z"});
        System.out.println("Best category: " + docCategorizer.getBestCategory(bProbs));
        System.out.println("All results: " + docCategorizer.getAllResults(bProbs));
        for (int i = 0; i < bProbs.length; i++) {
            System.out.println("Category: " + docCategorizer.getCategory(i) + "; probability: " + bProbs[i]);
        }
    }
}
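
Besides the probability-array API used above, recent OpenNLP releases (1.8.3 and later) also expose scoreMap on DocumentCategorizerME, which returns each category mapped directly to its probability. A minimal sketch, assuming such a version and a model loaded as in the listing above:

import java.util.Map;

import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;

public class DocumentCategorizerScoreMapDemo {

    // Prints the probability of every category for the given tokenized document
    static void printScores(DoccatModel model, String[] tokens) {
        DocumentCategorizerME docCategorizer = new DocumentCategorizerME(model);
        Map<String, Double> scores = docCategorizer.scoreMap(tokens);
        for (Map.Entry<String, Double> entry : scores.entrySet()) {
            System.out.println("Category: " + entry.getKey() + "; probability: " + entry.getValue());
        }
    }
}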

 

 
