Chunking divides a text into syntactically related groups of words, such as noun phrases and verb phrases, without specifying their internal structure or the role they play in the main sentence.
The input format of the training data is as follows:
```
Rockwell NNP B-NP
International NNP I-NP
Corp. NNP I-NP
's POS B-NP
Tulsa NNP I-NP
unit NN I-NP
said VBD B-VP
it PRP B-NP
signed VBD B-VP
a DT B-NP
tentative JJ I-NP
agreement NN I-NP
extending VBG B-VP
its PRP$ B-NP
contract NN I-NP
with IN B-PP
Boeing NNP B-NP
Co. NNP I-NP
to TO B-VP
provide VB I-VP
structural JJ B-NP
parts NNS I-NP
for IN B-PP
Boeing NNP B-NP
's POS B-NP
747 CD I-NP
jetliners NNS I-NP
```
Annotation notes:
Each line has three columns: the word, its part-of-speech tag, and its chunk tag. The chunk tag produced by the chunker consists of two parts: the word's position within the chunk and the chunk type. For example, in B-NP the B means the word begins a new chunk and NP means the chunk is a noun phrase; in I-NP the I means the word is inside (continues) the current noun-phrase chunk. Other chunk types in the sample include VP (verb phrase) and PP (prepositional phrase).
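To make the B/I scheme concrete, the small sketch below groups a few of the tags from the sample above into bracketed phrases by hand; the class name and hard-coded arrays are for illustration only.

```java
// Illustration only: collapse B-/I- chunk tags into bracketed phrases.
public class BioGroupingDemo {
    public static void main(String[] args) {
        String[] tokens = { "Rockwell", "International", "Corp.", "'s", "Tulsa", "unit", "said" };
        String[] chunks = { "B-NP", "I-NP", "I-NP", "B-NP", "I-NP", "I-NP", "B-VP" };
        StringBuilder phrase = new StringBuilder();
        String type = null;
        for (int i = 0; i < tokens.length; i++) {
            if (chunks[i].startsWith("B-")) {          // a new chunk starts here
                if (type != null) {
                    System.out.println("[" + type + " " + phrase.toString().trim() + "]");
                }
                type = chunks[i].substring(2);          // chunk type: NP, VP, PP, ...
                phrase.setLength(0);
            }
            phrase.append(tokens[i]).append(' ');
        }
        if (type != null) {
            System.out.println("[" + type + " " + phrase.toString().trim() + "]");
        }
        // Prints:
        // [NP Rockwell International Corp.]
        // [NP 's Tulsa unit]
        // [VP said]
    }
}
```

The training program below reads this format with ChunkSampleStream, trains a maximum-entropy chunker, then serializes the model to disk: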
```java
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import opennlp.tools.chunker.ChunkSample;
import opennlp.tools.chunker.ChunkSampleStream;
import opennlp.tools.chunker.ChunkerEvaluator;
import opennlp.tools.chunker.ChunkerFactory;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.eval.FMeasure;
public class ChunkerTrain {
public static void main(String[] args) throws IOException {
String rootDir = System.getProperty("user.dir") + File.separator;
String fileResourcesDir = rootDir + "resources" + File.separator;
String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
// Path to the training data
String filePath = fileResourcesDir + "chunker.txt";
// Path where the trained model will be saved
String modelPath = modelResourcesDir + "en-chunker-my.bin";
// Read the training file line by line
InputStreamFactory inputStreamFactory = new MarkableFileInputStreamFactory(new File(filePath));
ObjectStream<String> lineStream = new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8);
// Convert each annotated sentence into a ChunkSample (word, POS tag, chunk tag)
ObjectStream<ChunkSample> sampleStream = new ChunkSampleStream(lineStream);
ChunkerFactory factory = new ChunkerFactory();
// Train the chunker model with the default training parameters
ChunkerModel model = ChunkerME.train("en", sampleStream, TrainingParameters.defaultParams(), factory);
// Save the trained model to disk
FileOutputStream fos = new FileOutputStream(new File(modelPath));
OutputStream modelOut = new BufferedOutputStream(fos);
model.serialize(modelOut);
modelOut.close();
// Evaluate the model: getFMeasure() is only meaningful after evaluate() has run,
// so re-read the training data (reset() works because a MarkableFileInputStreamFactory was used)
ChunkerEvaluator evaluator = new ChunkerEvaluator(new ChunkerME(model));
sampleStream.reset();
evaluator.evaluate(sampleStream);
FMeasure fm = evaluator.getFMeasure();
System.out.println("FMeasure:" + fm.getFMeasure() + ";precision=" + fm.getPrecisionScore() + ";recall=" + fm.getRecallScore());
}
}
```
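Note that the evaluation above re-reads the training data, so the reported F-measure is an optimistic training-set score. For a realistic score, evaluate on a held-out file in the same format. The helper below is a sketch that could be added to ChunkerTrain; "chunker-test.txt" is a hypothetical test file, not part of the original example.

```java
// Sketch: evaluate a trained model on a held-out test file (same word/POS/chunk format).
// testFilePath would point to a hypothetical file such as resources/chunker-test.txt.
static void evaluateOnHeldOut(ChunkerModel model, String testFilePath) throws IOException {
    InputStreamFactory testFactory = new MarkableFileInputStreamFactory(new File(testFilePath));
    ObjectStream<ChunkSample> testSamples =
            new ChunkSampleStream(new PlainTextByLineStream(testFactory, StandardCharsets.UTF_8));
    ChunkerEvaluator evaluator = new ChunkerEvaluator(new ChunkerME(model));
    // Runs the chunker over every test sentence and accumulates precision/recall counts
    evaluator.evaluate(testSamples);
    testSamples.close();
    FMeasure fm = evaluator.getFMeasure();
    System.out.println("FMeasure:" + fm.getFMeasure()
            + ";precision=" + fm.getPrecisionScore()
            + ";recall=" + fm.getRecallScore());
}
```

The second program loads OpenNLP's pre-trained en-chunker.bin model and chunks a tokenized, POS-tagged sentence: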
```java
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.util.Sequence;
public class ChunkerPredict {
public static void main(String[] args) throws IOException {
String rootDir = System.getProperty("user.dir") + File.separator;
String fileResourcesDir = rootDir + "resources" + File.separator;
String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
String modelPath = modelResourcesDir + "en-chunker.bin";
InputStream modelIn = new FileInputStream(modelPath);
// Load the pre-trained chunker model
ChunkerModel model = new ChunkerModel(modelIn);
// Instantiate the chunker
ChunkerME chunker = new ChunkerME(model);
// Chunk a tokenized sentence together with its POS tags
String sent[] = new String[] { "Rockwell", "International", "Corp.", "'s",
"Tulsa", "unit", "said", "it", "signed", "a", "tentative", "agreement",
"extending", "its", "contract", "with", "Boeing", "Co.", "to",
"provide", "structural", "parts", "for", "Boeing", "'s", "747",
"jetliners", "." };
String pos[] = new String[] { "NNP", "NNP", "NNP", "POS", "NNP", "NN",
"VBD", "PRP", "VBD", "DT", "JJ", "NN", "VBG", "PRP$", "NN", "IN",
"NNP", "NNP", "TO", "VB", "JJ", "NNS", "IN", "NNP", "POS", "CD", "NNS",
"." };
String tag[] = chunker.chunk(sent, pos);
// Probabilities of the chunk tags assigned in the last chunk() call
double chunkerProbs[] = chunker.probs();
for(String str:tag){
System.out.print(str+",");
}
System.out.println();
for(double pro:chunkerProbs){
System.out.print(pro+",");
}
// The k best chunk tag sequences for the sentence, with their scores
Sequence[] sentens = chunker.topKSequences(sent, pos);
System.out.println();
for(Sequence se:sentens){
System.out.print(se+",");
}
}
}
```
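Besides the per-token tags and probabilities printed above, ChunkerME can return whole phrases directly via chunkAsSpans. The fragment below is a sketch meant to be dropped into the main method of ChunkerPredict after chunker, sent, and pos are set up; the printed format is only an illustration.

```java
// Requires the additional import: opennlp.tools.util.Span
// One Span per chunk: start/end token indices plus the chunk type (NP, VP, PP, ...)
Span[] spans = chunker.chunkAsSpans(sent, pos);
// Join the covered tokens back into readable phrase strings
String[] phrases = Span.spansToStrings(spans, sent);
for (int i = 0; i < spans.length; i++) {
    System.out.println(spans[i].getType() + ": " + phrases[i]);
}
```

This is convenient when downstream code needs the noun or verb phrases themselves rather than token-level labels.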