package fst;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;
class FSTDic{
FST fst ;
FST.BytesReader fstReader;
public FSTDic() throws IOException{
File file=new File("fst");
if(file.exists()){
fst=load(file);
}else{
List words=new ArrayList();
words.add("中国");
words.add("中国人");
words.add("中国人民");
words.add("中国人民解放军");
fst=build(words);
}
fstReader = fst.getBytesReader();
}
public void save() throws IOException{
fst.save(new File("fst"));
}
public FST load(File file) throws IOException{
return new FST(new InputStreamDataInput(new FileInputStream("fst")),ByteSequenceOutputs.getSingleton() );
}
private FST build(List words) throws IOException{
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
Builder builder = new Builder(FST.INPUT_TYPE.BYTE4, outputs);
final IntsRef scratchIntsRef = new IntsRef();
BytesRef output = new BytesRef(4);
for(String word: words){
NumericUtils.intToPrefixCodedBytes(word.length(), 0, output);
builder.add(Util.toUTF32(word, scratchIntsRef), BytesRef.deepCopyOf(output));
}
return builder.finish();
}
public boolean contains(String word) throws IOException{
FST.Arc scratchArc = new FST.Arc();
int bufUpto=0,buflen=word.length();
BytesRef pendingOutput=fst.outputs.getNoOutput();
BytesRef matchOutput = null;
fst.getFirstArc(scratchArc);
while(bufUpto
int codePoint=Character.codePointAt(word,bufUpto);
if(fst.findTargetArc(codePoint, scratchArc, scratchArc, fstReader)!=null){
pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
}else{
break;
}
bufUpto += Character.charCount(codePoint);
}
if(scratchArc.isFinal()){
matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
int len= NumericUtils.prefixCodedToInt(matchOutput);
System.out.println(len);
return true;
}
return false;
}
public static void main(String[] args) throws IOException {
FSTDic dic=new FSTDic();
//dic.save();
System.out.println(dic.contains("中国"));
System.out.println(dic.contains("中国人"));
System.out.println(dic.contains("中国人民"));
System.out.println(dic.contains("中国人民解放军"));
}
}
// (Rough notes; to be completed later...)