当前位置: 首页 > 工具软件 > CLucene > 使用案例 >

CLucene中StandardAnalyzer分词流程

夏立果
2023-12-01

CLucene中StandardAnalyzer用到了职责链模式(Chain of Responsibility),相关代码如下:

// First link of the analysis chain: splits raw character input from the
// Reader into typed tokens. The downstream TokenFilters wrap this stream.
class CLUCENE_EXPORT StandardTokenizer: public Tokenizer {
public:  // next() must be public so client code / wrapping filters can call it
    // Fills `token` with the next token from the input.
    // Returns `token` on success, NULL when the stream is exhausted.
    Token* next(Token* token);
};  // note: class definition requires a trailing semicolon

// TokenFilter that normalizes tokens produced by StandardTokenizer
// (per the article below: strips trailing dots from acronyms and
// the English possessive "'s"). Second link in the filter chain.
class CLUCENE_EXPORT StandardFilter: public TokenFilter{
    public:
        // Construct filtering <i>in</i>.
        // deleteTokenStream: if true, this filter owns `in` and deletes it.
        StandardFilter(TokenStream* in, bool deleteTokenStream);

        virtual ~StandardFilter();

     
        // Returns the next normalized token, or NULL when exhausted.
        Token* next(Token* token);
    };

// TokenFilter that lower-cases token text. Third link in the filter chain.
class CLUCENE_EXPORT LowerCaseFilter: public TokenFilter {
public:
    // deleteTokenStream: if true, this filter owns `in` and deletes it.
    LowerCaseFilter(TokenStream* in, bool deleteTokenStream);
    virtual ~LowerCaseFilter();
    // Returns the next token with its text lower-cased, or NULL when exhausted.
    Token* next(Token* token);
};

// Outermost link of the filter chain: drops tokens that match the
// stop-word list and forwards everything else to the caller.
class CLUCENE_EXPORT StopFilter: public TokenFilter {
public:  // members must be public; as originally quoted they were implicitly private
    // _stopWords: array of stop words to suppress.
    // _ignoreCase: match stop words case-insensitively when true.
    // deleteTokenStream: if true, this filter owns `in` and deletes it.
    StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** _stopWords, const bool _ignoreCase = false);

    virtual ~StopFilter();
    // Returns the next non-stop-word token, or NULL when exhausted.
    Token* next(Token* token);
};  // note: class definition requires a trailing semicolon

使用者:

// Builds the analysis chain for one field:
//   StandardTokenizer -> StandardFilter -> LowerCaseFilter -> StopFilter
// The returned stream is the outermost StopFilter; each wrapper is
// constructed with deleteTokenStream=true, so deleting the returned
// object tears down the whole chain.
TokenStream* StandardAnalyzer::tokenStream(const TCHAR* /*fieldName*/, Reader* reader)
    {
        BufferedReader* bufferedReader = reader->__asBufferedReader();
        TokenStream* ret;

        // Wrap a non-buffered reader; the tokenizer then owns the wrapper
        // (second arg true) while the wrapper does not own `reader` (false).
        if ( bufferedReader == NULL )
            ret = _CLNEW StandardTokenizer( _CLNEW FilteredBufferedReader(reader, false), true );
        else
            ret = _CLNEW StandardTokenizer(bufferedReader);
        //ret->setMaxTokenLength(maxTokenLength);
        // Each filter takes ownership of the stream it wraps (true).
        ret = _CLNEW StandardFilter(ret,true);
        ret = _CLNEW LowerCaseFilter(ret,true);
        ret = _CLNEW StopFilter(ret,true, stopSet);
        return ret;
    }

最后返回的是StopFilter对象

当客户端:

while ( pStream->next(&t) != NULL ) 
 {               
  }
这样循环调用时,最外层的StopFilter对象的next()最先被调用,它再沿职责链依次委托给内层的各个TokenStream。

StandardTokenizer 类完成主要工作:Token* StandardTokenizer::next(Token* t) 进行分词

Token* StandardFilter::next(Token* t) 主要是规范化,去掉缩写词末尾的"."以及英文所有格"'s"这样的内容

Token* LowerCaseFilter::next(Token* t)  大小写格式化

Token* StopFilter::next(Token* token)  去掉停用词

 

还有

// Like tokenStream(), but caches the built chain in a per-analyzer
// SavedStreams so repeated calls reuse it: on a cache hit only the
// tokenizer is reset onto the new reader instead of rebuilding the chain.
TokenStream* StandardAnalyzer::reusableTokenStream(const TCHAR* fieldName, Reader* reader){
        SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
        if (streams == NULL) {
            // First use: build the chain once and remember it.
            streams = _CLNEW SavedStreams();
            setPreviousTokenStream(streams);

            BufferedReader* bufferedReader = reader->__asBufferedReader();
            if ( bufferedReader == NULL )
                streams->tokenStream = _CLNEW StandardTokenizer( _CLNEW FilteredBufferedReader(reader, false), true);
            else
                streams->tokenStream = _CLNEW StandardTokenizer(bufferedReader);

            // Same wrapping order as tokenStream(); filteredTokenStream ends
            // up pointing at the outermost StopFilter.
            streams->filteredTokenStream = _CLNEW StandardFilter(streams->tokenStream, true);
            streams->filteredTokenStream = _CLNEW LowerCaseFilter(streams->filteredTokenStream, true);
            streams->filteredTokenStream = _CLNEW StopFilter(streams->filteredTokenStream, true, stopSet);
        } else {
            // Cache hit: re-point the existing tokenizer at the new input.
            streams->tokenStream->reset(reader);
        }
        //streams->tokenStream->setMaxTokenLength(maxTokenLength);

        return streams->filteredTokenStream;
    }

 类似资料: