Reposted from: http://blog.csdn.net/bingfox/archive/2010/07/19/5745363.aspx
Below, we dissect CLucene's index-building process alongside the code.
(I). The index-building process invoked from main
(1). The IndexFiles() function:
//Parameters: path of the files to index, target directory for the index, whether to rebuild the index from scratch
void IndexFiles(char* path, char* target, const bool clearIndex)
{
IndexWriter* writer = NULL;
lucene::analysis::standard::StandardAnalyzer an;
if (!clearIndex && IndexReader::indexExists(target)){
if (IndexReader::isLocked(target) ){ //the index directory gets created inside this function call
printf("Index was locked... unlocking it.\n");
IndexReader::unlock(target);
}
writer = _CLNEW IndexWriter( target, &an, false);
}else{
writer = _CLNEW IndexWriter( target ,&an, true);
}
writer->setMaxFieldLength(IndexWriter::DEFAULT_MAX_FIELD_LENGTH);
writer->setUseCompoundFile(false); //do not use the compound file format
uint64_t str = lucene::util::Misc::currentTimeMillis();
indexDocs(writer, path);
writer->optimize();
writer->close();
_CLDELETE(writer);
printf("Indexing took: %d ms./n/n", lucene::util::Misc::currentTimeMillis() - str);
}
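The main function itself is not shown in the original post. A minimal, hypothetical driver (the argument handling and the clearIndex value are assumptions made purely for illustration) could call IndexFiles() like this:

//Hypothetical driver, not part of the code quoted above.
#include <stdio.h>
int main(int argc, char* argv[])
{
    if (argc < 3) {
        printf("usage: indexer <files-dir> <index-dir>\n");
        return 1;
    }
    //true = rebuild the index from scratch (assumed default for this sketch)
    IndexFiles(argv[1], argv[2], true);
    return 0;
}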
(2). The indexDocs(writer, path) function called from IndexFiles():
void indexDocs(IndexWriter* writer, char* directory)
{
DIR* dir = opendir(directory);
if ( dir != NULL ){
struct dirent* fl;
struct fileStat buf;
char path[CL_MAX_DIR];
strcpy(path,directory);
strcat(path,PATH_DELIMITERA);
char* pathP = path + strlen(path);
fl = readdir(dir);
while ( fl != NULL ){
if ( (strcmp(fl->d_name, ".")) && (strcmp(fl->d_name, "..")) ) {
pathP[0]=0;
strcat(pathP,fl->d_name);
int32_t ret = fileStat(path,&buf);
if ( buf.st_mode & S_IFDIR ) {
indexDocs(writer, path );
}else{
//process each document under the directory
Document* doc = FileDocument( path );
writer->addDocument(doc);
_CLDELETE(doc);
}
}
fl = readdir(dir);
}
closedir(dir);
}else{
printf( "adding: %s\n", directory);
Document* doc = FileDocument( directory );
writer->addDocument( doc );
_CLDELETE(doc);
}
}
(3). The Document* FileDocument(const char* f) function:
//first add fields to the document, then add the document to the IndexWriter
Document* FileDocument(const char* f)
{
Document* doc = _CLNEW Document();
TCHAR tf[CL_MAX_DIR];
STRCPY_AtoT(tf,f,CL_MAX_DIR);
doc->add( *_CLNEW Field(_T("path"), tf, Field::STORE_YES | Field::INDEX_UNTOKENIZED ) );
FILE* fh = fopen(f,"r");
if ( fh != NULL ){
StringBuffer str;
int fn = fileno(fh);
struct stat filestat;
fstat(fn, &filestat);
str.reserve(filestat.st_size);
char abuf[1024];
TCHAR tbuf[1024];
size_t r;
//read up to 1023 bytes per iteration
do{
r = fread(abuf,1,1023,fh);
abuf[r]=0;
STRCPY_AtoT(tbuf,abuf,r);
tbuf[r]=0;
str.append(tbuf);
}while(r>0);
fclose(fh);
doc->add( *_CLNEW Field(_T("contents"),str.getBuffer(), Field::STORE_YES | Field::INDEX_TOKENIZED|Field::TERMVECTOR_WITH_OFFSETS) );
}
return doc;
}
(II). Into the details of index building
(1). The Document and Field classes
void Document::add(Field& field)
{
//fieldList starts out NULL; each newly added field becomes the new head node, and its next points to the old head
//new nodes are always prepended to the head of the list
fieldList = _CLNEW DocumentFieldEnumeration::DocumentFieldList(&field, fieldList);
}
The DocumentFieldEnumeration class (iterator over a document's fields)
//iterator over a document's fields
class DocumentFieldEnumeration :LUCENE_BASE{
class DocumentFieldList :LUCENE_BASE{
public:
DocumentFieldList(Field* f, DocumentFieldList* n); //constructor
~DocumentFieldList();
Field* field;
DocumentFieldList* next; //effectively points to the previously added node, because new nodes are prepended
};
friend class Document;
private:
const DocumentFieldList* fields;
public:
DocumentFieldEnumeration(const DocumentFieldList* fl);
~DocumentFieldEnumeration();
bool hasMoreElements() const;
Field* nextElement();
};
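Because Document::add always prepends, enumerating a document's fields returns them in reverse insertion order. A small sketch (the field names and values are invented for illustration) makes this visible:

//Sketch only: shows that fields come back newest-first when enumerated.
Document* doc = _CLNEW Document();
doc->add( *_CLNEW Field(_T("path"), _T("/tmp/a.txt"), Field::STORE_YES | Field::INDEX_UNTOKENIZED) );
doc->add( *_CLNEW Field(_T("contents"), _T("hello world"), Field::STORE_YES | Field::INDEX_TOKENIZED) );
DocumentFieldEnumeration* fields = doc->fields();
while (fields->hasMoreElements()) {
    Field* f = fields->nextElement(); //first iteration yields "contents", the second yields "path"
}
_CLDELETE(fields);
_CLDELETE(doc);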
The Document::removeFields() method:
//remove all fields with the given name from the list
void Document::removeFields(const TCHAR* name)
{
CND_PRECONDITION(name != NULL, "name is NULL");
DocumentFieldEnumeration::DocumentFieldList* previous = NULL;
DocumentFieldEnumeration::DocumentFieldList* current = fieldList;
while (current != NULL) {
if ( _tcscmp(current->field->name(),name) == 0 ){
if (previous){
previous->next = current->next; //unlink the current node
}else
fieldList = current->next;
current->next=NULL;
_CLDELETE(current);
if ( previous )
current = previous->next; //reposition the current pointer
else
current = fieldList;
}else{
previous = current;
current = current->next;
}
}
}
(2). The IndexWriter::addDocument() method
void IndexWriter::addDocument(Document* doc, Analyzer* analyzer)
{
CND_PRECONDITION(ramDirectory != NULL,"ramDirectory is NULL");
if ( analyzer == NULL )
{
analyzer = this->analyzer;
}
ramDirectory->transStart();
try {
//each added document gets a fresh segment name
char* segmentName = newSegmentName();
CND_CONDITION(segmentName != NULL, "segmentName is NULL");
try {
// ramDirectory: an in-memory directory with transaction support
DocumentWriter* dw = _CLNEW DocumentWriter(ramDirectory, analyzer, this );
CND_CONDITION(dw != NULL, "dw is NULL");
try {
dw->addDocument(segmentName, doc);
} _CLFINALLY(
_CLDELETE(dw);
);
//adding one document during indexing produces one new segment (SegmentInfo)
SegmentInfo* si = _CLNEW SegmentInfo(segmentName, 1, ramDirectory);
CND_CONDITION(si != NULL, "Si is NULL");
{
SCOPED_LOCK_MUTEX(THIS_LOCK)
segmentInfos->add(si);
//merge segments if needed
maybeMergeSegments();
}
} _CLFINALLY(
_CLDELETE_CaARRAY(segmentName);
);
} catch (...) {
ramDirectory->transAbort();
throw;
}
ramDirectory->transCommit();
}
(3). The DocumentWriter::addDocument() method called above
//add the document to the new segment
void DocumentWriter::addDocument(const char* segment, Document* doc)
{
CND_PRECONDITION(fieldInfos==NULL, "fieldInfos!=NULL")
// write field names
fieldInfos = _CLNEW FieldInfos();
fieldInfos->add(doc);
//.fnm: the file that stores the field names
const char* buf = Misc::segmentname(segment, ".fnm");
fieldInfos->write(directory, buf);
_CLDELETE_CaARRAY(buf);
// write field values
FieldsWriter fieldsWriter(directory, segment, fieldInfos);
try {
fieldsWriter.addDocument(doc);
} _CLFINALLY(fieldsWriter.close());
//invert doc into postingTable
clearPostingTable(); // clear postingTable
//number of fields in the document
size_t size = fieldInfos->size();
fieldLengths = _CL_NEWARRAY(int32_t,size); // init fieldLengths
fieldPositions = _CL_NEWARRAY(int32_t,size); // init fieldPositions
fieldOffsets = _CL_NEWARRAY(int32_t,size); // init fieldOffsets
memset(fieldPositions, 0, sizeof(int32_t) * size);
//initialise fieldBoost array with default boost
int32_t fbl = fieldInfos->size();
float_t fbd = doc->getBoost(); //defaults to 1.0f
fieldBoosts = _CL_NEWARRAY(float_t,fbl); // init fieldBoosts
{
for ( int32_t i=0;i<fbl;i++ )
fieldBoosts[i] = fbd;
}
{
for ( int32_t i=0;i<fieldInfos->size();i++ )
fieldLengths[i] = 0;
}
//invert the document into postings
invertDocument(doc);
// sort postingTable into an array
Posting** postings = NULL;
int32_t postingsLength = 0;
//sort the terms in postingTable and return a sorted Posting[] array
sortPostingTable(postings,postingsLength);
//write postings
//write the sorted Posting[] array into the segment's index files (the .frq and .prx files)
writePostings(postings,postingsLength, segment);
//write norms of indexed fields
//write the norm information of the indexed fields
writeNorms(segment);
_CLDELETE_ARRAY( postings );
}
(4). The FieldInfos::write() method
//write the field information
void FieldInfos::write(IndexOutput* output) const
{
//first write the number of fields
output->writeVInt(size());
FieldInfo* fi;
uint8_t bits;
for (int32_t i = 0; i < size(); ++i) {
fi = fieldInfo(i);
bits = 0x0;
if (fi->isIndexed) bits |= IS_INDEXED; //bitwise OR: a result bit is 1 if either (or both) of the operand bits is 1, otherwise 0
if (fi->storeTermVector) bits |= STORE_TERMVECTOR;
if (fi->storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR;
if (fi->storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR;
if (fi->omitNorms) bits |= OMIT_NORMS;
output->writeString(fi->name,_tcslen(fi->name)); //write the field name together with its length
output->writeByte(bits); //write the flag byte
}
}
(5). The FieldsWriter::addDocument() method
//write the field values
void FieldsWriter::addDocument(Document* doc)
{
CND_PRECONDITION(indexStream != NULL,"indexStream is NULL");
CND_PRECONDITION(fieldsStream != NULL,"fieldsStream is NULL");
printf("%s=%d","fieldsStream->getFilePointer()",fieldsStream->getFilePointer());
//索引流写入字段流的位置指针
indexStream->writeLong(fieldsStream->getFilePointer());
int32_t storedCount = 0;
DocumentFieldEnumeration* fields = doc->fields();
while (fields->hasMoreElements()) {
Field* field = fields->nextElement();
if (field->isStored())
{
storedCount++;
}
}
_CLDELETE(fields);
//write the number of stored fields into the fields stream
fieldsStream->writeVInt(storedCount);
fields = doc->fields();
while (fields->hasMoreElements())
{
Field* field = fields->nextElement();
if (field->isStored())
{
//write the field number
fieldsStream->writeVInt(fieldInfos->fieldNumber(field->name()));
uint8_t bits = 0;
if (field->isTokenized())
bits |= FieldsWriter::FIELD_IS_TOKENIZED;
if (field->isBinary())
bits |= FieldsWriter::FIELD_IS_BINARY;
if (field->isCompressed())
bits |= FieldsWriter::FIELD_IS_COMPRESSED;
//write a flag byte: tokenized, binary, compressed
fieldsStream->writeByte(bits);
if ( field->isCompressed() ){
_CLTHROWA(CL_ERR_Runtime, "CLucene does not directly support compressed fields. Write a compressed byte array instead");
}else{
if (field->isBinary()) {
jstreams::StreamBase<char>* stream = field->streamValue();
const char* sd;
//read the stream contents
int32_t rl = stream->read(sd,10000000,0);
if ( rl < 0 ){
fieldsStream->writeVInt(0); }else{
fieldsStream->writeVInt(rl);
fieldsStream->writeBytes((uint8_t*)sd, rl);
}
}else if ( field->stringValue() == NULL ){ CND_PRECONDITION(!field->isIndexed(), "Cannot store reader if it is indexed too")
Reader* r = field->readerValue();
const TCHAR* rv;
int64_t rl = r->read(rv, LUCENE_INT32_MAX_SHOULDBE);
if ( rl > LUCENE_INT32_MAX_SHOULDBE )
_CLTHROWA(CL_ERR_Runtime,"Field length too long");
else if ( rl < 0 )
rl = 0;
fieldsStream->writeString( rv, (int32_t)rl);
}else if ( field->stringValue() != NULL ){
//write the string value
fieldsStream->writeString(field->stringValue(),_tcslen(field->stringValue()));
}else
_CLTHROWA(CL_ERR_Runtime, "No values are set for the field");
}
}
}
_CLDELETE(fields);
}
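To summarize what FieldsWriter produces per document: one long pointer goes into the field index stream, and the stored field values themselves go into the fields data stream (in Lucene's file-format terms these are the .fdx and .fdt files; the extensions are not spelled out in the code above, so treat that naming as an assumption). A schematic sketch of the resulting layout:

//Schematic only, not CLucene code: layout written by FieldsWriter::addDocument().
//index stream (one fixed-size entry per document):
//  doc N -> long pointer to that document's entry in the fields stream
//fields stream (one variable-size entry per document):
//  VInt storedFieldCount
//  for each stored field: VInt fieldNumber, byte flags (tokenized/binary/compressed),
//  then the value itself (string, binary bytes, or reader contents, as in the branches above)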
(6). The DocumentWriter::invertDocument() method
//perform the inversion
void DocumentWriter::invertDocument(const Document* doc)
{
DocumentFieldEnumeration* fields = doc->fields();
try {
while (fields->hasMoreElements())
{
Field* field = (Field*)fields->nextElement();
const TCHAR* fieldName = field->name();
const int32_t fieldNumber = fieldInfos->fieldNumber(fieldName);
//both are 0 initially
int32_t length = fieldLengths[fieldNumber]; // length of field, looked up by the field's number
int32_t position = fieldPositions[fieldNumber]; // position within the field, looked up by the field's number
if (length>0)
{
position+=analyzer->getPositionIncrementGap(fieldName);
}
int32_t offset = fieldOffsets[fieldNumber]; // current offset within the field, looked up by the field's number
if (field->isIndexed())
{ // if the field is indexed
if (!field->isTokenized())
{ // if the field is not tokenized
const TCHAR* charBuf = NULL;
int64_t dataLen = 0;
if (field->stringValue() == NULL && !field->isStored() )
{
CL_NS(util)::Reader* r = field->readerValue();
dataLen = r->read(charBuf, LUCENE_INT32_MAX_SHOULDBE);
if (dataLen == -1)
dataLen = 0;
} else {
charBuf = field->stringValue();
dataLen = _tcslen(charBuf);
}
// store the entire field value as a single term in postingTable
if(field->isStoreOffsetWithTermVector()){
TermVectorOffsetInfo tio;
tio.setStartOffset(offset);
tio.setEndOffset(offset + dataLen);
addPosition(fieldName, charBuf, position++, &tio );
}
else
{
addPosition(fieldName, charBuf, position++, NULL);
}
offset += dataLen; //advance the offset by the data length
length++;
} else { // field must be tokenized
CL_NS(util)::Reader* reader; // find or make Reader
bool delReader = false;
if (field->readerValue() != NULL) { // if the field supplies a Reader, use it directly
reader = field->readerValue();
} else if (field->stringValue() != NULL) { // otherwise build a Reader over the field's string value
reader = _CLNEW CL_NS(util)::StringReader(field->stringValue(),_tcslen(field->stringValue()),false);
delReader = true;
} else {
_CLTHROWA(CL_ERR_IO,"field must have either String or Reader value");
}
try {
// Tokenize the field and add the resulting terms to postingTable.
CL_NS(analysis)::TokenStream* stream = analyzer->tokenStream(fieldName, reader);
try
{
CL_NS(analysis)::Token t;
int32_t lastTokenEndOffset = -1; //end offset of the previous token
while (stream->next(&t))
{
position += (t.getPositionIncrement() - 1); //advance position by the token's position increment (the remaining 1 is added below)
// if offsets are stored with the field's term vector, record this term's offsets
if(field->isStoreOffsetWithTermVector()){
TermVectorOffsetInfo tio;
tio.setStartOffset(offset + t.startOffset());
tio.setEndOffset(offset + t.endOffset());
addPosition(fieldName, t.termText(), position++, &tio);
}
else
{
addPosition(fieldName, t.termText(), position++, NULL);
}
lastTokenEndOffset = t.endOffset();
length++;
// Apply the field truncation policy.
// length is the number of tokens produced from this field so far
if (maxFieldLength != IndexWriter::FIELD_TRUNC_POLICY__WARN) {
if ( length > maxFieldLength) { // stop once the token count exceeds the field's maximum length
break;
}
} else if (length > IndexWriter::DEFAULT_MAX_FIELD_LENGTH) {
const TCHAR* errMsgBase =
_T("Indexing a huge number of tokens from a single")
_T(" field (/"%s/", in this case) can cause CLucene")
_T(" to use memory excessively.")
_T(" By default, CLucene will accept only %s tokens")
_T(" tokens from a single field before forcing the")
_T(" client programmer to specify a threshold at")
_T(" which to truncate the token stream.")
_T(" You should set this threshold via")
_T(" IndexReader::maxFieldLength (set to LUCENE_INT32_MAX")
_T(" to disable truncation, or a value to specify maximum number of fields).");
TCHAR defaultMaxAsChar[34];
_i64tot(IndexWriter::DEFAULT_MAX_FIELD_LENGTH,defaultMaxAsChar, 10);
int32_t errMsgLen = _tcslen(errMsgBase)+ _tcslen(fieldName)+ _tcslen(defaultMaxAsChar);
TCHAR* errMsg = _CL_NEWARRAY(TCHAR,errMsgLen+1);
_sntprintf(errMsg, errMsgLen,errMsgBase, fieldName, defaultMaxAsChar);
_CLTHROWT_DEL(CL_ERR_Runtime,errMsg);
}
} // while token->next
if(lastTokenEndOffset != -1 )
{
offset += lastTokenEndOffset + 1;
}
} _CLFINALLY (
stream->close();
_CLDELETE(stream);
);
} _CLFINALLY (
if (delReader) {
_CLDELETE(reader);
}
);
} // if/else field is to be tokenized
// save the position, offset, and length information for this field
fieldLengths[fieldNumber] = length; // save field length
fieldPositions[fieldNumber] = position; // save field position
fieldBoosts[fieldNumber] *= field->getBoost();
fieldOffsets[fieldNumber] = offset; //effectively the end offset of this field
} // if field is to be indexed
} // while more fields available
} _CLFINALLY (
_CLDELETE(fields);
);
}
(7). The DocumentWriter::addPosition() method
void DocumentWriter::addPosition(const TCHAR* field,const TCHAR* text,const int32_t position,TermVectorOffsetInfo* offset)
{
//set the term buffer
//typedef CL_NS(util)::CLHashtable<Term*,Posting*,Term::Compare, Term::Equals> PostingTableType;
termBuffer->set(field,text,false);
Posting* ti = postingTable.get(termBuffer);
if (ti != NULL)
{
int32_t freq = ti->freq;
if (ti->positions.length == freq) {
// positions array is full, realloc its size
// grow the array: when a term is first added, positions.values[0] = position and positions.length = 1
ti->positions.length = freq*2;
ti->positions.values = (int32_t*)realloc(ti->positions.values, ti->positions.length * sizeof(int32_t));
}
ti->positions.values[freq] = position; // add new position
if (offset != NULL)
{
if (ti->offsets.length == freq)
{
//offsets are grown in the same way as positions
ti->offsets.length = freq*2;
ti->offsets.values = (TermVectorOffsetInfo*)realloc(ti->offsets.values, ti->offsets.length * sizeof(TermVectorOffsetInfo));
}
ti->offsets[freq] = *offset;
}
ti->freq = freq + 1; // update the term frequency
} else { // word not seen before
Term* term = _CLNEW Term( field, text, false);
postingTable.put(term, _CLNEW Posting(term, position, offset));
}
}
(8). The DocumentWriter::Posting::Posting() constructor
/* Posting constructor */
DocumentWriter::Posting::Posting(Term* t, const int32_t position, TermVectorOffsetInfo* offset)
{
//initialize the posting for a newly seen term
freq = 1; //frequency starts at 1
term = _CL_POINTER(t);
positions.values = (int32_t*)malloc(sizeof(int32_t)); //array of stored positions
positions.values[0] = position;
positions.length = 1; //the array capacity is also 1
if ( offset != NULL )
{
this->offsets.values = (TermVectorOffsetInfo*)malloc(sizeof(TermVectorOffsetInfo));
this->offsets.values[0] = *offset; //store the first offset entry
this->offsets.length = 1; //the array capacity is also 1
}
}
(9). The DocumentWriter::writePostings() method
void DocumentWriter::writePostings(Posting** postings, const int32_t postingsLength, const char* segment)
{
#define __DOCLOSE(obj) if(obj!=NULL){ try{ obj->close(); _CLDELETE(obj);} catch(CLuceneError &e){ierr=e.number();err=e.what();} catch(...){err="Unknown error while closing posting tables";} }
IndexOutput* freq = NULL;
IndexOutput* prox = NULL;
TermInfosWriter* tis = NULL;
TermVectorsWriter* termVectorWriter = NULL;
try {
//open files for inverse index storage
//.frq: the frequency file
const char* buf = Misc::segmentname( segment, ".frq");
freq = directory->createOutput( buf );
_CLDELETE_CaARRAY( buf );
//.prx: the position file
buf = Misc::segmentname( segment, ".prx");
prox = directory->createOutput( buf );
_CLDELETE_CaARRAY( buf );
//TermInfosWriter constructor; termIndexInterval is the term index (grouping) interval
tis = _CLNEW TermInfosWriter(directory, segment, fieldInfos,termIndexInterval);
TermInfo* ti = _CLNEW TermInfo();
const TCHAR* currentField = NULL;
for (int32_t i = 0; i < postingsLength; i++) {
Posting* posting = postings[i];
// write the term dictionary (.tis) and the term index (.tii)
ti->set(1, freq->getFilePointer(), prox->getFilePointer(), -1);
tis->add(posting->term, ti);
int32_t postingFreq = posting->freq;
if (postingFreq == 1) // optimize freq=1
freq->writeVInt(1); // set low bit of doc num.
else {
freq->writeVInt(0); // the document number
freq->writeVInt(postingFreq); // frequency in doc
}
int32_t lastPosition = 0; // write positions
//write the position information using delta encoding
for (int32_t j = 0; j < postingFreq; ++j) { // use delta-encoding
prox->writeVInt(posting->positions.values[j] - lastPosition);
lastPosition = posting->positions.values[j];
}
// check to see if we switched to a new field
const TCHAR* termField = posting->term->field();
//handle the terms that belong to each field: detect when we move to a different field
if (currentField==NULL||_tcscmp(currentField,termField)!= 0) { //todo, can we do an intern'd check?
// changing field - see if there is something to save
currentField = termField;
FieldInfo* fi = fieldInfos->fieldInfo(currentField);
//posting information for fields that store term vectors is written by the TermVectorsWriter class
if (fi->storeTermVector)
{
if (termVectorWriter == NULL) {
//TermVectorsWriter constructor
termVectorWriter =_CLNEW TermVectorsWriter(directory, segment, fieldInfos);
termVectorWriter->openDocument();
}
termVectorWriter->openField(currentField);
} else if (termVectorWriter != NULL) {
termVectorWriter->closeField();
}
}
if (termVectorWriter != NULL && termVectorWriter->isFieldOpen())
{
termVectorWriter->addTerm(posting->term->text(), postingFreq, &posting->positions, &posting->offsets);
}
}
if (termVectorWriter != NULL)
{
termVectorWriter->closeDocument();
}
_CLDELETE(ti);
}_CLFINALLY (
const char* err=NULL;
int32_t ierr=0;
__DOCLOSE(freq);
__DOCLOSE(prox);
__DOCLOSE(tis);
__DOCLOSE(termVectorWriter);
if ( err != NULL )
_CLTHROWA(ierr,err);
);
}
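To make the delta encoding concrete: if a term occurs at positions 3, 7 and 12 within a field, the .prx file receives the VInts 3, 4 and 5. A reader reverses this with a running sum; the helper below only illustrates that inverse arithmetic and is not code from CLucene:

//Sketch of the reading side of the delta encoding (illustration only).
//Given the deltas {3, 4, 5} read back from .prx, it reconstructs the positions {3, 7, 12}.
void readPositionDeltas(const int32_t* deltas, int32_t freq, int32_t* positions)
{
    int32_t lastPosition = 0;
    for (int32_t j = 0; j < freq; ++j) {
        lastPosition += deltas[j]; //the running sum undoes the delta encoding
        positions[j] = lastPosition;
    }
}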
(10). The TermInfosWriter::TermInfosWriter() constructor
TermInfosWriter::TermInfosWriter(Directory* directory, const char* segment, FieldInfos* fis, int32_t interval):
fieldInfos(fis)
{
CND_PRECONDITION(segment != NULL, "segment is NULL");
initialise(directory,segment,interval, false);
//this 'other' writer fills in the .tii (term index) file
other = _CLNEW TermInfosWriter(directory, segment,fieldInfos, interval, true);
CND_CONDITION(other != NULL, "other is NULL");
other->other = this;
}
(11). The TermInfosWriter::initialise() method
void TermInfosWriter::initialise(Directory* directory, const char* segment, int32_t interval, bool IsIndex)
{
//The term dictionary consists of term entries: .tis is the term infos file, and .tii is the term index (skip table) file.
//Terms written to .tis are counted; each time the count reaches a multiple of the grouping interval (e.g. at 128, 256, ...), that term's info is also saved into .tii.
lastTerm = _CLNEW Term;
CND_CONDITION(lastTerm != NULL, "Could not allocate memory for lastTerm");
lastTi = _CLNEW TermInfo();
CND_CONDITION(lastTi != NULL, "Could not allocate memory for lastTi");
lastIndexPointer = 0;
size = 0;
isIndex = IsIndex;
indexInterval = interval;
skipInterval = LUCENE_DEFAULT_TERMDOCS_SKIP_INTERVAL;
//other: has isIndex=true, while this writer itself has isIndex=false
const char* buf = Misc::segmentname(segment, (isIndex ? ".tii" : ".tis"));
output = directory->createOutput(buf);
_CLDELETE_CaARRAY(buf);
output->writeInt(FORMAT); // write format
output->writeLong(0); // leave space for size
output->writeInt(indexInterval);// write indexInterval
output->writeInt(skipInterval); // write skipInterval
//Set other to NULL by Default
other = NULL;
}
(12). The TermInfosWriter::add() method
void TermInfosWriter::add(Term* term, const TermInfo* ti)
{
CND_PRECONDITION(isIndex || (!isIndex && term->compareTo(lastTerm) > 0),"term out of order");
CND_PRECONDITION(ti->freqPointer >= lastTi->freqPointer,"freqPointer out of order");
CND_PRECONDITION(ti->proxPointer >= lastTi->proxPointer,"proxPointer out of order");
if (!isIndex && size % indexInterval == 0){
//this writer (isIndex=false) has reached the index interval, so the previous term is copied into the .tii writer
other->add(lastTerm, lastTi);
}
//write the term
writeTerm(term);
// write the document frequency
output->writeVInt(ti->docFreq);
//write pointers: the deltas of the freq and prox file pointers (both are 0 initially)
output->writeVLong(ti->freqPointer - lastTi->freqPointer);
output->writeVLong(ti->proxPointer - lastTi->proxPointer);
if (ti->docFreq >= skipInterval)
{
output->writeVInt(ti->skipOffset);
}
//for the index (.tii) writer, also record the pointer into the other writer's (.tis) output
if (isIndex)
{
output->writeVLong(other->output->getFilePointer() - lastIndexPointer);
lastIndexPointer = other->output->getFilePointer(); // write pointer
}
lastTi->set(ti); //remember this TermInfo as the previous one
size++;
}
(13). The TermInfosWriter::writeTerm() method
void TermInfosWriter::writeTerm(Term* term)
{
//terms arrive in sorted order, so they are written with prefix (delta) compression
int32_t start = Misc::stringDifference(lastTerm->text(),lastTerm->textLength(), term->text(),term->textLength());
int32_t length = term->textLength() - start;
output->writeVInt(start); // write the length of the shared prefix
output->writeVInt(length); // write the length of the differing suffix
output->writeChars(term->text(), start, length); //write the differing suffix characters
int32_t fieldnum = fieldInfos->fieldNumber(term->field()); //number of the field this term belongs to
CND_PRECONDITION(fieldnum>=-1&&fieldnum<fieldInfos->size(),"Fieldnum is out of range");
output->writeVInt(fieldnum); // write field num
if ( lastTerm->__cl_refcount == 1 ){
lastTerm->set(term,term->text());
}else{
_CLDECDELETE(lastTerm);
lastTerm = _CL_POINTER(term);
}
}
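A concrete pair of terms makes the prefix compression easier to see: writing "apply" right after "apple" stores prefix length 4, suffix length 1, and the single character 'y'. The helper below is an illustrative re-implementation of the prefix computation (the real one is CL_NS(util)::Misc::stringDifference), not CLucene code:

//Illustration only: length of the common prefix of the previous and current term text.
int32_t commonPrefixLength(const TCHAR* prev, int32_t prevLen, const TCHAR* curr, int32_t currLen)
{
    int32_t limit = prevLen < currLen ? prevLen : currLen;
    int32_t i = 0;
    while (i < limit && prev[i] == curr[i])
        ++i;
    return i; //"apple" vs "apply" -> 4, so only "y" needs to be written as the suffix
}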
(14). The TermVectorsWriter::TermVectorsWriter() constructor
TermVectorsWriter::TermVectorsWriter(CL_NS(store)::Directory* directory,
const char* segment,FieldInfos* fieldInfos)
{
//.tvx: stores pointers to each document's data in the .tvd file
//.tvd: stores, per document, pointers to the field data in the .tvf file
//.tvf: stores the terms of each field together with their frequencies, positions and offsets
char fbuf[CL_MAX_NAME];
strcpy(fbuf,segment);
char* fpbuf=fbuf+strlen(fbuf);
strcpy(fpbuf,LUCENE_TVX_EXTENSION);
tvx = directory->createOutput(fbuf);
tvx->writeInt(FORMAT_VERSION);
strcpy(fpbuf,LUCENE_TVD_EXTENSION);
tvd = directory->createOutput(fbuf);
tvd->writeInt(FORMAT_VERSION);
strcpy(fpbuf,LUCENE_TVF_EXTENSION);
tvf = directory->createOutput(fbuf);
tvf->writeInt(FORMAT_VERSION);
this->fieldInfos = fieldInfos;
currentField = NULL; //used to tell whether a field is currently open
currentDocPointer = -1; //used to tell whether a document is currently open
}
(15). The TermVectorsWriter::writeField() method
void TermVectorsWriter::writeField()
{
//.tvx: stores pointers to each document's data in the .tvd file
//.tvd: stores, per document, pointers to the field data in the .tvf file
//.tvf: stores the terms of each field together with their frequencies, positions and offsets
currentField->tvfPointer = tvf->getFilePointer();
//System.out.println("Field Pointer: " + currentField.tvfPointer);
//write the number of terms
int32_t size = terms.size();
tvf->writeVInt(size);
//whether positions / offsets are stored with the term vector
bool storePositions = currentField->storePositions;
bool storeOffsets = currentField->storeOffsets;
uint8_t bits = 0x0;
if (storePositions)
bits |= STORE_POSITIONS_WITH_TERMVECTOR;
if (storeOffsets)
bits |= STORE_OFFSET_WITH_TERMVECTOR;
tvf->writeByte(bits);
const TCHAR* lastTermText = LUCENE_BLANK_STRING; //the empty string ""
int32_t lastTermTextLen = 0;
for (int32_t i = 0; i < size; ++i)
{
TVTerm* term = terms[i];
int32_t start = CL_NS(util)::Misc::stringDifference(lastTermText, lastTermTextLen, term->getTermText(),term->getTermTextLen());
int32_t length = term->getTermTextLen() - start;
tvf->writeVInt(start); // write the length of the shared prefix
tvf->writeVInt(length); // write the length of the differing suffix
tvf->writeChars(term->getTermText(), start, length); //write the differing suffix characters
tvf->writeVInt(term->freq); //write the term frequency
lastTermText = term->getTermText();
lastTermTextLen = term->getTermTextLen();
//The difference between positions and offsets: positions count how many terms apart two terms are,
//while offsets count how many characters apart they are.
if(storePositions){
//store the position information in the term vector
if(term->positions == NULL)
{
_CLTHROWA(CL_ERR_IllegalState, "Trying to write positions that are NULL!");
}
// use delta encoding for positions
int32_t position = 0;
for (int32_t j = 0; j < term->freq; ++j){
tvf->writeVInt((*term->positions)[j] - position); //store only the position delta
position = (*term->positions)[j];
}
}
if(storeOffsets){
//store the offset information in the term vector
if(term->offsets == NULL)
{
_CLTHROWA(CL_ERR_IllegalState, "Trying to write offsets that are NULL!");
}
int32_t position = 0;
for (int32_t j = 0; j < term->freq; ++j) {
tvf->writeVInt((*term->offsets)[j].getStartOffset() - position);
tvf->writeVInt((*term->offsets)[j].getEndOffset() - (*term->offsets)[j].getStartOffset()); //Save the diff between the two.
position = (*term->offsets)[j].getEndOffset();
}
}
}
}
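A worked example, computed by hand for illustration: in the text "to be or not to be" the term "be" has token positions {1, 5} and character offsets {(3,5), (16,18)}. With both storePositions and storeOffsets enabled, writeField() would emit the following VInts for this term, delta-encoded:

//Illustration only: the values writeField() would produce for the term "be" above.
tvf->writeVInt(1);  //first position: 1 - 0
tvf->writeVInt(4);  //second position: 5 - 1
tvf->writeVInt(3);  //first start offset: 3 - 0
tvf->writeVInt(2);  //first length: 5 - 3
tvf->writeVInt(11); //second start offset: 16 - 5 (delta from the previous end offset)
tvf->writeVInt(2);  //second length: 18 - 16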
This article comes from a CSDN blog; please credit the source when reposting: http://blog.csdn.net/bingfox/archive/2010/07/19/5745363.aspx