pylucene构建索引_PyLucene索引DEMO

许法

2023-12-01

#coding:utf-8

'''Build a Lucene index over the .txt files under the doc directory; the indexed fields are name, path and contents.'''

import os
import sys
from datetime import datetime

import lucene
from lucene import (
    Document,
    Field,
    File,
    IndexWriter,
    SimpleFSDirectory,
    StandardAnalyzer,
    Version,
)

# Script body: walk docDir, and add one Lucene document per .txt file to a
# freshly created index under indexDir. The "step N" markers refer to the
# flow summary at the end of the file.

# Step 1: start the embedded JVM before touching any Lucene class.
lucene.initVM()

print('lucene %s' % lucene.VERSION)

start = datetime.now()

indexDir = './index'
docDir = './doc'

try:
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)  # step 2
    INDEXDIR = SimpleFSDirectory(File(indexDir))  # step 3
    # Step 4: the third argument (True) is the `create` flag of the
    # Lucene 3.x constructor, i.e. any existing index is overwritten.
    indexWriter = IndexWriter(INDEXDIR, analyzer, True,
                              IndexWriter.MaxFieldLength.LIMITED)
    try:
        # Step 5: one document per .txt file found under docDir.
        for root, dirnames, filenames in os.walk(docDir):
            for filename in filenames:
                # Every file name is echoed, including the skipped ones.
                print(filename)
                if not filename.endswith('.txt'):
                    continue

                path = os.path.join(root, filename)
                path = os.path.abspath(os.path.normpath(path))

                # Read raw bytes and decode once; files are expected to be
                # UTF-8 (a decoding error aborts the run and is reported
                # by the outer handler).
                with open(path, 'rb') as c:
                    contents = c.read().decode('utf-8')

                doc = Document()
                # name and path are stored verbatim and indexed as single
                # terms; contents is analyzed for search but not stored.
                nameField = Field('name', filename,
                                  Field.Store.YES, Field.Index.NOT_ANALYZED)
                doc.add(nameField)
                pathField = Field('path', path,
                                  Field.Store.YES, Field.Index.NOT_ANALYZED)
                doc.add(pathField)
                contentsField = Field('contents', contents,
                                      Field.Store.NO, Field.Index.ANALYZED)
                doc.add(contentsField)

                indexWriter.addDocument(doc, analyzer)

        # Step 6: merge segments once all documents have been added.
        indexWriter.optimize()
    finally:
        # Always close the writer, even when indexing failed part-way,
        # so the writer's resources are not leaked.
        indexWriter.close()

    end = datetime.now()
    print('建立索引花费时间： %s' % (end - start))
except Exception as e:
    # Top-level boundary of the demo script: report the error and exit.
    print(e)

# The remaining content below is explanatory notes.

'''Field.Store.YES:存储字段值(未分词前的字段值)

Field.Store.NO:不存储,存储与索引没有关系

Field.Store.COMPRESS:压缩存储,用于长文本或二进制，但性能受损(该选项自Lucene 2.9起已废弃，并在3.0中移除)

Field.Index.ANALYZED:分词建索引

Field.Index.ANALYZED_NO_NORMS:分词建索引，但不存储norms(norms用于索引期加权和字段长度归一化，每个文档的每个索引字段占用一个byte)，这样可以节约内存

Field.Index.NOT_ANALYZED:不分词且索引

Field.Index.NOT_ANALYZED_NO_NORMS:不分词建索引，且不存储norms(省去每个文档每个索引字段一个byte的norms开销)'''

'''流程：

1 initVM()

2 StandardAnalyzer

3 SimpleFSDirectory

4 IndexWriter

5 for

doc = Document()

doc.add(Field())

indexWriter.addDocument(doc, analyzer)

6 indexWriter.optimize()

indexWriter.close()'''

pylucene构建索引_PyLucene索引DEMO

相关阅读

相关文章

相关问答

相关文档