当前位置: 首页 > 工具软件 > PyLucene > 使用案例 >

pylucene构建索引_PyLucene索引DEMO

许法
2023-12-01

# coding: utf-8
'''Build a Lucene index over every ``.txt`` file found under ``./doc``.

Each indexed document carries three fields:

* ``name``     - the file name, stored and indexed as a single token;
* ``path``     - the absolute file path, stored and indexed as a single token;
* ``contents`` - the file text decoded as UTF-8, analyzed but not stored.

The index is written to ``./index``; an existing index there is overwritten.
'''

from datetime import datetime
import io
import os
import sys

import lucene
from lucene import (
    SimpleFSDirectory, Document, File, Field,
    StandardAnalyzer, IndexWriter, Version,
)

# Step 1: the JVM must be started before any Lucene class is used.
lucene.initVM()

print('lucene %s' % lucene.VERSION)

start = datetime.now()

indexDir = './index'
docDir = './doc'

try:
    # Step 2: the analyzer that tokenizes the ``contents`` field.
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Step 3: the on-disk directory that holds the index files.
    INDEXDIR = SimpleFSDirectory(File(indexDir))

    # Step 4: ``True`` asks for a brand-new index, replacing any existing one.
    indexWriter = IndexWriter(INDEXDIR, analyzer, True,
                              IndexWriter.MaxFieldLength.LIMITED)
    try:
        # Step 5: one Lucene document per ``.txt`` file.
        for root, dirnames, filenames in os.walk(docDir):
            for filename in filenames:
                # Every file name is echoed, including the ones skipped below.
                print(filename)
                if not filename.endswith('.txt'):
                    continue

                # abspath() already normalizes the path, so a separate
                # normpath() call is not needed.
                path = os.path.abspath(os.path.join(root, filename))

                # io.open decodes to unicode text on Python 2 and 3 alike.
                with io.open(path, 'r', encoding='utf-8') as c:
                    contents = c.read()

                doc = Document()
                doc.add(Field('name', filename,
                              Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field('path', path,
                              Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field('contents', contents,
                              Field.Store.NO, Field.Index.ANALYZED))
                indexWriter.addDocument(doc, analyzer)

        # Step 6: merge index segments once everything has been added.
        indexWriter.optimize()
    finally:
        # Close on every path so a failure part-way through does not leak
        # the writer (and the index write lock it holds).
        indexWriter.close()

    end = datetime.now()
    print('建立索引花费时间: %s' % (end - start))
except Exception as e:
    # Top-level boundary of the script: report the failure instead of
    # dumping a traceback.
    print(e)

# The notes below explain the Field options and the overall flow.

'''Field.Store.YES:存储字段值(未分词前的字段值)

Field.Store.NO:不存储,存储与索引没有关系

Field.Store.COMPRESS:压缩存储,用于长文本或二进制,但性能受损

Field.Index.ANALYZED:分词建索引

Field.Index.ANALYZED_NO_NORMS:分词建索引,但不存储norms(norms记录索引期的boost和字段长度归一化信息,每个文档的每个字段占一个byte),这样可以节约内存,代价是索引期boost和长度归一化不再生效

Field.Index.NOT_ANALYZED:不分词且索引

Field.Index.NOT_ANALYZED_NO_NORMS:不分词建索引,且不存储norms(省去每个文档每个字段一个byte的norms开销)'''

'''流程:

1 initVM()

2 StandardAnalyzer

3 SimpleFSDirectory

4 IndexWriter

5 for

doc = Document()

doc.add(Field())

indexWriter.addDocument(doc, analyzer)

6 indexWriter.optimize()

indexWriter.close()'''

 类似资料: