#coding:utf-8
'''Build a Lucene index over the .txt files found under the doc directory; the index fields are name, path and contents.'''
from __future__ import print_function

import os
import sys
from datetime import datetime

import lucene
from lucene import SimpleFSDirectory, Document, File, Field, \
    StandardAnalyzer, IndexWriter, Version

# Script flow: start the JVM, open an IndexWriter on ./index, add one
# Document (name, path, contents) per .txt file found under ./doc, then
# optimize and close the writer and report the elapsed time.

# PyLucene needs the JVM initialised before any Lucene class is used.
lucene.initVM()
print('lucene', lucene.VERSION)

start = datetime.now()
indexDir = './index'  # directory the index is written to
docDir = './doc'      # directory tree scanned for .txt files

try:
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    INDEXDIR = SimpleFSDirectory(File(indexDir))
    # The boolean third argument is the Lucene 3.x "create" flag: True
    # builds a fresh index instead of appending to an existing one.
    indexWriter = IndexWriter(INDEXDIR, analyzer, True,
                              IndexWriter.MaxFieldLength.LIMITED)
    try:
        for root, dirnames, filenames in os.walk(docDir):
            for filename in filenames:
                print(filename)
                if not filename.endswith('.txt'):
                    continue
                path = os.path.join(root, filename)
                path = os.path.abspath(os.path.normpath(path))
                # Read raw bytes and decode explicitly so the result does
                # not depend on the platform's text-mode handling.
                with open(path, 'rb') as c:
                    contents = c.read().decode('utf-8')

                doc = Document()
                # name and path are stored and indexed as single,
                # un-tokenized terms.
                nameField = Field('name', filename,
                                  Field.Store.YES, Field.Index.NOT_ANALYZED)
                doc.add(nameField)
                pathField = Field('path', path,
                                  Field.Store.YES, Field.Index.NOT_ANALYZED)
                doc.add(pathField)
                # contents is tokenized and indexed, but the text itself
                # is not stored in the index.
                contentsField = Field('contents', contents,
                                      Field.Store.NO, Field.Index.ANALYZED)
                doc.add(contentsField)
                indexWriter.addDocument(doc, analyzer)
        indexWriter.optimize()
    finally:
        # Always close the writer, even when indexing fails part-way, so
        # the index directory is not left with an open writer/lock.
        indexWriter.close()
    end = datetime.now()
    print('建立索引花费时间:', (end - start))
except Exception as e:
    # Best-effort script: report the failure instead of raising.
    print(e)

# The rest of this file is explanatory notes.
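'''Field.Store.YES: store the field value (the original value, before analysis)
Field.Store.NO: do not store the value; storing is independent of indexing
Field.Store.COMPRESS: store the value compressed; meant for long text or binary values, at a performance cost
Field.Index.ANALYZED: tokenize the value with the analyzer and index the tokens
Field.Index.ANALYZED_NO_NORMS: tokenize and index, but do not store norms for the field; this saves one byte per document but disables index-time boosting and length normalization
Field.Index.NOT_ANALYZED: index the whole value as a single term, without tokenizing
Field.Index.NOT_ANALYZED_NO_NORMS: index the whole value as a single term without tokenizing, and do not store norms (saves one byte per document)'''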
'''Workflow:
1 initVM()
2 StandardAnalyzer
3 SimpleFSDirectory
4 IndexWriter
5 for each file:
    doc = Document()
    doc.add(Field())
    indexWriter.addDocument(doc, analyzer)
6 indexWriter.optimize()
  indexWriter.close()'''