PyLucene是Java版Lucene的Python版封装。这个工具的目标是让Python使用Lucene的文本索引和搜索能力。它与Java版Lucene的最新版本是兼容的。PyLucene把一个带有JAVA VM的Lucene嵌入到Python进程中。你可以在http://lucene.apache.org/pylucene/网站上找到更多的PyLucene详情。
本文将介绍如何使用PyLucene构建搜索索引,以及如何查询该索引。Lucene 3.0的安装说明请参见之前的文档。
PyLucene-Win32下的安装包可以从下面的网址中找到:
http://code.google.com/a/apache-extras.org/p/pylucene-extra/downloads/list
注:使用PyLucene必须安装Java SDK
一、 使用PyLucene创建索引
使用下面的代码基于PyLucene来创建索引
#!/usr/bin/env python
import os,sys,glob
import lucene
from lucene import SimpleFSDirectory, System, File, Document, Field, \
StandardAnalyzer, IndexWriter, Version
"""
Example of Indexing with PyLucene 3.0
"""
def luceneIndexer(docdir,indir):
"""
IndexDocuments from a directory
"""
lucene.initVM()
DIRTOINDEX= docdir
INDEXIDR= indir
indexdir= SimpleFSDirectory(File(INDEXIDR))
analyzer= StandardAnalyzer(Version.LUCENE_30)
index_writer= IndexWriter(indexdir,analyzer,True,\
IndexWriter.MaxFieldLength(512))
for tfile in glob.glob(os.path.join(DIRTOINDEX,'*.txt')):
print"Indexing: ", tfile
document= Document()
content= open(tfile,'r').read()
document.add(Field("text",content,Field.Store.YES,\
Field.Index.ANALYZED))
index_writer.addDocument(document)
print"Done: ", tfile
index_writer.optimize()
printindex_writer.numDocs()
index_writer.close()
你必须提供两个参数给luceneIndexer()函数。
1) 一个保存被索引文档的目录路径;
2) 一个索引存储的目录路径。
二、使用Pylucene查询
下面的代码用于查询Pylucene创建的索引。
#!/usr/bin/env python
import sys
import lucene
from lucene import SimpleFSDirectory, System, File, Document, Field,\
StandardAnalyzer, IndexSearcher, Version,QueryParser
"""
PyLucene retriver simple example
"""
INDEXDIR = "./MyIndex"
def luceneRetriver(query):
lucene.initVM()
indir= SimpleFSDirectory(File(INDEXDIR))
lucene_analyzer= StandardAnalyzer(Version.LUCENE_30)
lucene_searcher= IndexSearcher(indir)
my_query= QueryParser(Version.LUCENE_30,"text",\
lucene_analyzer).parse(query)
MAX= 1000
total_hits =lucene_searcher.search(my_query,MAX)
print"Hits: ",total_hits.totalHits
forhit in total_hits.scoreDocs:
print"Hit Score: ",hit.score, "Hit Doc:",hit.doc, "HitString:",hit.toString()
doc= lucene_searcher.doc(hit.doc)
printdoc.get("text").encode("utf-8")
luceneRetriver("really coolrestaurant")
在代码中,我们人为地把索引目录指定为INDEXDIR=./MyIndex,你也可以改用命令行参数(sys.argv)来接收索引目录,以替换这个固定值。
调用函数luceneRetriver()时,你必须提供一个查询字符串作为参数。
=====================================================
另外还有一个实例:
PyLucene Samples目录下的IndexFiles.py和SearchFiles.py完成了对指定目录下的.txt文件内容进行索引,我们可以修改这两个文件来实现上面的功能。另外,为了能够检索简体中文和繁体中文的文件名、目录名,代码先把文件名和目录名按GBK解码为Unicode。源码如下:
IndexFiles.py
# -*- coding:GB2312 -*-
import sys, os, PyLucene, threading, time
from datetime import datetime
"""
This class is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.IndexFiles. It will take a directory as an argument
and will index all of the files in that directory and downward recursively.
It will index on the file path, the file name and the file contents. The
resulting Lucene index will be placed in the current directory and called
'index'.
"""
class Ticker(object):
    """Progress indicator that writes one dot per second to stdout.

    ``run`` keeps going for as long as the ``tick`` attribute is true, so
    another thread stops it by setting ``tick`` to False.
    """

    def __init__(self):
        self.tick = True

    def run(self):
        """Write and flush a '.' every second until ``tick`` is cleared."""
        while True:
            if not self.tick:
                break
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)
class IndexFiles(object):
"""Usage: python IndexFiles """
def __init__(self, root, storeDir, analyzer):
if not os.path.exists(storeDir):
os.mkdir(storeDir)
store = PyLucene.FSDirectory.getDirectory(storeDir, False)
writer = PyLucene.IndexWriter(store, analyzer, False)
writer.setMaxFieldLength(1048576)
self.indexDocs(root, writer)
ticker = Ticker()
print 'optimizing index',
threading.Thread(target=ticker.run).start()
writer.optimize()
writer.close()
ticker.tick = False
print 'done'
def indexDocs(self, root, writer):
for root, dirnames, filenames in os.walk(root):
print root
try:
sroot = unicode(root, 'GBK')
print sroot
except:
print "*****************************unicode error"
print root
continue
#add dir
doc = PyLucene.Document()
doc.add(PyLucene.Field("path", sroot,
PyLucene.Field.Store.YES,
PyLucene.Field.Index.UN_TOKENIZED))
doc.add(PyLucene.Field("name", sroot,
PyLucene.Field.Store.YES,
PyLucene.Field.Index.TOKENIZED))
writer.addDocument(doc)
for filename in filenames:
try:
filename = unicode(filename, 'GBK')
except:
print "*****************************unicode error"
print filename
continue
print "adding", filename
try:
#path = unicode(root, 'GB2312')#
#
path =os.path.join(sroot, filename)
#file = open(path)
#contents = unicode(file.read(), 'iso-8859-1')
#contents = unicode(file.read(), 'GBK')
#file.close()
doc = PyLucene.Document()
doc.add(PyLucene.Field("path", path,
PyLucene.Field.Store.YES,
PyLucene.Field.Index.UN_TOKENIZED))
doc.add(PyLucene.Field("name", filename,
PyLucene.Field.Store.YES,
PyLucene.Field.Index.TOKENIZED))
'''
if len(contents) > 0:
doc.add(PyLucene.Field("contents", contents,
PyLucene.Field.Store.YES,
PyLucene.Field.Index.TOKENIZED))
else:
print "warning: no content in %s" % filename
'''
writer.addDocument(doc)
except Exception, e:
print "Failed in indexDocs:", e
__debug = 0
if __name__ == '__main__':
if __debug != 1:
if len(sys.argv) < 2:
print IndexFiles.__doc__
sys.exit(1)
print 'PyLucene', PyLucene.VERSION, 'Lucene', PyLucene.LUCENE_VERSION
start = datetime.now()
try:
if __debug != 1:
IndexFiles(sys.argv[1], "index", PyLucene.StandardAnalyzer())
else:
IndexFiles(r'c:/testccc', "index", PyLucene.StandardAnalyzer())
end = datetime.now()
print end - start
except Exception, e:
print "Failed: ", e
SearchFiles.py
from PyLucene import QueryParser, IndexSearcher, StandardAnalyzer, FSDirectory
from PyLucene import VERSION, LUCENE_VERSION
"""
This script is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.SearchFiles. It will prompt for a search query, then it
will search the Lucene index in the current directory called 'index' for the
search query entered against the 'contents' field. It will then display the
'path' and 'name' fields for each of the hits it finds in the index. Note that
search.close() is currently commented out because it causes a stack overflow in
some cases.
"""
def run(searcher, analyzer):
while True:
print "Hit enter with no input to quit."
command = raw_input("Query:")
command = unicode(command, 'GBK')
if command == '':
return
print "Searching for:", command
#query = QueryParser("contents", analyzer).parse(command)
query = QueryParser("name", analyzer).parse(command)
hits = searcher.search(query)
print "%s total matching documents." % hits.length()
for i, doc in hits:
print 'path:', doc.get("path"), 'name:', doc.get("name")
if __name__ == '__main__':
STORE_DIR = "index"
print 'PyLucene', VERSION, 'Lucene', LUCENE_VERSION
directory = FSDirectory.getDirectory(STORE_DIR, False)
searcher = IndexSearcher(directory)
analyzer = StandardAnalyzer()
run(searcher, analyzer)
searcher.close()
建立索引,运行:
python IndexFiles.py c:/
查找的时候,运行:
python SearchFiles.py
如果只查找一个关键词,直接输入该关键词即可;如果想同时查找两个关键词(如Python和网络),则输入:Python AND 网络;如果想查找包含Python或网络的结果,则输入:Python 网络,也可以输入:Python OR 网络。