当前位置: 首页 > 工具软件 > PyLucene > 使用案例 >

pylucene构建索引_PyLucene-学习-01安装及建索引、查询

金昌胤
2023-12-01

PyLucene是Java版Lucene的Python版封装。这个工具的目标是让Python使用Lucene的文本索引和搜索能力。它与Java版Lucene的最新版本是兼容的。PyLucene把一个带有JAVA VM的Lucene嵌入到Python进程中。你可以在http://lucene.apache.org/pylucene/网站上找到更多的PyLucene详情。

本文中,我们将描述如何使用PyLucene构建搜索索引和查询一个搜索索引。你可以从先前的文档看到Lucene3.0安装说明。

PyLucene-Win32下的安装包可以从下面的网址中找到:

http://code.google.com/a/apache-extras.org/p/pylucene-extra/downloads/list

注:使用PyLucene前必须先安装Java SDK。

一、 使用PyLucene创建索引

使用下面的代码基于PyLucene来创建索引

#!/usr/bin/env python

import os,sys,glob

import lucene

from lucene import SimpleFSDirectory, System, File, Document, Field, \

StandardAnalyzer, IndexWriter, Version

"""

Example of Indexing with PyLucene 3.0

"""

def luceneIndexer(docdir, indir):
    """Index every ``*.txt`` file found directly inside ``docdir``.

    Each file becomes one Lucene document whose whole content is stored and
    analyzed in a single field named "text".

    Args:
        docdir: Directory holding the text files to index. Only files
            matching ``*.txt`` directly in this directory are picked up;
            sub-directories are not walked.
        indir: Directory in which the Lucene index is written.
    """
    lucene.initVM()
    indexdir = SimpleFSDirectory(File(indir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # The third argument (create=True) asks Lucene to start a fresh index.
    # NOTE(review): MaxFieldLength(512) presumably caps the number of terms
    # indexed per field - confirm against the Lucene 3.0 IndexWriter docs.
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    try:
        for tfile in glob.glob(os.path.join(docdir, '*.txt')):
            # Two spaces reproduce the output of the Python 2 statement
            # `print "Indexing: ", tfile`.
            print("Indexing:  %s" % (tfile,))
            # Close the file right after reading instead of leaking the handle.
            with open(tfile, 'r') as handle:
                content = handle.read()
            document = Document()
            document.add(Field("text", content, Field.Store.YES,
                               Field.Index.ANALYZED))
            index_writer.addDocument(document)
            print("Done:  %s" % (tfile,))
        index_writer.optimize()
        print(index_writer.numDocs())
    finally:
        # Always release the index writer, even when indexing fails midway.
        index_writer.close()

你必须提供两个参数给luceneIndexer()函数。

1)  一个保存被索引文档的目录路径;

2)  一个索引存储的目录路径。

二、使用Pylucene查询

下面的代码用于查询Pylucene创建的索引。

#!/usr/bin/env python

import sys

import lucene

from lucene import SimpleFSDirectory, System, File, Document, Field,\

StandardAnalyzer, IndexSearcher, Version,QueryParser

"""

PyLucene retriver simple example

"""

INDEXDIR = "./MyIndex"

def luceneRetriver(query):
    """Search the index stored in ``INDEXDIR`` and print the matching documents.

    The query string is parsed against the "text" field with a
    ``StandardAnalyzer``. At most 1000 hits are requested. For every hit the
    score, document id and hit string are printed, followed by the stored
    "text" field encoded as UTF-8.

    Args:
        query: Query string in Lucene query-parser syntax.
    """
    lucene.initVM()
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    try:
        my_query = QueryParser(Version.LUCENE_30, "text",
                               lucene_analyzer).parse(query)
        max_hits = 1000
        total_hits = lucene_searcher.search(my_query, max_hits)
        # Format strings reproduce the spacing of the original Python 2
        # comma-separated print statements.
        print("Hits:  %s" % (total_hits.totalHits,))
        for hit in total_hits.scoreDocs:
            print("Hit Score:  %s Hit Doc: %s HitString: %s"
                  % (hit.score, hit.doc, hit.toString()))
            doc = lucene_searcher.doc(hit.doc)
            print(doc.get("text").encode("utf-8"))
    finally:
        # Release the searcher even when parsing or searching fails.
        lucene_searcher.close()


# Sample invocation; the space between "cool" and "restaurant" was lost in
# the published listing.
luceneRetriver("really cool restaurant")

在代码中,我们人为地将索引目录指定为INDEXDIR=./MyIndex,你也可以通过命令行参数(sys.argv)接收索引目录来替换它。

当使用函数luceneRetriver()时,你必须提供一个查询字符串作为参数。

=====================================================

另外还有一个实例:

PyLucene Samples目录下的IndexFiles.py和SearchFiles.py完成了对指定目录下的.txt文件内容进行索引,我们可以修改这两个文件来实现上面的功能。另外,为了能够检索简体中文、繁体中文的文件名和目录名,对文件名和目录名进行了Unicode编码。源码如下:

IndexFiles.py

# -*- coding:GB2312 -*-

import sys, os, PyLucene, threading, time

from datetime import datetime

"""

This class is loosely based on the Lucene (java implementation) demo class

org.apache.lucene.demo.IndexFiles. It will take a directory as an argument

and will index all of the files in that directory and downward recursively.

It will index on the file path, the file name and the file contents. The

resulting Lucene index will be placed in the current directory and called

'index'.

"""

class Ticker(object):
    """Progress indicator that writes one dot per second to stdout.

    ``run`` loops for as long as ``tick`` is true, so another thread can end
    the loop by setting ``tick`` to False.
    """

    def __init__(self):
        # Loop flag polled by run(); set it to False to stop the ticker.
        self.tick = True

    def run(self):
        """Write '.' to stdout, flush, and sleep one second while ``tick`` is true."""
        while self.tick:
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)

class IndexFiles(object):
    """Usage: python IndexFiles DOC_DIRECTORY"""
    # The class docstring doubles as the usage message printed by the script
    # entry point, so it is kept to a single line.
    #
    # Constructing an instance walks ``root`` and indexes the path and name of
    # every directory and file into the Lucene index stored in ``storeDir``.
    # File contents are not indexed.

    def __init__(self, root, storeDir, analyzer):
        """Index the tree under ``root`` into ``storeDir`` and optimize the index.

        Args:
            root: Directory to walk recursively.
            storeDir: Directory of the Lucene index; created when missing.
            analyzer: Analyzer handed to the ``IndexWriter``.
        """
        # A store directory that had to be created cannot hold an index yet,
        # so the writer must create one; otherwise append to what is there.
        # NOTE(review): assumes IndexWriter(..., create=False) fails on a
        # directory without an index - confirm against the Lucene version used.
        is_new_store = not os.path.exists(storeDir)
        if is_new_store:
            os.mkdir(storeDir)
        store = PyLucene.FSDirectory.getDirectory(storeDir, False)
        writer = PyLucene.IndexWriter(store, analyzer, is_new_store)
        writer.setMaxFieldLength(1048576)
        self.indexDocs(root, writer)
        ticker = Ticker()
        # Equivalent of the Python 2 statement `print 'optimizing index',`:
        # no trailing newline, so the ticker dots follow on the same line.
        sys.stdout.write('optimizing index')
        threading.Thread(target=ticker.run).start()
        try:
            writer.optimize()
            writer.close()
        finally:
            # Stop the ticker thread even if optimize/close raises; otherwise
            # the non-daemon thread would keep the process alive forever.
            ticker.tick = False
        print('done')

    def indexDocs(self, root, writer):
        """Add one document per directory and per file found under ``root``.

        Directory and file names are decoded from GBK; entries that cannot be
        decoded are reported and skipped. Every document carries an untokenized
        "path" field and a tokenized "name" field.

        Args:
            root: Directory to walk recursively.
            writer: Open ``IndexWriter`` receiving the documents.
        """
        for dirpath, dirnames, filenames in os.walk(root):
            print(dirpath)
            try:
                sroot = unicode(dirpath, 'GBK')
                print(sroot)
            except (UnicodeError, TypeError):
                print("*****************************unicode error")
                print(dirpath)
                continue
            # The directory itself is indexed with its full path as its name.
            self._addEntry(writer, sroot, sroot)
            for filename in filenames:
                try:
                    filename = unicode(filename, 'GBK')
                except (UnicodeError, TypeError):
                    print("*****************************unicode error")
                    print(filename)
                    continue
                print("adding %s" % (filename,))
                try:
                    path = os.path.join(sroot, filename)
                    # Only path and name are indexed; indexing of the file
                    # contents is intentionally disabled in this variant.
                    self._addEntry(writer, path, filename)
                except Exception as e:
                    print("Failed in indexDocs: %s" % (e,))

    def _addEntry(self, writer, path, name):
        """Add a document with a stored "path" and a stored, tokenized "name"."""
        doc = PyLucene.Document()
        doc.add(PyLucene.Field("path", path,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
        doc.add(PyLucene.Field("name", name,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))
        writer.addDocument(doc)

# Set to 1 to index the hard-coded test directory instead of sys.argv[1].
__debug = 0

if __name__ == '__main__':
    # Outside debug mode the directory to index is a required argument.
    if __debug != 1 and len(sys.argv) < 2:
        print(IndexFiles.__doc__)
        sys.exit(1)
    print('PyLucene %s Lucene %s' % (PyLucene.VERSION, PyLucene.LUCENE_VERSION))
    start = datetime.now()
    try:
        if __debug != 1:
            IndexFiles(sys.argv[1], "index", PyLucene.StandardAnalyzer())
        else:
            IndexFiles(r'c:/testccc', "index", PyLucene.StandardAnalyzer())
        end = datetime.now()
        # Elapsed indexing time.
        print(end - start)
    except Exception as e:
        # Two spaces reproduce the output of `print "Failed: ", e`.
        print("Failed:  %s" % (e,))

SearchFiles.py

from PyLucene import QueryParser, IndexSearcher, StandardAnalyzer, FSDirectory

from PyLucene import VERSION, LUCENE_VERSION

"""

This script is loosely based on the Lucene (java implementation) demo class

org.apache.lucene.demo.SearchFiles. It will prompt for a search query, then it

will search the Lucene index in the current directory called 'index' for the

search query entered against the 'name' field. It will then display the

'path' and 'name' fields for each of the hits it finds in the index. Note that

search.close() is currently commented out because it causes a stack overflow in

some cases.

"""

def run(searcher, analyzer):
    """Interactive search loop against the "name" field.

    Prompts for a query, decodes it from GBK, searches, and prints the "path"
    and "name" fields of every hit. Returns when an empty line is entered or
    when input ends (EOF).

    Args:
        searcher: Searcher whose ``search(query)`` returns the hits.
        analyzer: Analyzer passed to the ``QueryParser``.
    """
    while True:
        print("")
        print("Hit enter with no input to quit.")
        try:
            command = raw_input("Query:")
        except EOFError:
            # End of input (e.g. Ctrl-D / Ctrl-Z) quits like an empty line.
            return
        command = unicode(command, 'GBK')
        if command == '':
            return

        print("")
        print("Searching for: %s" % (command,))
        query = QueryParser("name", analyzer).parse(command)
        hits = searcher.search(query)
        print("%s total matching documents." % hits.length())

        # Iterating the hits yields (index, document) pairs; only the
        # document is needed here.
        for _index, doc in hits:
            print('path: %s name: %s' % (doc.get("path"), doc.get("name")))

if __name__ == '__main__':
    # Directory of the index, relative to the current working directory.
    STORE_DIR = "index"
    print('PyLucene %s Lucene %s' % (VERSION, LUCENE_VERSION))
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer()
    try:
        run(searcher, analyzer)
    finally:
        # Release the searcher even when the interactive loop raises.
        searcher.close()

建立索引,运行:

python IndexFiles.py c:/

查找的时候,运行:

python SearchFiles.py

如果只查找一个关键词,则直接输入该关键词;如果想同时查找两个关键词,如Python和网络,则输入:Python AND 网络;如果想查找Python或网络,则输入:Python 网络,也可以输入:Python OR 网络。

 类似资料: