前2天,要预研一下clucene, 准备和java程序对接(clucene来写, java-lucene来读[查询]).
通过实验和查资料,clucene-core-2.3.3.4产生的数据无法被新版的java-lucene6.x和java-lucene7.x读取。可能是格式不兼容。
clucene虽然版本和java-lucene不兼容,也有自己的用场(比自己做索引查询强多了).
如果java程序不介意使用低版本的lucene(能和clucene-core-2.3.3.4兼容的那种), 或者同意通过C封装的JNI动态库来进行查询, clucene-core-2.3.3.4还是有用武之地的.
在查资料的过程中,发现能用的clucene编程资料特别的少。
官方文档只给出一些API接口说明,没给具体的API用法。
官方工程自带的cl_demo中的那点代码并不实用(没有把索引写入文件, demo自己写入的内容, 自己查询时也查不到), 令人失望.
非官方的clucene编程资料,大多都是基于clucene-core-2.3.3.4之前的版本, 参考价值很小。
最终,我是在clucene-core-2.3.3.4自带的测试用例中扒出来的代码.
如果没有官方代码做参照,谁知道怎么玩:)
开源是好,在工程中可以各种找参考代码,只是要用一些时间来读代码和做实验。
开源工程的测试用例工程做的特别好,各种API的使用情况都测试到了,我们用的场景基本是测试用例中的一种或几种的组合.
安装clucene库和测试clucene用法
debian8.8(安装了make, gcc, g++)
zlib-1.2.11.tar.gz
cmake-3.12.3.tar.gz
clucene-core-2.3.3.4.tar.gz
自产测试工程 test_clucene\src\case1
切到root用户
安装cmake
tar -xzvf ./cmake-3.12.3.tar.gz
cd cmake-3.12.3/
./configure
make
make install
安装zlib
tar -xzvf ./zlib-1.2.11.tar.gz
cd zlib-1.2.11/
cmake ./CMakeLists.txt
make
make install
安装clucene
tar -xzvf ./clucene-core-2.3.3.4.tar.gz
cd clucene-core-2.3.3.4
cmake ./CMakeLists.txt
make
make install
cd clucene-core-2.3.3.4
make cl_test
编译好的程序为./bin/cl_test
测试工程为./src/test/
如果有需求,就在./src/test/中找参考代码吧:)
MAKE_FILE_MACRO__BIN_NAME = [test_clucene]
>> fn_test()
>> write_to_clucene()
create file system writer
close file system writer
================================================================================
>> query_from_clucene(k1, *)
================================================================================
lucene_hit cnt = 3
--------------------------------------------------------------------------------
lucene_doc->toString() = Document< stored/uncompressed,indexed,tokenized<k1:v1 is the value, k1 is the key>>
0. query = *, key = k1, value = v1 is the value, k1 is the key, score = 1.000000
--------------------------------------------------------------------------------
lucene_doc->toString() = Document< stored/uncompressed,indexed,tokenized<k1:v1_1>>
1. query = *, key = k1, value = v1_1, score = 0.042404
--------------------------------------------------------------------------------
lucene_doc->toString() = Document< stored/uncompressed,indexed,tokenized<k1:v1_2>>
2. query = *, key = k1, value = v1_2, score = 0.042404
================================================================================
>> query_from_clucene(url, *bing*)
================================================================================
lucene_hit cnt = 1
--------------------------------------------------------------------------------
lucene_doc->toString() = Document< stored/uncompressed,indexed,tokenized<url:http://cn.bing.com>>
0. query = *bing*, key = url, value = http://cn.bing.com, score = 1.000000
================================================================================
>> query_from_clucene(url, csdn.)
================================================================================
lucene_hit cnt = 0
================================================================================
>> query_from_clucene(url, csdn)
================================================================================
lucene_hit cnt = 3
--------------------------------------------------------------------------------
lucene_doc->toString() = Document< stored/uncompressed,indexed,tokenized<url:https://me.csdn.net>>
0. query = csdn, key = url, value = https://me.csdn.net, score = 0.846574
--------------------------------------------------------------------------------
lucene_doc->toString() = Document< stored/uncompressed,indexed,tokenized<url:http://www.csdn.net/>>
1. query = csdn, key = url, value = http://www.csdn.net/, score = 0.846574
--------------------------------------------------------------------------------
lucene_doc->toString() = Document< stored/uncompressed,indexed,tokenized<url:https://blog.csdn.net>>
2. query = csdn, key = url, value = https://blog.csdn.net, score = 0.846574
================================================================================
>> query_from_clucene(url, b??g)
================================================================================
lucene_hit cnt = 2
--------------------------------------------------------------------------------
lucene_doc->toString() = Document< stored/uncompressed,indexed,tokenized<url:https://blog.csdn.net>>
0. query = b??g, key = url, value = https://blog.csdn.net, score = 0.421841
--------------------------------------------------------------------------------
lucene_doc->toString() = Document< stored/uncompressed,indexed,tokenized<url:http://cn.bing.com>>
1. query = b??g, key = url, value = http://cn.bing.com, score = 0.421841
THE END
// @file main.cpp
// @brief Tests reading and writing a CLucene index: verifies that key-value pairs written into CLucene can be read back by the same program
// @ref http://clucene.sourceforge.net/doc/html/
// @note 实验环境 clucene-core-2.3.3.4
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <signal.h>
#include "CLucene.h" // clucene-core-2.3.3.4 只需要包含总的头文件
#include "config/repl_tchar.h" // for linux tchar, clucene 用的是w_char
using namespace lucene::index;
using namespace lucene::analysis;
using namespace lucene::util;
using namespace lucene::document;
using namespace lucene::store;
using namespace lucene::queryParser;
using namespace lucene::search;
// Plain-pointer delete helper: when p is non-NULL, deletes it and resets it to NULL.
// NOTE(review): the expansion is a bare `if` block rather than do { } while (0),
// so an unbraced use inside an if/else can capture the following `else`.
#ifndef SAFE_DELETE
#define SAFE_DELETE(p) \
if (NULL != (p)) { \
delete (p); \
(p) = NULL; \
}
#endif // #ifndef SAFE_DELETE
// Separator lines printed by query_from_clucene() around and between results.
#define TITLE_LINE80 "================================================================================"
#define LINE80 "--------------------------------------------------------------------------------"
// Logging goes through MYLOG_D; it falls back to printf unless defined beforehand.
#if not defined(MYLOG_D)
#define MYLOG_D printf
#endif
// Directory passed to FSDirectory::getDirectory() by both the writer and the queries.
#define MY_LUCENE_DATA_DIR "/home/dev/lu_dir/"
// Alias for CLucene's _CLDELETE macro, used here to release CLucene objects.
#define SAFE_DEL_CLUCENE_POINTER _CLDELETE
// Forward declarations of the functions defined later in this file.
void init(const char* psz_log_owner_name);
void uninit();
void proc_sig_term(int num);
int fn_test();
bool add_doc_to_writer(IndexWriter* lucene_writer, const TCHAR* psz_key, const TCHAR* psz_value);
void write_to_clucene();
void query_from_clucene(const TCHAR* psz_key, const TCHAR* psz_query);
/**
 * Program entry point.
 *
 * Initializes the process (passing the binary name when the build supplies
 * MAKE_FILE_MACRO__BIN_NAME), runs the CLucene write/query experiment, then
 * tears down and reports completion.
 *
 * @param argc unused
 * @param argv unused
 * @return EXIT_SUCCESS always
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

#ifdef MAKE_FILE_MACRO__BIN_NAME
    // The buffer lives inside the #ifdef so it is not an unused variable
    // (fatal with -Wall -Werror) when the macro is not supplied.
    char sz_buf[1024] = {'\0'};

    // snprintf bounds the copy: the name comes from the build command line
    // and its length is not otherwise limited.
    snprintf(sz_buf, sizeof(sz_buf), "%s", MAKE_FILE_MACRO__BIN_NAME);
    init(sz_buf);
    MYLOG_D("MAKE_FILE_MACRO__BIN_NAME = [%s]\n", MAKE_FILE_MACRO__BIN_NAME);
#else
    init(NULL);
#endif // #ifdef MAKE_FILE_MACRO__BIN_NAME

    fn_test();
    uninit();

    MYLOG_D("THE END\n");
    return EXIT_SUCCESS;
}
// Teardown hook called by main() after fn_test(); it currently does nothing.
void uninit()
{
}
/**
 * Signal handler installed for SIGTERM by init().
 *
 * Logs the signal number it was invoked with and terminates the process
 * with exit status 1.
 *
 * NOTE(review): the logging call (printf by default) and exit() are not
 * async-signal-safe; confirm this is acceptable for this test program.
 *
 * @param num signal number delivered to the handler
 */
void proc_sig_term(int num)
{
    const int exit_status = 1;

    MYLOG_D("SIGTERM = %d, num = %d\n", SIGTERM, num);
    MYLOG_D("maybe can do some clean task before quit\n");

    exit(exit_status);
}
/**
 * Process start-up: pushes previous terminal output away by printing 25
 * blank lines, then installs proc_sig_term() as the SIGTERM handler.
 *
 * @param psz_log_owner_name name of the running binary, or NULL; it is not
 *                           used by the current implementation
 */
void init(const char* psz_log_owner_name)
{
    // daemon(0, 0);  (left disabled, as in the original)

    // "clear screen": emit 25 empty lines
    int lines_left = 25;
    while (lines_left > 0) {
        MYLOG_D("\n");
        --lines_left;
    }

    signal(SIGTERM, proc_sig_term);
}
int fn_test()
{
MYLOG_D(">> fn_test()\n");
write_to_clucene();
// clucene的查询统配符号只支持'?', '*'
// 查询的内容只支持英文字母(a~z, A~Z)
// 查询字符串中不能带非英文字母(e.g. '.', '//', ':'), 否则查不到内容
query_from_clucene(_T("k1"), _T("*")); // 查询条件不能为空, 如果要查所有符合条件的vlaue, 输入*
query_from_clucene(_T("url"), _T("*bing*"));
query_from_clucene(_T("url"), _T("csdn.")); // 这种不带英文字母的查询语句,是查不到内容的
query_from_clucene(_T("url"), _T("csdn"));
query_from_clucene(_T("url"), _T("b??g"));
/** run result
*/
return 0;
}
void write_to_clucene()
{
SimpleAnalyzer lucene_analyzer;
FSDirectory* lucene_fs_dir = NULL;
IndexWriter* lucene_writer = NULL;
MYLOG_D(">> write_to_clucene()\n");
try {
lucene_fs_dir = FSDirectory::getDirectory(MY_LUCENE_DATA_DIR);
MYLOG_D("create file system writer\n");
lucene_writer = _CLNEW IndexWriter(lucene_fs_dir, &lucene_analyzer, true);
lucene_writer->setMaxFieldLength(IndexWriter::DEFAULT_MAX_FIELD_LENGTH);
add_doc_to_writer(lucene_writer, _T("url"), _T("https://me.csdn.net"));
add_doc_to_writer(lucene_writer, _T("url"), _T("http://www.csdn.net/"));
add_doc_to_writer(lucene_writer, _T("url"), _T("https://blog.csdn.net"));
add_doc_to_writer(lucene_writer, _T("url"), _T("https://www.baidu.com/"));
add_doc_to_writer(lucene_writer, _T("url"), _T("http://cn.bing.com"));
add_doc_to_writer(lucene_writer, _T("k1"), _T("v1 is the value, k1 is the key"));
add_doc_to_writer(lucene_writer, _T("k1"), _T("v1_1"));
add_doc_to_writer(lucene_writer, _T("k1"), _T("v1_2"));
lucene_writer->close();
MYLOG_D("close file system writer\n");
}
catch (CLuceneError e) {
MYLOG_D("catch clucene error : %s\n", e.what());
}
SAFE_DEL_CLUCENE_POINTER(lucene_writer);
SAFE_DEL_CLUCENE_POINTER(lucene_fs_dir);
}
/**
 * Adds one single-field document (psz_key -> psz_value) to the index.
 *
 * The field is created with STORE_YES | INDEX_TOKENIZED.
 *
 * @param lucene_writer open index writer
 * @param psz_key       field name
 * @param psz_value     field value
 * @return true when the document was handed to the writer; false when any
 *         argument is NULL. Exceptions raised by CLucene propagate to the
 *         caller.
 */
bool add_doc_to_writer(IndexWriter* lucene_writer, const TCHAR* psz_key, const TCHAR* psz_value)
{
    if ((NULL == lucene_writer) || (NULL == psz_key) || (NULL == psz_value)) {
        return false;
    }

    // A stack-allocated Document is released even when addDocument() throws;
    // the previous heap allocation leaked on that path.
    Document lucene_doc;

    // The Document takes ownership of the Field added to it.
    lucene_doc.add(*_CLNEW Field(psz_key, psz_value, Field::STORE_YES | Field::INDEX_TOKENIZED));
    lucene_writer->addDocument(&lucene_doc);

    return true;
}
void query_from_clucene(const TCHAR* psz_key, const TCHAR* psz_query)
{
std::wstring str1 = L"";
float f_score = 0.0f;
uint32_t i = 0;
Term* lucene_term = NULL;
Query* lucene_query = NULL;
Hits* lucene_hits = NULL;
FSDirectory* clucene_dir = NULL;
IndexReader* clucene_reader = NULL;
IndexSearcher* clucene_searcher = NULL;
MYLOG_D("\n\n");
MYLOG_D("%s\n", TITLE_LINE80);
MYLOG_D(">> query_from_clucene(%ls, %ls)\n", psz_key, psz_query);
MYLOG_D("%s\n", TITLE_LINE80);
try {
lucene_term = _CLNEW Term(psz_key, psz_query);
lucene_query = _CLNEW WildcardQuery(lucene_term);
clucene_dir = FSDirectory::getDirectory(MY_LUCENE_DATA_DIR);
clucene_reader = IndexReader::open(clucene_dir);
clucene_searcher = _CLNEW IndexSearcher(clucene_reader);
lucene_hits = clucene_searcher->search(lucene_query);
MYLOG_D("lucene_hit cnt = %lu\n", lucene_hits->length());
for (i = 0; i < lucene_hits->length(); i++ ) {
MYLOG_D("%s\n", LINE80);
Document& lucene_doc = lucene_hits->doc(i);
MYLOG_D("lucene_doc->toString() = %ls\n", lucene_doc.toString());
str1 = lucene_doc.get(psz_key);
f_score = lucene_hits->score(i);
MYLOG_D("%d. query = %ls, key = %ls, value = %ls, score = %f\n",
i, psz_query, psz_key, str1.c_str(), f_score); // 打印宽字符的例子(使用printf + %ls)
}
}
catch (CLuceneError e) {
MYLOG_D("catch clucene error : %s\n", e.what());
}
//free resource
SAFE_DEL_CLUCENE_POINTER(lucene_term);
SAFE_DEL_CLUCENE_POINTER(lucene_hits);
SAFE_DEL_CLUCENE_POINTER(lucene_query);
SAFE_DEL_CLUCENE_POINTER(clucene_searcher);
SAFE_DEL_CLUCENE_POINTER(clucene_reader);
SAFE_DEL_CLUCENE_POINTER(clucene_dir);
}
# ==============================================================================
# @file makefile
# ==============================================================================
# @note
# howto build project
# make BIN_NAME="bin_name_by_you_want" rebuild
#
# Targets:
#   help    (default) print usage
#   clean   remove object files and, when BIN_NAME is given, the binary
#   all     compile and link $(BIN)
#   rebuild clean + all; refuses to run unless BIN_NAME is given

.DEFAULT_GOAL := help

# Remove a half-written target when its recipe fails.
.DELETE_ON_ERROR:

# Path of this makefile, captured once, for the recursive invocations below.
MY_MAKE_FILE_PATH_NAME := $(lastword $(MAKEFILE_LIST))

# macro from Makefile command line
# BIN_NAME

# macro to C project (passed as a string literal through -D when compiling)
MAKE_FILE_MACRO__BIN_NAME = make_file_macro__bin_name

# var define on Makefile
BIN = output_not_give_bin_name
IS_BUILD_TYPE_VALID = 0

ifdef BIN_NAME
IS_BUILD_TYPE_VALID = 1
BIN = $(BIN_NAME)
MAKE_FILE_MACRO__BIN_NAME = $(BIN_NAME)
endif

LINE80 = --------------------------------------------------------------------------------

# CC = g++ -std=c++98
CC = g++

# -Werror is "warning as error"
CFLAGS = -Wall -Werror -g

INC = -I. -I./clucene-core-2.3.3.4/src/core/ -I./clucene-core-2.3.3.4/src/shared/CLucene/
LIBPATH = -L/usr/lib/ -L/usr/local/lib/

# Extra link inputs; empty unless supplied on the command line.
SHLIBS ?=

# Libraries are only linked for a valid build (BIN_NAME given).
ifeq ($(strip $(IS_BUILD_TYPE_VALID)),1)
LIBS = -lstdc++ -pthread -lclucene-core -lclucene-shared
else
LIBS =
endif

# Source discovery. ':=' runs each find once instead of on every expansion;
# '2>/dev/null' keeps a missing directory from printing errors.
# NOTE: no trailing backslash after the directory -- a continuation here
# would swallow the DEPEND_CODE_SRC assignment on the next line.
DEPEND_CODE_DIR := ../common/
DEPEND_CODE_SRC := $(shell find $(DEPEND_CODE_DIR) -name '*.cpp' 2>/dev/null)
DEPEND_CODE_OBJ := $(DEPEND_CODE_SRC:.cpp=.o)

ROOT_CODE_SRC := $(shell find ./ -name '*.cpp')
ROOT_CODE_OBJ := $(ROOT_CODE_SRC:.cpp=.o)

SUB_CODE_DIR := ./empty_dir
SUB_CODE_SRC := $(shell find $(SUB_CODE_DIR) -name '*.cpp' 2>/dev/null)
SUB_CODE_OBJ := $(SUB_CODE_SRC:.cpp=.o)

.PHONY: help
help:
	-clear
	@echo "usage:"
	@echo
	@echo "build project by given bin name"
	@echo "make BIN_NAME=\"bin_name_by_you_want\" rebuild"
	@echo

.PHONY: clean
clean:
	-clear
	@i=0; while [ $$i -lt 25 ]; do echo; i=$$((i + 1)); done
	@echo "make clean begin"
	@echo $(LINE80)
	@echo "@file $(MY_MAKE_FILE_PATH_NAME)"
	@echo "IS_BUILD_TYPE_VALID = $(IS_BUILD_TYPE_VALID)"
	@echo "BIN = $(BIN)"
	@echo $(LINE80)
	rm -f $(ROOT_CODE_OBJ) $(DEPEND_CODE_OBJ) $(SUB_CODE_OBJ)
ifeq ($(strip $(IS_BUILD_TYPE_VALID)),1)
	rm -f ./$(BIN)
endif
	@echo "make clean over"

.PHONY: all
all: $(BIN)
	@echo $(LINE80)
	@echo make all
	chmod 775 ./$(BIN)
	find . -name "$(BIN)"

# Link step: $^ is every object file, with duplicates removed.
$(BIN): $(ROOT_CODE_OBJ) $(DEPEND_CODE_OBJ) $(SUB_CODE_OBJ)
	$(CC) $(CFLAGS) -o $@ $^ $(SHLIBS) $(LIBPATH) $(LIBS)

# Compile step: $< is the single source file; linker flags are not passed here.
%.o: %.cpp
	$(CC) -c $(CFLAGS) $(INC) -DMAKE_FILE_MACRO__BIN_NAME="\"$(MAKE_FILE_MACRO__BIN_NAME)\"" $< -o $@

# $(MAKE) (not a literal 'make') propagates flags such as -n and -j, and the
# command-line variables, to the sub-make.
.PHONY: rebuild
rebuild:
	$(MAKE) -f $(MY_MAKE_FILE_PATH_NAME) clean
ifeq ($(strip $(IS_BUILD_TYPE_VALID)),1)
	@echo $(LINE80)
	$(MAKE) -f $(MY_MAKE_FILE_PATH_NAME) all
	chmod 775 ./$(BIN)
	ldd ./$(BIN)
else
	@echo $(LINE80)
	@echo "error : Makefile command line input error, please see help"
	@echo "please run => make help"
	@echo $(LINE80)
endif
#!/bin/bash
# ==============================================================================
# @file build_all_project.sh
# ==============================================================================
# Cleans and rebuilds the test project through the makefile's `rebuild` target;
# BIN_NAME selects the name of the produced binary.
bin_name="test_clucene"
make rebuild BIN_NAME="${bin_name}"