Vearch(https://github.com/vearch/vearch) 是一个可以在海量特征中快速检索出相似结果的弹性分布式系统。具体的介绍可以参考vearch的官方文档https://vearch.readthedocs.io/zh_CN/latest/overview.html
CentOS, Ubuntu and Mac OS are all OK (recommend CentOS >= 7.2),cmake required
Go >= 1.11.2 required
Gcc >= 5 required
# faiss是vearch引擎的依赖,是必须安装的
Faiss >= v1.6.0
# RocksDB是vearch磁盘版数据的存储引擎
RocksDB == 6.2.2 (optional) .
#这个是要源码编译python sdk的时候需要的,这里建议直接使用pip install vearch
swig >= 3
# 关于GPU,我准备单独一个文章介绍,这里就先略过
CUDA >= 9.0, if you want GPU support.
所以这里我们准备安装的就是 go、gcc、 faiss和rocksdb
# Define a working directory for vearch (here /home/vearch); every following
# step is run from inside this directory tree.
mkdir -p /home/vearch && cd /home/vearch
# Install the Go toolchain and GCC listed in the requirements above.
yum install golang
yum install gcc
# RocksDB install guide: https://github.com/facebook/rocksdb/blob/master/INSTALL.md
# Faiss install guide:   https://github.com/facebookresearch/faiss/blob/master/INSTALL.md
# After building RocksDB and faiss, their shared libraries and headers are
# copied into the per-project install directories used below.
# Create vearch_libs, the directory that holds vearch's dependencies.
mkdir -p /home/vearch/vearch_libs && cd /home/vearch/vearch_libs
# RocksDB: check out the v6.2.2 tag. The default branch builds a different
# version, so the librocksdb.so.6.2.2 file copied below would not exist.
git clone -b v6.2.2 --depth 1 https://github.com/facebook/rocksdb.git
cd rocksdb && make shared_lib
# Copy the shared libraries and headers into the install directory.
mkdir -p /home/vearch/vearch_libs/rocksdb-6.2.2-install/lib/
cp librocksdb.so librocksdb.so.6 librocksdb.so.6.2 librocksdb.so.6.2.2 /home/vearch/vearch_libs/rocksdb-6.2.2-install/lib/
cp -r include /home/vearch/vearch_libs/rocksdb-6.2.2-install/
# faiss: return to vearch_libs first (the shell is still inside rocksdb/),
# and check out v1.6.0 so the sources match the install directory name.
cd /home/vearch/vearch_libs
git clone -b v1.6.0 --depth 1 https://github.com/facebookresearch/faiss.git
cd faiss && ./configure --without-cuda && make
# Copy the libraries and headers into the install directory.
mkdir -p /home/vearch/vearch_libs/faiss-1.6.0-install/lib
cp libfaiss.a libfaiss.so /home/vearch/vearch_libs/faiss-1.6.0-install/lib
mkdir -p /home/vearch/vearch_libs/faiss-1.6.0-install/include/faiss/
cp *.h /home/vearch/vearch_libs/faiss-1.6.0-install/include/faiss/
mkdir -p /home/vearch/vearch_libs/faiss-1.6.0-install/include/faiss/impl
cp impl/*.h /home/vearch/vearch_libs/faiss-1.6.0-install/include/faiss/impl
# NOTE(review): faiss 1.6.0 keeps its utility headers in utils/ (plural);
# confirm against the checked-out tree.
mkdir -p /home/vearch/vearch_libs/faiss-1.6.0-install/include/faiss/utils
cp utils/*.h /home/vearch/vearch_libs/faiss-1.6.0-install/include/faiss/utils
# Define the build path for vearch; it is best not to change it.
mkdir -p /home/vearch/go/src/github/vearch
export GOPATH=/home/vearch/go
cd /home/vearch/go/src/github/vearch
git clone https://github.com/vearch/vearch.git
# Cloning can be slow; be patient, or download the zip archive instead and
# unpack it locally:
# unzip vearch_master.zip && mv vearch_master vearch && rm vearch_master.zip
cd vearch/build
# The engine and vearch itself have to be compiled separately. For convenience
# the steps are wrapped in a script named run.sh with the following contents:
# export GOPATH=/home/vearch/go
# export FAISS_HOME=/home/vearch/vearch_libs/faiss-1.6.0-install/
# export ROCKSDB_HOME=/home/vearch/vearch_libs/rocksdb-6.2.2-install/
# export LD_LIBRARY_PATH=$FAISS_HOME/lib:$ROCKSDB_HOME/lib:$LD_LIBRARY_PATH
# ./build.sh
bash run.sh
如果顺利,你就完成了vearch的编译.
1、定义部署文件夹
# Create the deployment directory and copy the vearch binaries into it.
mkdir -p /home/vearch/deploy && cd /home/vearch/deploy
cp /home/vearch/go/src/github/vearch/vearch/build/bin/* ./
# Shared libraries go into copy/lib, the directory start.sh prepends to
# LD_LIBRARY_PATH. Create that directory itself before entering it.
mkdir -p /home/vearch/deploy/copy/lib && cd /home/vearch/deploy/copy/lib
cp /home/vearch/go/src/github/vearch/vearch/build/gamma_build/libgamma.so* ./
cp /home/vearch/vearch_libs/faiss-1.6.0-install/lib/* ./
cp /home/vearch/vearch_libs/rocksdb-6.2.2-install/lib/* ./
# conf.toml must sit next to the vearch binary (start.sh reads
# $BasePath/conf.toml), so go back to the deployment root before editing it.
cd /home/vearch/deploy
vim conf.toml # contents are shown below
2、编写conf.toml文件
[global]
# cluster name; a node is validated against this name when it joins, so all nodes must use the same name
name = "vearch"
# path(s) where your data is saved on disk; in a production environment you'd better use absolute paths
data = ["/home/vearch/Data/baud/datas/"]
# log path; in a production environment you'd better use an absolute path
log = "/home/vearch/Data/baud/logs/"
# default log type for any model
level = "debug"
# master <-> ps <-> router will use this key to send or receive data
signkey = "vearch"
skip_auth = true
# if this node is a master, you'd better set the full config for router and ps here; router and ps can then use the default config
[[masters]]
# name machine name for cluster
name = "master1"
# ip or domain
address = "127.0.0.1"
# api port for http server
api_port = 8817
# port for etcd server
etcd_port = 2378
# listen_peer_urls List of comma separated URLs to listen on for peer traffic.
# advertise_peer_urls List of this member's peer URLs to advertise to the rest of the cluster. The URLs needed to be a comma-separated list.
etcd_peer_port = 2390
# List of this member's client URLs to advertise to the public.
# The URLs needed to be a comma-separated list.
# advertise_client_urls AND listen_client_urls
etcd_client_port = 2370
skip_auth = true
[router]
# port for server
port = 9001
# skip auth for client visit data
skip_auth = true
[ps]
# port for server
rpc_port = 8081
# raft config begin
raft_heartbeat_port = 8898
raft_replicate_port = 8899
heartbeat-interval = 200 #ms
raft_retain_logs = 10000
raft_replica_concurrency = 1
raft_snap_concurrency = 1
编写完配置文件就可以直接运行了,再加上两个脚本
start.sh
#!/usr/bin/env bash
# Resolve the absolute directory that contains this script and make it the
# working directory, so pid files, logs and conf.toml are found next to it.
# Quoting keeps this working when the path contains spaces; abort if the
# directory cannot be entered instead of running in the wrong place.
BasePath=$(cd "$(dirname "$0")" && pwd)
cd "$BasePath" || exit 1
#######################################
# Print the `ps -ef` line of the process recorded in a pid file.
# Arguments:
#   $1 - path of the pid file
#   $2 - tag that must also appear in the process line (e.g. the role name)
# Outputs:
#   The matching `ps -ef` line(s) on stdout; an empty line when the pid file
#   does not exist; nothing when the pid file is empty or nothing matches.
#######################################
function getServiceStatusInfo {
  local pidFile=$1
  local filterTag=$2
  local pid

  if [ ! -f "${pidFile}" ]; then
    echo ""
    return 0
  fi

  pid=$(cat "${pidFile}")
  # An empty pid file identifies no process.
  [ -n "${pid}" ] || return 0

  # Compare the PID column exactly. A plain `grep <pid>` would also match
  # longer PIDs containing the same digits, or processes whose PPID is <pid>.
  ps -ef | awk -v pid="${pid}" '$2 == pid' | grep -- "${filterTag}"
}
#######################################
# Launch one vearch role in the background unless it is already running.
# Globals:
#   BasePath        (read)    - directory holding vearch, conf.toml, copy/lib
#   LD_LIBRARY_PATH (written) - gets ${BasePath}/copy/lib/ prepended
# Arguments:
#   $1 - role to start; passed to vearch as its last argument
# Outputs:
#   Status messages on stdout. On start, writes ${BasePath}/<role>.pid and
#   redirects the process output to ${BasePath}/vearch-<role>-startup.log.
#######################################
function start
{
  local stype=$1
  local info pid

  info=$(getServiceStatusInfo "${BasePath}/${stype}.pid" "${stype}")
  if [ -z "$info" ]; then
    # Only append the previous value when it is non-empty: a trailing ':'
    # would add an empty entry to the library search path.
    export LD_LIBRARY_PATH="${BasePath}/copy/lib/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
    # All paths are quoted so a BasePath containing spaces still works.
    nohup "${BasePath}/vearch" -conf "${BasePath}/conf.toml" "${stype}" > "${BasePath}/vearch-${stype}-startup.log" 2>&1 &
    pid=$!
    echo "${pid}" > "${BasePath}/${stype}.pid"
    echo "[INFO] ${stype} started... pid:${pid}"
  else
    echo "[Error]The ${stype} is running and the ${stype}'s status is :"
    echo "[INFO] status of ${stype} : ${info}"
  fi
  echo "--------------------------------------------------------------------------"
}
# Entry point: "all" starts master, router and ps in that order; any other
# non-empty argument is started as a single role. $1 is quoted so an argument
# containing whitespace cannot break the test expression.
if [ -z "$1" ]; then
  echo "start args is empty"
elif [ "$1" == "all" ]; then
  start master
  start router
  start ps
else
  start "$1"
fi
stop.sh
#!/usr/bin/env bash
# Resolve the absolute directory that contains this script and make it the
# working directory, so the pid files written at start time are found.
# Quoting keeps this working when the path contains spaces; abort if the
# directory cannot be entered instead of running in the wrong place.
BasePath=$(cd "$(dirname "$0")" && pwd)
cd "$BasePath" || exit 1
#######################################
# Print the `ps -ef` line of the process recorded in a pid file.
# Arguments:
#   $1 - path of the pid file
#   $2 - tag that must also appear in the process line (e.g. the role name)
# Outputs:
#   The matching `ps -ef` line(s) on stdout; an empty line when the pid file
#   does not exist; nothing when the pid file is empty or nothing matches.
#######################################
function getServiceStatusInfo {
  local pidFile=$1
  local filterTag=$2
  local pid

  if [ ! -f "${pidFile}" ]; then
    echo ""
    return 0
  fi

  pid=$(cat "${pidFile}")
  # An empty pid file identifies no process.
  [ -n "${pid}" ] || return 0

  # Compare the PID column exactly. A plain `grep <pid>` would also match
  # longer PIDs containing the same digits, or processes whose PPID is <pid>,
  # and the caller would then kill unrelated processes.
  ps -ef | awk -v pid="${pid}" '$2 == pid' | grep -- "${filterTag}"
}
#######################################
# Force-stop one vearch role and remove its pid file.
# Globals:
#   BasePath (read) - directory holding the <role>.pid files
# Arguments:
#   $1 - role to stop
# Outputs:
#   Status messages on stdout.
#######################################
function stop
{
  local stype=$1
  local info pid

  echo " [INFO] Stoping ${stype} :"
  info=$(getServiceStatusInfo "${BasePath}/${stype}.pid" "${stype}")
  if [ -z "$info" ]; then
    echo " [INFO] There is no ${stype}'s pid file!"
  else
    # The second whitespace-separated column of each `ps -ef` line is the PID.
    while read -r _ pid _; do
      if [ -n "${pid}" ]; then
        kill -9 "${pid}"
      fi
    done <<< "${info}"
    # Quoted so the pid file is really removed when BasePath contains spaces.
    /bin/rm -f -- "${BasePath}/${stype}.pid"
  fi
}
# Entry point: "all" stops master, router and ps in that order; any other
# non-empty argument is stopped as a single role. Without an argument the
# original unquoted test `[ $1 == "all" ]` was a syntax error and fell through
# to `stop ""`; report the missing argument instead, as start.sh does.
if [ -z "$1" ]; then
  echo "stop args is empty"
elif [ "$1" == "all" ]; then
  stop master
  stop router
  stop ps
else
  stop "$1"
fi
然后给两个脚本加上可执行权限(chmod +x start.sh stop.sh),并执行
./start.sh all
# 如果看到类似下面的内容
[INFO] master started... pid:39465
--------------------------------------------------------------------------
[INFO] router started... pid:39467
--------------------------------------------------------------------------
[INFO] ps started... pid:39469
--------------------------------------------------------------------------
# 服务就启动成功了
多机版和单机版类似,只是conf.toml文件和启动方式略微不同
# 首先编写个机器的配置文件
vim conf.json # 内容如下:
#########打开vim##########
{
"user": "root",
"password": "123456",
"dir": "/home/vearch/deploy",
"master": ["ip1", "ip2", "ip3"],
"ps": ["ip4", "ip5", "ip6"],
"router": ["ip7", "ip8", "ip9"],
"port":"22",
"copy":["lib"]
}
#########关闭vim##########
# ip1-9只是代号,每个ip列表的个数可以随意指定,master、ps和router的列表可以相同
# 然后编写conf.toml,与单机版基本相同只是添加了几个master
vim conf.toml
#########打开vim##########
[global]
# 和单机版相同,这里省略了
[[masters]]
name = "master1"
address = "ip1"
api_port = 8817
etcd_port = 2378
etcd_peer_port = 2390
etcd_client_port = 2370
skip_auth = true
[[masters]]
name = "master2"
address = "ip2"
api_port = 8817
etcd_port = 2378
etcd_peer_port = 2390
etcd_client_port = 2370
skip_auth = true
[[masters]]
name = "master3"
address = "ip3"
api_port = 8817
etcd_port = 2378
etcd_peer_port = 2390
etcd_client_port = 2370
skip_auth = true
[router]
# 和单机版相同,这里省略了
[ps]
# 和单机版相同,这里省略了
#########关闭vim##########
# Deploy to all machines, start the cluster and check its status.
# cur_dir is the directory containing this script. Assignment and `readonly`
# are separate so a failing readlink/dirname is not masked, and all
# expansions are quoted so paths with spaces survive.
cur_dir=$(dirname "$(readlink -f "$0")")
readonly cur_dir
./batch_deployment -dir="${cur_dir}" vearch deploy
./batch_deployment -dir="${cur_dir}" vearch start
./batch_deployment -dir="${cur_dir}" vearch status
# Tear the deployment down again.
./batch_deployment -dir="${cur_dir}" vearch destroy
接下来做基本的功能测试,可以参考vearch的官方文档
# Create a database named "test" (sent to the master API port 8817).
curl -XPUT -H "content-type:application/json" -d '{"name": "test"}' http://127.0.0.1:8817/db/_create
# Create a space, also named "test", inside the "test" database. It has a
# keyword field "url" and a 2-dimensional vector field "feature1", and uses
# the gamma engine with the InnerProduct metric.
curl -XPUT -H "content-type: application/json" -d' { "name": "test", "dynamic_schema": "strict", "partition_num": 1, "replica_num": 1, "engine": {"name":"gamma","metric_type": "InnerProduct"}, "properties": { "url": { "type": "keyword", "index":true}, "feature1": { "type": "vector", "dimension":2, "format": "normalization" }}} ' http://127.0.0.1:8817/space/test/_create
# Insert three documents one at a time (sent to the router port 9001).
curl -XPOST -H "content-type: application/json" -d' { "url": "1", "feature1":{"feature":[0.1,0.2]}} ' http://127.0.0.1:9001/test/test
curl -XPOST -H "content-type: application/json" -d' { "url": "2", "feature1":{"feature":[0.2,0.2]}} ' http://127.0.0.1:9001/test/test
curl -XPOST -H "content-type: application/json" -d' { "url": "3", "feature1":{"feature":[0.3,0.2]}} ' http://127.0.0.1:9001/test/test
# Search the space for the vectors most similar to [0.3, 0.2].
curl -H "content-type: application/json" -XPOST -d '{ "query": { "sum": [{"feature":[0.3,0.2], "field":"feature1"}]}}' http://127.0.0.1:9001/test/test/_search
#幸运的话你会看到下面的结果
{
"took":6,
"timed_out":false,
"_shards":{
"total":1,
"failed":0,
"successful":1
},
"hits":{
"total":3,
"max_score":0.36055511236190796,
"hits":[
{
"_index":"test",
"_type":"test",
"_id":"AXEb_HiWDMR65Skqx3mn",
"_score":0.36055511236190796,
"_extra":{
"vector_result":[
{
"field":"feature1",
"source":"",
"score":0.36055511236190796
}
]
},
"_version":1,
"_source":{
"url":"3"
}
},
{
"_index":"test",
"_type":"test",
"_id":"AXEb_GDGDMR65Skqx3ml",
"_score":0.3535534143447876,
"_extra":{
"vector_result":[
{
"field":"feature1",
"source":"",
"score":0.3535534143447876
}
]
},
"_version":1,
"_source":{
"url":"2"
}
},
{
"_index":"test",
"_type":"test",
"_id":"AXEb_DwYDMR65Skqx3mj",
"_score":0.3130495250225067,
"_extra":{
"vector_result":[
{
"field":"feature1",
"source":"",
"score":0.31304952502250671
}
]
},
"_version":1,
"_source":{
"url":"1"
}
}
]
}
}
细心的你会发现计算出来的分值有点怪,[0.3, 0.2]和[0.3, 0.2]的内积得到的值居然是0.36055511236190796,这是因为如果建表的时候选择的度量方式是InnerProduct,vearch插入的时候会帮你把向量归一化,即每一维度都除以该向量的L2范数,所以[0.3, 0.2]在vearch中的值其实是
$\lbrack \frac{0.3}{\sqrt{0.3^2+0.2^2}}, \frac{0.2}{\sqrt{0.3^2+0.2^2}} \rbrack$
, 然而搜索的时候vearch并没有帮你进行处理,所以得到的分值就变成了
$[0.3, 0.2] \cdot \lbrack \frac{0.3}{\sqrt{0.3^2+0.2^2}}, \frac{0.2}{\sqrt{0.3^2+0.2^2}} \rbrack = \frac{0.3^2+0.2^2}{\sqrt{0.3^2+0.2^2}} = \sqrt{0.13} \approx 0.36$
至此,vearch的整个流程都已经走通,下面会再介绍一些基于vearch的应用,例如
基于vearch快速实现图像检索系统
基于vearch快速实现人脸识别系统
致谢:
https://github.com/vearch/vearch
https://github.com/facebookresearch/faiss
https://github.com/facebook/rocksdb