对于矩阵运算,著名的C++库有Eigen与C++版本的Numpy——NumCpp,Python库有NumPy与带GPU加速的Numpy版本CuPy。博客园|Eigen vs Numpy时间对比中对Eigen与Numpy的效率进行了比较,但是比较时忽略了C++的编译模式[DeBug与Release]对结果的影响,对比出的结果是不够准确的。本篇对Eigen、NumCpp、NumPy以及CuPy四种主流矩阵运算库的计算效率进行比较,以备实战时作为考量的一点依据。
本博客主要测试四种库对静态矩阵与动态矩阵运算的效率,主要分为两个项目:一)静态矩阵计算;二)动态矩阵计算。
假定有向量集 D = { d ⃗ 1 ∈ R M × 1 , . . . , d ⃗ N ∈ R M × 1 } D=\{\vec{d}_{1}\in R^{M\times 1},...,\vec{d}_{N}\in R^{M\times 1}\} D={d1∈RM×1,...,dN∈RM×1}与向量 x ⃗ ∈ R M × 1 \vec{x}\in R^{M\times 1} x∈RM×1,测试计算以下函数为:
y i = exp ( − ∣ ∣ x ⃗ − d ⃗ i ∣ ∣ 2 2 δ 2 ) , i = 1 , 2 , . . . , 2000 y_{i}=\exp(\frac{-||\vec{x}-\vec{d}_{i}||^{2}}{2\delta^2}),i=1,2,...,2000 yi=exp(2δ2−∣∣x−di∣∣2),i=1,2,...,2000
测试数据总共分成三类:
1)小型矩阵与向量计算,
N
=
M
=
20
N=M=20
N=M=20;
2)中型矩阵与向量计算,
N
=
M
=
200
N=M=200
N=M=200;
3) 大型矩阵与向量计算,
N
=
M
=
2000
N=M=2000
N=M=2000。
#include <iostream>
#include <vector>
#include <fstream>
#include <cmath>
#include <sys/timeb.h>
#include "iomanip"
#include <Eigen/Dense>
#include "NumCpp.hpp"
#include "boost/filesystem.hpp"
using namespace Eigen;
using namespace std;
int M = 2000; // 20 200 2000
int N = 2000; // 20 200 2000
Eigen::RowVectorXd construct_Phi2eigen(Eigen::Vector2d x, Eigen::Matrix2Xd ALD_mat)
{
double delta = 1.2;
Eigen::Matrix2Xd dmat = ALD_mat.colwise() - x;
Eigen::Matrix2Xd dmat2 = dmat.array().pow(2);
Eigen::RowVectorXd dmat2sum = -dmat2.colwise().sum()/(2.0*delta*delta);
Eigen::RowVectorXd y = dmat2sum.array().exp();
return y;
}
nc::NdArray<double> contruct_Phi2numcpp(nc::NdArray<double> x, nc::NdArray<double> ALD_buf)
{
double delta = 1.2;
nc::NdArray<double> c = ALD_buf - nc::tile(x, {ALD_buf.shape().rows , 1});
nc::NdArray<double> e = nc::exp(-nc::sum(nc::power(c, 2), nc::Axis::COL)/(2.0*delta*delta));
return e;
}
int main()
{
Eigen::Matrix2Xd X_mat = Eigen::Matrix2Xd::Random(M, N);
Eigen::Vector2d x = Eigen::Vector2d::Random(M, 1);
struct timeb startTime, endTime;
std::cout<<"test the time cost for Eigen with matrix"<<std::endl;
ftime(&startTime);
for (size_t i = 0; i < 10000; i++)
{
auto phi2 = construct_Phi2eigen(x, X_mat);
}
ftime(&endTime);
std::cout << "time cost is:" << (endTime.time-startTime.time)*1000 + (endTime.millitm - startTime.millitm) << "ms" << std::endl;
nc::NdArray<double> D_arr = nc::random::rand<double>({N, M});
nc::NdArray<double> x1 = nc::random::rand<double>({1, M});
std::cout<<"test the time cost for numcpp"<<std::endl;
ftime(&startTime);
for (size_t i = 0; i < 10000; i++)
{
auto phi3 = contruct_Phi2numcpp(x1, D_arr);
// std::cout<<phi3<<std::endl;
}
ftime(&endTime);
std::cout << "time cost is:" << (endTime.time-startTime.time)*1000 + (endTime.millitm - startTime.millitm) << "ms" << std::endl;
}
#include <iostream>
#include <vector>
#include <fstream>
#include <cmath>
#include <sys/timeb.h>
#include"iomanip"
#include <Eigen/Dense>
#include"NumCpp.hpp"
#include"boost/filesystem.hpp"
using namespace Eigen;
using namespace std;
int M = 2; // 2 20 200
int N = 2; // 2 20 200
Eigen::RowVectorXf construct_Phi2eigen(Eigen::VectorXf x, Eigen::MatrixXf ALD_mat)
{
float delta = 1.2;
Eigen::MatrixXf dmat = ALD_mat.colwise() - x;
Eigen::MatrixXf dmat2 = dmat.array().pow(2);
Eigen::RowVectorXf dmat2sum = -dmat2.colwise().sum()/(2.0f*delta*delta);
Eigen::RowVectorXf y = dmat2sum.array().exp();
return y;
}
nc::NdArray<float> contruct_Phi2numcpp(nc::NdArray<float> x, nc::NdArray<float> ALD_buf)
{
float delta = 1.2;
nc::NdArray<float> c = ALD_buf - nc::tile(x, {ALD_buf.shape().rows , 1});
nc::NdArray<float> e = nc::exp(-nc::sum(nc::power(c, 2), nc::Axis::COL)/(2.0f*delta*delta));
return e;
}
int main()
{
Eigen::MatrixXf X_mat = Eigen::MatrixXf::Random(M, N);
Eigen::VectorXf x = Eigen::VectorXf::Random(M, 1);
struct timeb startTime, endTime;
std::cout<<"test the time cost for Eigen with matrix"<<std::endl;
ftime(&startTime);
for (size_t i = 0; i < 10000; i++)
{
auto phi2 = construct_Phi2eigen(x, X_mat);
}
ftime(&endTime);
std::cout << "time cost is:" << (endTime.time-startTime.time)*1000 + (endTime.millitm - startTime.millitm) << "ms" << std::endl;
nc::NdArray<float> x1 = nc::random::rand<float>({1, M});
nc::NdArray<float> D_arr = nc::random::rand<float>({N, M});
std::cout<<"test the time cost for numcpp"<<std::endl;
ftime(&startTime);
for (size_t i = 0; i < 10000; i++)
{
auto phi3 = contruct_Phi2numcpp(x1, D_arr);
// std::cout<<phi3<<std::endl;
}
ftime(&endTime);
std::cout << "time cost is:" << (endTime.time-startTime.time)*1000 + (endTime.millitm - startTime.millitm) << "ms" << std::endl;
}
import cupy as cp
import numpy as np
import time
delta = 1.2
M = 2 # 2 20 200
N = 2 # 2 20 200
x_cp = cp.random.rand(1,M)
D_arr_cp = cp.random.rand(N, M)
x_np = np.random.rand(1,M)
D_arr_np = np.random.rand(N, M)
def construct_Phi2_cp(x, ALD_buf):
ALD_arr = cp.array(ALD_buf)
x = cp.tile(x, (len(ALD_buf), 1))
dX = ALD_arr - x
return cp.exp(-cp.sum(cp.power(dX, 2), axis=1)/2*delta*delta)
def construct_Phi2_np(x, ALD_buf):
ALD_arr = np.array(ALD_buf)
x = np.tile(x, (len(ALD_buf), 1))
dX = ALD_arr - x
return np.exp(-np.sum(np.power(dX, 2), axis=1)/2*delta*delta)
def testtimecp():
for i in range(1000):
construct_Phi2_cp(x_cp, D_arr_cp)
def testtimenp():
for i in range(1000):
construct_Phi2_np(x_np, D_arr_np)
print("time cost for cupy:")
%time testtimecp()
print("time cost for numpy")
%time testtimenp()
矩阵大小 | M=2, N=2 | M=20, N=20 | M=200, N=200 | M=200, N=400 | M=400, N=200 |
---|---|---|---|---|---|
Eigen | f(6ms)、d(6ms) | f(22ms)、d(30ms) | f(685ms)、d(2.283s) | f(2.222s)、d(4.845s)) | f(1.567s)、d(4.843s) |
NumCpp | f(10ms)、d(11ms) | f(38ms)、d(32ms) | f(1.51s)、d(1.854) | f(3.035s)、d(3.619s) | f(3.255s)、d(7.278s) |
NumPy | 195ms | 234ms | 3.25s | 6.32s | 6.32s |
CuPy | 1.83s | 1.75s | 1.75s | 1.71s | 1.81s |
#include <iostream>
#include <vector>
#include <fstream>
#include <cmath>
#include <sys/timeb.h>
#include"iomanip"
#include <Eigen/Dense>
#include"NumCpp.hpp"
#include"boost/filesystem.hpp"
using namespace Eigen;
using namespace std;
int M = 200; // 2 20 200
int N = 200; // 2 20 200
int main()
{
Eigen::Matrix2Xf X_mat = Eigen::Matrix2Xf::Random(M, N);
Eigen::Vector2f x = Eigen::Vector2f::Random(M, 1);
std::cout<<"test the time cost for Eigen with matrix"<<std::endl;
struct timeb startTime, endTime;
ftime(&startTime);
for (size_t i = 0; i < 10000; i++)
{
X_mat = (Eigen::MatrixXf(X_mat.rows(), X_mat.cols()+x.cols())<<X_mat, x).finished();
}
ftime(&endTime);
std::cout << "time cost is:" << (endTime.time-startTime.time)*1000 + (endTime.millitm - startTime.millitm) << "ms" << std::endl;
nc::NdArray<float> x1 = nc::random::rand<float>({M, 1});
nc::NdArray<float> D_arr = nc::random::rand<float>({M, N});
std::cout<<"test the time cost for numcpp"<<std::endl;
ftime(&startTime);
for (size_t i = 0; i < 10000; i++)
{
D_arr = nc::column_stack({D_arr, x1});
// std::cout<<phi3<<std::endl;
}
ftime(&endTime);
std::cout << "time cost is:" << (endTime.time-startTime.time)*1000 + (endTime.millitm - startTime.millitm) << "ms" << std::endl;
}
#include <iostream>
#include <vector>
#include <fstream>
#include <cmath>
#include <sys/timeb.h>
#include"iomanip"
#include <Eigen/Dense>
#include"NumCpp.hpp"
#include"boost/filesystem.hpp"
using namespace Eigen;
using namespace std;
int M = 200; // 2 20 200
int N = 200; // 2 20 200
int main()
{
Eigen::MatrixXd X_mat = Eigen::MatrixXd::Random(M, N);
Eigen::MatrixXd x = Eigen::VectorXd::Random(M, 1);
std::cout<<"test the time cost for Eigen with matrix"<<std::endl;
struct timeb startTime, endTime;
ftime(&startTime);
for (size_t i = 0; i < 10000; i++)
{
X_mat = (Eigen::MatrixXd(X_mat.rows(), X_mat.cols()+x.cols())<<X_mat, x).finished();
}
ftime(&endTime);
std::cout << "time cost is:" << (endTime.time-startTime.time)*1000 + (endTime.millitm - startTime.millitm) << "ms" << std::endl;
nc::NdArray<double> x1 = nc::random::rand<double>({M, 1});
nc::NdArray<double> D_arr = nc::random::rand<double>({M, N});
std::cout<<"test the time cost for numcpp"<<std::endl;
ftime(&startTime);
for (size_t i = 0; i < 10000; i++)
{
D_arr = nc::column_stack({D_arr, x1});
// std::cout<<phi3<<std::endl;
}
ftime(&endTime);
std::cout << "time cost is:" << (endTime.time-startTime.time)*1000 + (endTime.millitm - startTime.millitm) << "ms" << std::endl;
}
import cupy as cp
import numpy as np
import time
M = 2 # 2 20 200
N = 2 # 2 20 200
x1_np = np.random.rand(M, 1)
D_arr_np = np.random.rand(M, N)
x1_cp = cp.random.rand(M, 1)
D_arr_cp = cp.random.rand(M, N)
def testtimecp():
global D_arr_cp
for i in range(10000):
D_arr_cp = cp.column_stack((D_arr_cp, x1_cp))
def testtimenp():
global D_arr_np
for i in range(10000):
D_arr_np = np.column_stack((D_arr_np, x1_np))
print("time cost for cupy:")
%time testtimecp()
print("time cost for numpy")
%time testtimenp()
矩阵大小 | M=2, N=2 | M=20, N=20 | M=200, N=200 |
---|---|---|---|
Eigen | f(203ms)、d(377ms) | f(1.427s)、d(2.875s) | f(20.29s)、d(42.458s) |
NumCpp | f(130ms)、d(144ms) | f(1.416s)、d(1.319s) | f(15.673s)、d(18.797s) |
NumPy | 54ms | 324ms | 8.36s |
CuPy | 308ms | 312ms | 3.03s |
写了一个动态RBF网络,原来一直在python语言下用,采用NumPy实现。最近需要在C++环境中使用,于是用Eigen实现了。发现,Eigen实现版本的计算成本比NumPy还高。于是又实现了NumCpp版本,发现还是NumPy效率最高。经过以上试验,终于窥见一些缘由:算法里同时需要大量矩阵乘、加、减运算,同时又有大量动态矩阵运算(增加一行或一列)。矩阵乘、加、减的运算相差较各库的动态矩阵计算效率相差小。这也是为什么会产生与直觉不一样的结果。其中,double型计算更加耗时。。。注意和下面结果相矛盾!!!
最终,我没用Eigen,而是采用C++调用Pyhon(也即NumPy实现)的方案。
下面列出动态RBF网络算法利用各库实现的计算成本:
方法 | Eigen | NumCpp | NumPy |
---|---|---|---|
time cost | f(2.575s)、d(1.355s) | f(12.048s)、d(5.707s) | 770ms |
其中,float型计算更加耗时。。。怎么和上面结果相矛盾!!!
C++编译存在DeBug与Release两种方式,DeBug一般用于调试,编译的版本会存储大量调试信息,实际运行时并不能反应算法的真实计算成本。一定要利用Release方式编译测试代码。
这里要感谢同事小张的提醒。最初测试时,发现Eigen和NumCpp的效率都要比NumPy要差。今天午饭时跟小张抱怨这个疑惑时,小张便给出问题的所在。
在CMakeLists.txt文件里添加下面两行,启动Release编译方式,并启动最高效率编译模式。
SET(CMAKE_BUILD_TYPE "Release")
by windSeS
2020-10-30
2020-11-5修补