我想使用寄存器(逐行信息)通过向量算法创建矩阵乘法。打开外循环4次我有空洞matvec_XMM(双* a,双* x,双* y,整数n,整数磅)函数的问题,它返回了不好的结果,这是算法wchich我必须使用:
i = 1,n,4
r0 = r1 = r2 = r3 = 0
j = 1,n,8
r0 = r0 + aij * xj + ai,j+1 * xj+1 + … + ai,j+7 * xj+7
r1 = r1 + ai+1,j * xj + ai+1,j+1 * xj+1 + … + ai+1,j+7 * xj+7
r2 = r2 + ai+2,j * xj + ai+2,j+1 * xj+1 + … + ai+2,j+7 * xj+7
r3 = r3 + ai+3,j * xj + ai+3,j+1 * xj+1 + … + ai+3,j+7 * xj+7
end j
yi = r0; yi+1 = r1; yi+2 = r2; yi+3 = r3;
end i
它是ma代码:
#include "stdafx.h"
#include <iostream>
#include "mvec.h"
#include <emmintrin.h>
using namespace std;
void mult_naive(double *a, double *x, double *y, int n)
{
int i, j, ij;
double register reg;
for(i=0, ij=0; i<n; ++i)
{
reg = 0;
for(j=0; j<n; ++j, ++ij)
{
reg += a[ij]*x[j];
}
y[i] = reg;
}
}
void matvec_XMM(double* a, double* x, double* y, int n, int lb)
{
int i, j;
memset((void *)y, 0, n*sizeof(double));
double res0[2];
double res1[2];
double res2[2];
double res3[2];
__m128d ry0, ry1, ry2, ry3, ra0, rx0;
double *ptr_a, *ptr_x, *ptr_y;
const int nr = 4;
ptr_a = a;
for (i = 0; i < n; i+=nr)
{
ry0 = _mm_setzero_pd();
ry1 = _mm_setzero_pd();
ry2 = _mm_setzero_pd();
ry3 = _mm_setzero_pd();
ptr_y = &y[i];
ptr_x = x;
for (j = 0; j<n; j+=lb)
{
_mm_prefetch((const char *)(ptr_a + lb*nr), _MM_HINT_NTA);
_mm_prefetch((const char *)(ptr_x + lb), _MM_HINT_T0);
//----1
rx0 = _mm_load_pd(ptr_x);
ra0 = _mm_load_pd(ptr_a);
ra0 = _mm_mul_pd(ra0, rx0);
ry0 = _mm_add_pd(ry0, ra0);
ra0 = _mm_load_pd(ptr_a + 2);
ra0 = _mm_mul_pd(ra0, rx0);
ry1 = _mm_add_pd(ry1, ra0);
ra0 = _mm_load_pd(ptr_a + 4);
ra0 = _mm_mul_pd(ra0, rx0);
ry2 = _mm_add_pd(ry2, ra0);
ra0 = _mm_load_pd(ptr_a + 6);
ra0 = _mm_mul_pd(ra0, rx0);
ry3 = _mm_add_pd(ry3, ra0);
//----2
rx0 = _mm_load_pd(ptr_x + 2);
ra0 = _mm_load_pd(ptr_a + 8);
ra0 = _mm_mul_pd(ra0, rx0);
ry0 = _mm_add_pd(ry0, ra0);
ra0 = _mm_load_pd(ptr_a + 10);
ra0 = _mm_mul_pd(ra0, rx0);
ry1 = _mm_add_pd(ry1, ra0);
ra0 = _mm_load_pd(ptr_a + 12);
ra0 = _mm_mul_pd(ra0, rx0);
ry2 = _mm_add_pd(ry2, ra0);
ra0 = _mm_load_pd(ptr_a + 14);
ra0 = _mm_mul_pd(ra0, rx0);
ry3 = _mm_add_pd(ry3, ra0);
//----3
rx0 = _mm_load_pd(ptr_x + 4);
ra0 = _mm_load_pd(ptr_a + 16);
ra0 = _mm_mul_pd(ra0, rx0);
ry0 = _mm_add_pd(ry0, ra0);
ra0 = _mm_mul_pd(ra0, rx0);
ry1 = _mm_add_pd(ry1, ra0);
ra0 = _mm_load_pd(ptr_a + 20);
ra0 = _mm_mul_pd(ra0, rx0);
ry2 = _mm_add_pd(ry2, ra0);
ra0 = _mm_load_pd(ptr_a + 22);
ra0 = _mm_mul_pd(ra0, rx0);
ry3 = _mm_add_pd(ry3, ra0);
//----4
rx0 = _mm_load_pd(ptr_x + 6);
ra0 = _mm_load_pd(ptr_a + 24);
ra0 = _mm_mul_pd(ra0, rx0);
ry0 = _mm_add_pd(ry0, ra0);
ra0 = _mm_load_pd(ptr_a + 26);
ra0 = _mm_mul_pd(ra0, rx0);
ry1 = _mm_add_pd(ry1, ra0);
ra0 = _mm_load_pd(ptr_a + 28);
ra0 = _mm_mul_pd(ra0, rx0);
ry2 = _mm_add_pd(ry2, ra0);
ra0 = _mm_load_pd(ptr_a + 30);
ra0 = _mm_mul_pd(ra0, rx0);
ry3 = _mm_add_pd(ry3, ra0);
ptr_a += lb*nr;
ptr_x += lb;
}
_mm_store_pd(res0, ry0);
*ptr_y = res0[0] + res0[1];
_mm_store_pd(res1, ry1);
*(ptr_y + 1) = res1[0] + res1[1];
_mm_store_pd(res2, ry2);
*(ptr_y + 2)= res2[0] + res2[1];
_mm_store_pd(res3, ry3);
*(ptr_y + 3) = res3[0] + res3[1];
}
}
#include "stdafx.h"
#include <iostream>
#include <cmath>
#include "windows.h"
#include "mvec.h"
using namespace std;
int main(int argc, char* argv[])
{
double *a, *x, *y, *z;
int n;
DWORD tstart;
const int lb = 8;
double elaps_time;
cout << "Program Mat_Vect: performance y = y +A*x\n";
#ifdef _DEBUG
cout << "DEBUG version\n";
#else
cout << "RELEASE version\n";
#endif
cout << "Input dimension\n";
cin >> n;
n = n/lb;
n = lb*n;
try
{
a = new double [n*n];
x = new double [n+1];
y = new double [n];
z = new double [n];
}
catch(bad_alloc aa)
{
cout << "memory allocation error" << endl;
system("pause");
exit(1);
}
memset((void *)a, 0, _msize((void *)a));
memset((void *)x, 0, _msize((void *)x));
memset((void *)y, 0, _msize((void *)y));
cout << "start\n";
prepare(a, x, n);
//-------------------------naive algorithm-----------------------//
cout << "naive algorithm: \n";
tstart = GetTickCount();
mult_naive(a, x, z, n);
elaps_time = (double)(GetTickCount()-tstart)/1000.0;
cout << "naive algorithm: " << elaps_time << " sec" << endl;
//-------------------------algorithm which uses XMM registers-----------------------//
delete [] a;
delete [] x;
a = (double *)_aligned_malloc(n*n*sizeof(double), 16);
x = (double *)_aligned_malloc(n*sizeof(double), 16);
if(!a || !x)
{
cout << "memory allocation error" << endl;
system("pause");
exit(1);
}
cout << "algorithm which uses XMM: \n";
prepare(a, x, n);
tstart = GetTickCount();
matvec_XMM(a, x, y, n, lb);
elaps_time = (double)(GetTickCount()-tstart)/1000.0;
check(y, z, n);
cout << "algorithm which uses XMM: " << elaps_time << " sec" << endl;
delete [] y;
delete [] z;
_aligned_free(a);
_aligned_free(x);
system("pause");
return 0;
}
void check(double *y, double *z, int n)
{
int i;
for(i=0; i<n; i++)
{
if(fabs(z[i] - y[i]) > 1.0e-9)
{
cout << "error\n";
return;
}
}
cout << "OK\n";
}
void prepare(double *a, double *x, int n)
{
int i, j, ij;
for(i=0, ij=0; i<n; i++)
{
for(j=0; j<n; j++, ij++)
{
if(i == j)
a[ij] = 10.0;
else
a[ij] = (double)(i+1);
}
x[i] = 1.0;
}
}
我找到了解决办法
void matvec_XMM(double* a, double* x, double* y, int n, int lb)
{
int i, j;
memset((void *)y, 0, n*sizeof(double));
__declspec(align(16)) double res0[2];
__declspec(align(16)) double res1[2];
__declspec(align(16)) double res2[2];
__declspec(align(16)) double res3[2];
__m128d ry0, ry1, ry2, ry3, ra0, ra1, ra2, ra3, rx0;
double *ptr_a1, *ptr_a2, *ptr_a3, *ptr_a4, *ptr_x, *ptr_y;
const int nr = 4;
for (i = 0; i < n; i+=nr)
{
ry0 = _mm_setzero_pd();
ry1 = _mm_setzero_pd();
ry2 = _mm_setzero_pd();
ry3 = _mm_setzero_pd();
ptr_y = &y[i];
for (j = 0; j<n; j+=lb)
{
ptr_a1 = &a[i * n + j];
ptr_a2 = &a[(i + 1) * n + j];
ptr_a3 = &a[(i + 2) * n + j];
ptr_a4 = &a[(i + 3) * n + j];
ptr_x = &x[j];
_mm_prefetch((const char *)(ptr_a1 + lb), _MM_HINT_NTA);
_mm_prefetch((const char *)(ptr_a2 + lb), _MM_HINT_NTA);
_mm_prefetch((const char *)(ptr_a3 + lb), _MM_HINT_NTA);
_mm_prefetch((const char *)(ptr_a4 + lb), _MM_HINT_NTA);
_mm_prefetch((const char *)(ptr_x + lb), _MM_HINT_T0);
//-------------------------------------1
rx0 = _mm_load_pd(ptr_x);
ra0 = _mm_load_pd(ptr_a1);
ra1 = _mm_load_pd(ptr_a2);
ra2 = _mm_load_pd(ptr_a3);
ra3 = _mm_load_pd(ptr_a4);
ra0 = _mm_mul_pd(ra0, rx0);
ra1 = _mm_mul_pd(ra1, rx0);
ra2 = _mm_mul_pd(ra2, rx0);
ra3 = _mm_mul_pd(ra3, rx0);
ry0 = _mm_add_pd(ry0, ra0);
ry1 = _mm_add_pd(ry1, ra1);
ry2 = _mm_add_pd(ry2, ra2);
ry3 = _mm_add_pd(ry3, ra3);
//------------------------------------2
rx0 = _mm_load_pd(ptr_x + 2);
ra0 = _mm_load_pd(ptr_a1 + 2);
ra1 = _mm_load_pd(ptr_a2 + 2);
ra2 = _mm_load_pd(ptr_a3 + 2);
ra3 = _mm_load_pd(ptr_a4 + 2);
ra0 = _mm_mul_pd(ra0, rx0);
ra1 = _mm_mul_pd(ra1, rx0);
ra2 = _mm_mul_pd(ra2, rx0);
ra3 = _mm_mul_pd(ra3, rx0);
ry0 = _mm_add_pd(ry0, ra0);
ry1 = _mm_add_pd(ry1, ra1);
ry2 = _mm_add_pd(ry2, ra2);
ry3 = _mm_add_pd(ry3, ra3);
//-----------------------------------3
rx0 = _mm_load_pd(ptr_x + 4);
ra0 = _mm_load_pd(ptr_a1 + 4);
ra1 = _mm_load_pd(ptr_a2 + 4);
ra2 = _mm_load_pd(ptr_a3 + 4);
ra3 = _mm_load_pd(ptr_a4 + 4);
ra0 = _mm_mul_pd(ra0, rx0);
ra1 = _mm_mul_pd(ra1, rx0);
ra2 = _mm_mul_pd(ra2, rx0);
ra3 = _mm_mul_pd(ra3, rx0);
ry0 = _mm_add_pd(ry0, ra0);
ry1 = _mm_add_pd(ry1, ra1);
ry2 = _mm_add_pd(ry2, ra2);
ry3 = _mm_add_pd(ry3, ra3);
//----------------------------------4
rx0 = _mm_load_pd(ptr_x + 6);
ra0 = _mm_load_pd(ptr_a1 + 6);
ra1 = _mm_load_pd(ptr_a2 + 6);
ra2 = _mm_load_pd(ptr_a3 + 6);
ra3 = _mm_load_pd(ptr_a4 + 6);
ra0 = _mm_mul_pd(ra0, rx0);
ra1 = _mm_mul_pd(ra1, rx0);
ra2 = _mm_mul_pd(ra2, rx0);
ra3 = _mm_mul_pd(ra3, rx0);
ry0 = _mm_add_pd(ry0, ra0);
ry1 = _mm_add_pd(ry1, ra1);
ry2 = _mm_add_pd(ry2, ra2);
ry3 = _mm_add_pd(ry3, ra3);
}
_mm_store_pd(res0, ry0);
*ptr_y = res0[0] + res0[1];
_mm_store_pd(res1, ry1);
*(ptr_y + 1) = res1[0] + res1[1];
_mm_store_pd(res2, ry2);
*(ptr_y + 2) = res2[0] + res2[1];
_mm_store_pd(res3, ry3);
*(ptr_y + 3) = res3[0] + res3[1];
}
}
主要内容:逐元素矩阵乘法,矩阵乘积运算,矩阵点积矩阵乘法是将两个矩阵作为输入值,并将 A 矩阵的行与 B 矩阵的列对应位置相乘再相加,从而生成一个新矩阵,如下图所示: 注意:必须确保第一个矩阵中的行数等于第二个矩阵中的列数,否则不能进行矩阵乘法运算。 图1:矩阵乘法 矩阵乘法运算被称为向量化操作,向量化的主要目的是减少使用的 for 循环次数或者根本不使用。这样做的目的是为了加速程序的计算。 下面介绍 NumPy 提供的三种矩阵乘法,从而进一步
问题内容: 在numpy中,我有N个3x3矩阵的数组。这将是我如何存储它们的示例(我正在提取内容): 我也有一个由3个向量组成的数组,这将是一个示例: 我似乎无法弄清楚如何通过numpy将它们相乘,从而实现如下效果: 与的形状(在投射到阵列)是。但是,由于速度的原因,列表实现是不可能的。 我尝试了各种换位的np.dot,但最终结果没有得到正确的形状。 问题答案: 使用 脚步 : 1)保持第一根轴对
考虑两个矩阵A和B.如果A是mxn矩阵而B是nxp矩阵,它们可以相乘以产生mxn矩阵C.只有当A中的列数n等于数量时才可以进行矩阵乘法在B.中的行n 在矩阵乘法中,第一矩阵中的行的元素与第二矩阵中的对应列相乘。 在得到的矩阵C中的第 (i,j)位置中的每个元素是第i行的第i行中的元素与第二矩阵的第 j列中的对应元素的乘积的总和。 MATLAB中的矩阵乘法是使用*运算符执行的。 例子 (Exampl
我试图乘以两个块对称矩阵(矩阵大小矩阵大小)。我想执行块矩阵乘法(将一个矩阵分成多个块大小矩阵,并将相应的块相乘)。我已经写了一些代码,但想改进它,并存储主对角线以上的块,但我没有任何想法。如果可能的话,你们能帮忙吗?
注:我也在这里的Eigen论坛上发表了这篇文章 我想用一个3x3矩阵预乘3xN个矩阵,即,变换3D点,如p_dest=T*p_source 初始化矩阵后: 而且 进行NT重复只是为了计算平均时间 我很惊讶逐列乘法比直接乘法快4/5倍(如果我不使用,直接乘法甚至更慢,但这没有问题,因为它是临时复制)。我尝试将NUMCOLS从0改为1000000,关系是线性的。
C++:15秒(源) Python:6分13秒(来源) C++:45分钟(源) 蟒蛇:10小时后被杀死(来源) 为什么Strassen矩阵乘法比标准矩阵乘法慢得多? null null null