当前位置: 首页 > 知识库问答 >
问题:

C矩阵乘法

松灿
2023-03-14

我想利用 XMM 寄存器,通过向量化算法实现矩阵与向量的乘法(矩阵按行存储),并把外层循环展开 4 次。我的函数 void matvec_XMM(double* a, double* x, double* y, int n, int lb) 有问题:它返回的结果不正确。下面是我必须使用的算法:

i = 1,n,4
   r0 = r1 = r2 = r3 = 0
   j = 1,n,8
      r0 = r0 + aij * xj + ai,j+1 * xj+1 + … + ai,j+7 * xj+7
      r1 = r1 + ai+1,j * xj + ai+1,j+1 * xj+1 + … + ai+1,j+7 * xj+7
      r2 = r2 + ai+2,j * xj + ai+2,j+1 * xj+1 + … + ai+2,j+7 * xj+7
      r3 = r3 + ai+3,j * xj + ai+3,j+1 * xj+1 + … + ai+3,j+7 * xj+7
   end j
   yi = r0; yi+1 = r1; yi+2 = r2; yi+3 = r3; 
end i

这是我的代码

#include "stdafx.h"
#include <iostream>
#include "mvec.h"
#include <emmintrin.h>

using namespace std;

/**
 * Reference (scalar) matrix-vector product: y = A * x.
 *
 * @param a  n*n matrix stored row by row (element (i,j) lives at a[i*n + j]).
 * @param x  input vector of n elements.
 * @param y  output vector of n elements; every entry is overwritten.
 * @param n  matrix dimension; nothing is read or written when n <= 0.
 */
void mult_naive(double *a, double *x, double *y, int n)
{
    // Walk the matrix one row at a time instead of keeping a running int
    // index, so the index cannot overflow for large n.
    const double *row = a;

    for (int i = 0; i < n; ++i, row += n)
    {
        // The former `register` qualifier was dropped: the storage class was
        // removed in C++17 and compilers ignored the hint anyway.
        double acc = 0.0;

        for (int j = 0; j < n; ++j)
        {
            acc += row[j] * x[j];
        }

        y[i] = acc;
    }
}

void matvec_XMM(double* a, double* x, double* y, int n, int lb)
{
int i, j;

memset((void *)y, 0, n*sizeof(double));
double res0[2];
double res1[2];
double res2[2];
double res3[2];

__m128d ry0, ry1, ry2, ry3, ra0, rx0;
double *ptr_a, *ptr_x, *ptr_y;
const int nr = 4;

ptr_a = a;

for (i = 0; i < n; i+=nr)   
{
    ry0 = _mm_setzero_pd();
    ry1 = _mm_setzero_pd();
    ry2 = _mm_setzero_pd();
    ry3 = _mm_setzero_pd(); 

    ptr_y = &y[i];
    ptr_x = x;

    for (j = 0; j<n; j+=lb)
    {

        _mm_prefetch((const char *)(ptr_a + lb*nr), _MM_HINT_NTA); 
        _mm_prefetch((const char *)(ptr_x + lb), _MM_HINT_T0);

        //----1
        rx0 = _mm_load_pd(ptr_x);       
        ra0 = _mm_load_pd(ptr_a);
        ra0 = _mm_mul_pd(ra0, rx0);
        ry0 = _mm_add_pd(ry0, ra0);

        ra0 = _mm_load_pd(ptr_a + 2); 
        ra0 = _mm_mul_pd(ra0, rx0);
        ry1 = _mm_add_pd(ry1, ra0);

        ra0 = _mm_load_pd(ptr_a + 4);
        ra0 = _mm_mul_pd(ra0, rx0);
        ry2 = _mm_add_pd(ry2, ra0);

        ra0 = _mm_load_pd(ptr_a + 6);
        ra0 = _mm_mul_pd(ra0, rx0);
        ry3 = _mm_add_pd(ry3, ra0);

            //----2
        rx0 = _mm_load_pd(ptr_x + 2);   
        ra0 = _mm_load_pd(ptr_a + 8);
        ra0 = _mm_mul_pd(ra0, rx0);
        ry0 = _mm_add_pd(ry0, ra0);

        ra0 = _mm_load_pd(ptr_a + 10);  
        ra0 = _mm_mul_pd(ra0, rx0);
        ry1 = _mm_add_pd(ry1, ra0);

        ra0 = _mm_load_pd(ptr_a + 12);
        ra0 = _mm_mul_pd(ra0, rx0);
        ry2 = _mm_add_pd(ry2, ra0);

        ra0 = _mm_load_pd(ptr_a + 14);
        ra0 = _mm_mul_pd(ra0, rx0);
        ry3 = _mm_add_pd(ry3, ra0);

        //----3
        rx0 = _mm_load_pd(ptr_x + 4);       
        ra0 = _mm_load_pd(ptr_a + 16);
        ra0 = _mm_mul_pd(ra0, rx0);
        ry0 = _mm_add_pd(ry0, ra0);

        ra0 = _mm_mul_pd(ra0, rx0);
        ry1 = _mm_add_pd(ry1, ra0);

        ra0 = _mm_load_pd(ptr_a + 20);
        ra0 = _mm_mul_pd(ra0, rx0);
        ry2 = _mm_add_pd(ry2, ra0);

        ra0 = _mm_load_pd(ptr_a + 22);
        ra0 = _mm_mul_pd(ra0, rx0);
        ry3 = _mm_add_pd(ry3, ra0);

        //----4
        rx0 = _mm_load_pd(ptr_x + 6);       
        ra0 = _mm_load_pd(ptr_a + 24);
        ra0 = _mm_mul_pd(ra0, rx0);
        ry0 = _mm_add_pd(ry0, ra0);

        ra0 = _mm_load_pd(ptr_a + 26);  
        ra0 = _mm_mul_pd(ra0, rx0);
        ry1 = _mm_add_pd(ry1, ra0);

        ra0 = _mm_load_pd(ptr_a + 28);
        ra0 = _mm_mul_pd(ra0, rx0);
        ry2 = _mm_add_pd(ry2, ra0);

        ra0 = _mm_load_pd(ptr_a + 30);
        ra0 = _mm_mul_pd(ra0, rx0);
        ry3 = _mm_add_pd(ry3, ra0);

        ptr_a += lb*nr;
        ptr_x += lb;
    }

    _mm_store_pd(res0, ry0);
    *ptr_y = res0[0] + res0[1];

    _mm_store_pd(res1, ry1);
    *(ptr_y + 1) = res1[0] + res1[1];

    _mm_store_pd(res2, ry2);
    *(ptr_y + 2)= res2[0] + res2[1];

    _mm_store_pd(res3, ry3);
    *(ptr_y + 3) = res3[0] + res3[1];

}
}



#include "stdafx.h"
#include <iostream>
#include <cmath>
#include "windows.h"
#include "mvec.h"

using namespace std;


// Benchmark driver: reads a dimension, rounds it down to a multiple of lb,
// runs the scalar reference product and the XMM product on the same data,
// compares the two results and prints the elapsed time of each.
int main(int argc, char* argv[])
{
    double *a, *x, *y, *z;
    int n;
    DWORD tstart;
    const int lb = 8;
    double elaps_time;
    cout << "Program Mat_Vect: performance y = y +A*x\n";

#ifdef _DEBUG
    cout << "DEBUG version\n";
#else
    cout << "RELEASE version\n";
#endif

    cout << "Input dimension\n";
    // Reject unreadable input and dimensions that would round down to zero
    // (or are negative) instead of continuing with a meaningless size.
    if(!(cin >> n) || n < lb)
    {
        cout << "dimension must be an integer >= " << lb << endl;
        system("pause");
        return 1;
    }

    // Round down to a multiple of the block size.
    n = (n/lb)*lb;

    // Element counts in size_t so n*n is not evaluated in int.
    const size_t vec_len = (size_t)n;
    const size_t mat_len = vec_len*vec_len;

    try
    {
        a = new double [mat_len];
        x = new double [vec_len+1];
        y = new double [vec_len];
        z = new double [vec_len];
    }
    catch(const bad_alloc&)
    {
        cout << "memory allocation error" << endl;
        system("pause");
        exit(1);
    }

    // Sizes are spelled out explicitly: _msize is only documented for blocks
    // obtained from malloc/calloc/realloc, not for new[].
    memset((void *)a, 0, mat_len*sizeof(double));
    memset((void *)x, 0, (vec_len+1)*sizeof(double));
    memset((void *)y, 0, vec_len*sizeof(double));

    cout << "start\n";

    prepare(a, x, n);

    //-------------------------naive algorithm-----------------------//
    cout << "naive algorithm: \n";
    tstart = GetTickCount();
    mult_naive(a, x, z, n);
    elaps_time = (double)(GetTickCount()-tstart)/1000.0;
    cout << "naive algorithm: " << elaps_time << " sec" << endl;

    //-------------------------algorithm which uses XMM registers-----------------------//
    // Re-allocate the inputs with 16-byte alignment before the XMM run.
    delete [] a;
    delete [] x;
    a = (double *)_aligned_malloc(mat_len*sizeof(double), 16);
    x = (double *)_aligned_malloc(vec_len*sizeof(double), 16);
    if(!a || !x)
    {
        cout << "memory allocation error" << endl;
        system("pause");
        exit(1);
    }
    cout << "algorithm which uses XMM: \n";
    prepare(a, x, n);
    tstart = GetTickCount();
    matvec_XMM(a, x, y, n, lb);
    elaps_time = (double)(GetTickCount()-tstart)/1000.0;
    check(y, z, n);
    cout << "algorithm which uses XMM: " << elaps_time << " sec" << endl;

    delete [] y;
    delete [] z;
    _aligned_free(a);
    _aligned_free(x);

    system("pause");
    return 0;
}


/**
 * Compares two result vectors element by element and reports the outcome on
 * standard output: "error" at the first mismatch, "OK" if all n entries agree.
 *
 * @param y  vector under test (n elements).
 * @param z  reference vector (n elements).
 * @param n  number of elements to compare; n <= 0 prints "OK".
 */
void check(double *y, double *z, int n)
{
    const double tol = 1.0e-9;  // absolute tolerance per element

    for(int i = 0; i < n; i++)
    {
        // Written as a negated "<=" so that a NaN difference (every
        // comparison with NaN is false) is reported as an error, not as OK.
        if(!(std::fabs(z[i] - y[i]) <= tol))
        {
            std::cout << "error\n";
            return;
        }
    }

    std::cout << "OK\n";
}

/**
 * Fills the test data: every entry of row r of the n*n row-major matrix is
 * set to r + 1, except the diagonal entry, which is 10; every entry of x
 * is set to 1.
 *
 * @param a  matrix storage of n*n doubles, overwritten.
 * @param x  vector storage of at least n doubles; the first n are overwritten.
 * @param n  dimension; nothing is written when n <= 0.
 */
void prepare(double *a, double *x, int n)
{
    double *row = a;    // start of the row currently being filled

    for(int r = 0; r < n; ++r, row += n)
    {
        const double off_diag = (double)(r+1);

        for(int c = 0; c < n; ++c)
            row[c] = (c == r) ? 10.0 : off_diag;

        x[r] = 1.0;
    }
}

共有1个答案

皇甫鸿远
2023-03-14

我找到了解决办法

/**
 * SSE2 matrix-vector product y = A * x with the outer loop unrolled over four
 * rows; each SIMD step multiplies two adjacent columns of every row by the
 * matching pair of x.
 *
 * @param a   n*n matrix stored row by row (element (i,j) at a[i*n + j]);
 *            must be 16-byte aligned (aligned loads are used).
 * @param x   input vector of n elements; must be 16-byte aligned.
 * @param y   output vector of n elements; every entry is overwritten.
 * @param n   matrix dimension; must be a multiple of 4 and of lb.
 * @param lb  columns per prefetch block; must be positive and even.
 */
void matvec_XMM(double* a, double* x, double* y, int n, int lb)
{
    int i, j, k;

    // Scratch for splitting an accumulator into its two lanes. It is written
    // with an unaligned store, so no compiler-specific alignment is needed.
    double res[2];

    __m128d ry0, ry1, ry2, ry3, rx0;
    double *ptr_a1, *ptr_a2, *ptr_a3, *ptr_a4, *ptr_x, *ptr_y;
    const int nr = 4;       // rows handled per outer iteration

    // Row pointers are advanced by addition instead of indexing a[i*n + j],
    // so no i*n product is formed in int.
    ptr_a1 = a;

    for (i = 0; i < n; i += nr)
    {
        ry0 = _mm_setzero_pd();
        ry1 = _mm_setzero_pd();
        ry2 = _mm_setzero_pd();
        ry3 = _mm_setzero_pd();

        ptr_a2 = ptr_a1 + n;
        ptr_a3 = ptr_a2 + n;
        ptr_a4 = ptr_a3 + n;
        ptr_x = x;
        ptr_y = &y[i];

        for (j = 0; j < n; j += lb)
        {
            // Hint the next block of each row and of x; a prefetch is only a
            // hint, so an address past the end of the data is harmless.
            _mm_prefetch((const char *)(ptr_a1 + j + lb), _MM_HINT_NTA);
            _mm_prefetch((const char *)(ptr_a2 + j + lb), _MM_HINT_NTA);
            _mm_prefetch((const char *)(ptr_a3 + j + lb), _MM_HINT_NTA);
            _mm_prefetch((const char *)(ptr_a4 + j + lb), _MM_HINT_NTA);
            _mm_prefetch((const char *)(ptr_x + j + lb), _MM_HINT_T0);

            // Consume exactly lb columns, two at a time, so the amount of
            // work per block always matches the stride of the j loop.
            for (k = j; k < j + lb; k += 2)
            {
                rx0 = _mm_load_pd(ptr_x + k);

                ry0 = _mm_add_pd(ry0, _mm_mul_pd(_mm_load_pd(ptr_a1 + k), rx0));
                ry1 = _mm_add_pd(ry1, _mm_mul_pd(_mm_load_pd(ptr_a2 + k), rx0));
                ry2 = _mm_add_pd(ry2, _mm_mul_pd(_mm_load_pd(ptr_a3 + k), rx0));
                ry3 = _mm_add_pd(ry3, _mm_mul_pd(_mm_load_pd(ptr_a4 + k), rx0));
            }
        }

        // Each result is the sum of the two lanes of its accumulator.
        _mm_storeu_pd(res, ry0);
        *ptr_y = res[0] + res[1];

        _mm_storeu_pd(res, ry1);
        *(ptr_y + 1) = res[0] + res[1];

        _mm_storeu_pd(res, ry2);
        *(ptr_y + 2) = res[0] + res[1];

        _mm_storeu_pd(res, ry3);
        *(ptr_y + 3) = res[0] + res[1];

        ptr_a1 = ptr_a4 + n;    // first row of the next group
    }
}
 类似资料:
  • 主要内容:逐元素矩阵乘法,矩阵乘积运算,矩阵点积矩阵乘法是将两个矩阵作为输入值,并将 A 矩阵的行与 B 矩阵的列对应位置相乘再相加,从而生成一个新矩阵,如下图所示: 注意:必须确保第一个矩阵中的行数等于第二个矩阵中的列数,否则不能进行矩阵乘法运算。 图1:矩阵乘法 矩阵乘法运算被称为向量化操作,向量化的主要目的是减少使用的 for 循环次数或者根本不使用。这样做的目的是为了加速程序的计算。 下面介绍 NumPy 提供的三种矩阵乘法,从而进一步

  • 问题内容: 在numpy中,我有N个3x3矩阵的数组。这将是我如何存储它们的示例(我正在提取内容): 我也有一个由3个向量组成的数组,这将是一个示例: 我似乎无法弄清楚如何通过numpy将它们相乘,从而实现如下效果: 与的形状(在投射到阵列)是。但是,由于速度的原因,列表实现是不可能的。 我尝试了各种换位的np.dot,但最终结果没有得到正确的形状。 问题答案: 使用 脚步 : 1)保持第一根轴对

  • 考虑两个矩阵A和B。如果A是m×n矩阵而B是n×p矩阵,它们可以相乘,得到m×p矩阵C。只有当A的列数n等于B的行数n时,才可以进行矩阵乘法。 在矩阵乘法中,第一个矩阵中行的元素与第二个矩阵中对应列的元素相乘。 所得矩阵C中第(i,j)位置的每个元素,是第一个矩阵第i行的元素与第二个矩阵第j列对应元素的乘积之和。 MATLAB中的矩阵乘法是使用*运算符执行的。 例子 (Exampl

  • 我试图乘以两个块对称矩阵(矩阵大小矩阵大小)。我想执行块矩阵乘法(将一个矩阵分成多个块大小矩阵,并将相应的块相乘)。我已经写了一些代码,但想改进它,并存储主对角线以上的块,但我没有任何想法。如果可能的话,你们能帮忙吗?

  • 注:我也在这里的Eigen论坛上发表了这篇文章 我想用一个3x3矩阵预乘3xN个矩阵,即,变换3D点,如p_dest=T*p_source 初始化矩阵后: 而且 进行NT重复只是为了计算平均时间 我很惊讶逐列乘法比直接乘法快4/5倍(如果我不使用,直接乘法甚至更慢,但这没有问题,因为它是临时复制)。我尝试将NUMCOLS从0改为1000000,关系是线性的。

  • C++:15秒(源) Python:6分13秒(来源) C++:45分钟(源) 蟒蛇:10小时后被杀死(来源) 为什么Strassen矩阵乘法比标准矩阵乘法慢得多? null null null