sse矩阵乘法 应该是1毫秒纯运算1000次

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了sse矩阵乘法 应该是1毫秒纯运算1000次相关的知识,希望对你有一定的参考价值。

#include <intrin.h>
#include <math.h>

struct Vector4
{    
    float x, y, z, w;        
};

struct Matrix
{
    float _M[4][4];

public:
    
    //单位化
    void Identity()
    {
        ZeroMemory((void*)_M,sizeof(float)*16);
        _M[0][0] = 1.0f;
        _M[1][1] = 1.0f;
        _M[2][2] = 1.0f;
        _M[3][3] = 1.0f;
    }

    //转置矩阵
    Matrix Transpose()
    {    
        Matrix ret;
        ret._M[0][0] = _M[0][0];
        ret._M[1][0] = _M[0][1];
        ret._M[2][0] = _M[0][2];
        ret._M[3][0] = _M[0][3];

        ret._M[0][1] = _M[1][0];
        ret._M[1][1] = _M[1][1];
        ret._M[2][1] = _M[1][2];
        ret._M[3][1] = _M[1][3];

        ret._M[0][2] = _M[2][0];
        ret._M[1][2] = _M[2][1];
        ret._M[2][2] = _M[2][2];
        ret._M[3][2] = _M[2][3];

        ret._M[0][3] = _M[3][0];
        ret._M[1][3] = _M[3][1];
        ret._M[2][3] = _M[3][2];
        ret._M[3][3] = _M[3][3];
        return ret;
    }
};


float Sum(const Vector4 & arg1)
{

    return arg1.x + arg1.y + arg1.z + arg1.w;
    
}

Vector4 Normaliz(Vector4 v1)
{
    Vector4 v2;
    float k = 1/sqrtf(v1.x * v1.x + v1.y * v1.y + v1.z * v1.z);
    v2.x = v1.x * k;
    v2.y = v1.y * k;
    v2.z = v1.z * k;
    return v2;
}

//向量相加
Vector4 SSE_VectiorAdd ( const Vector4 &Op_A, const Vector4 &Op_B )
{
    Vector4 Ret_Vector;

    __asm 
    {        
        MOV EAX, Op_A                              // Load pointers into CPU regs
        MOV EBX, Op_B

        MOVUPS XMM0, [EAX]                 // Move unaligned vectors to SSE regs
        MOVUPS XMM1, [EBX]

        ADDPS XMM0, XMM1                   // v1 + v2
        MOVUPS [Ret_Vector], XMM0      // Save the return vector
    }

    return Ret_Vector;
}

//向量叉乘
Vector4 SSE_VectorCross(const Vector4 &Op_A, const Vector4 &Op_B)
{
    Vector4 Ret_Vector;
    __asm 
    {
        MOV EAX, Op_A                               // Load pointers into CPU regs
        MOV EBX, Op_B

        MOVUPS XMM0, [EAX]                 // Move unaligned vectors to SSE regs
        MOVUPS XMM1, [EBX]    
        MOVAPS XMM2, XMM0               // Make a copy of vector A
        MOVAPS XMM3, XMM1               // Make a copy of vector B

        SHUFPS XMM0, XMM0, 0xD8      // 11 01 10 00  Flip the middle elements of A
        SHUFPS XMM1, XMM1, 0xE1       // 11 10 00 01 Flip first two elements of B
        MULPS  XMM0, XMM1                 // Multiply the modified register vectors

        SHUFPS XMM2, XMM2, 0xE1      // 11 10 00 01 Flip first two elements of the A copy
        SHUFPS XMM3, XMM3, 0xD8     // 11 01 10 00  Flip the middle elements of the B copy
        MULPS XMM2, XMM3                 // Multiply the modified register vectors

        SUBPS  XMM0, XMM2                  // Subtract the two resulting register vectors

        MOVUPS [Ret_Vector], XMM0      // Save the return vector
    }
    return Ret_Vector;
}

//向量缩放
Vector4 SSE_VectorScale(const Vector4 &Op_A, const float &Op_B)
{
    Vector4 Ret_Vector;

    __m128 F = _mm_set1_ps(Op_B);             // Create a 128 bit vector with four elements Op_B

    __asm 
    {        
        MOV EAX, Op_A                           // Load pointer into CPU reg
        MOVUPS XMM0, [EAX]                   // Move the vector  to an SSE reg       
        MULPS XMM0, F                               // Multiply vectors
        MOVUPS [Ret_Vector], XMM0       // Save the return vector
    }
    return Ret_Vector;
}


void SSE_VectorDot(const Vector4 &Op_A, const Vector4 &Op_B,float& ret)
{
    Vector4 v1;
    
    //__m128 F = _mm_set1_ps(Op_B);             // Create a 128 bit vector with four elements Op_B

    __asm 
    {        
        MOV EAX, Op_A                           // Load pointer into CPU reg
        MOV EBX, Op_B
        MOVUPS XMM1, [EBX]
        MOVUPS XMM0, [EAX]                   // Move the vector  to an SSE reg       
        MULPS XMM0,XMM1                                // Multiply vectors
        MOVUPS [v1], XMM0       // Save the return vector
    }

    ret = v1.x + v1.y + v1.z;

}

void SSE_VectorMultiplyMatrix(const Vector4& v,const Matrix& m1,Vector4& ret)
{
    Vector4 va,vb,vc,vd;
    Vector4 *pva,*pvb,*pvc,*pvd;
    const Vector4 *pv;
    
    //取出矩阵每一列
    va.x = m1._M[0][0];
    va.y = m1._M[1][0];
    va.z = m1._M[2][0];
    va.w = m1._M[3][0];
    
    vb.x = m1._M[0][1];
    vb.y = m1._M[1][1];
    vb.z = m1._M[2][1];
    vb.w = m1._M[3][1];

    vc.x = m1._M[0][2];
    vc.y = m1._M[1][2];
    vc.z = m1._M[2][2];
    vc.w = m1._M[3][2];

    vd.x = m1._M[0][3];
    vd.y = m1._M[1][3];
    vd.z = m1._M[2][3];
    vd.w = m1._M[3][3];
    
    pva = &va;
    pvb = &vb;
    pvc = &vc;
    pvd = &vd;
    pv = &v;
    __asm 
    {   
        //矩阵四列放入mmx0-mmx3
        MOV EAX, pva                           // Load pointer into CPU reg
        MOVUPS XMM0, [EAX]
        MOV EAX, pvb                           // Load pointer into CPU reg
        MOVUPS XMM1, [EAX]
        MOV EAX, pvc                           // Load pointer into CPU reg
        MOVUPS XMM2, [EAX]
        MOV EAX, pvd                           // Load pointer into CPU reg
        MOVUPS XMM3, [EAX]
        
        //向量放入 mmx4
        MOV EAX, pv 
        MOVUPS XMM4, [EAX]

        //向量点乘矩阵每列
        MULPS XMM0,XMM4
        MULPS XMM1,XMM4  
        MULPS XMM2,XMM4  
        MULPS XMM3,XMM4 
        
        //输出四个分量
        MOVUPS [va], XMM0
        MOVUPS [vb], XMM1
        MOVUPS [vc], XMM2
        MOVUPS [vd], XMM3
    }

    //四个分量求和得变换后向量
    ret.x = va.w + va.x + va.y + va.z;
    ret.y = vb.w + vb.x + vb.y + vb.z;
    ret.z = vc.w + vc.x + vc.y + vc.z;
    ret.w = vd.w + vd.x + vd.y + vd.z;

}

void SSE_MatrixMultiplyMatrix(const Matrix& arg1,const Matrix& arg2,Matrix & ret)
{
    Matrix m1,m2;
    Vector4 v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
    Vector4 *pv1,*pv2,*pv3,*pv4,*pv5,*pv6,*pv7,*pv8;
    m1 = arg1;
    m2 = arg2;
    m2 = m2.Transpose();

    pv1 = (Vector4*)&m1._M[0][0];
    pv2 = (Vector4*)&m1._M[1][0];
    pv3 = (Vector4*)&m1._M[2][0];
    pv4 = (Vector4*)&m1._M[3][0];

    pv5 = (Vector4*)&m2._M[0][0];
    pv6 = (Vector4*)&m2._M[1][0];
    pv7 = (Vector4*)&m2._M[2][0];
    pv8 = (Vector4*)&m2._M[3][0];

    __asm
    { 
        MOV EAX, pv5                          
        MOV EBX, pv6 
        MOV ECX, pv7                             
        MOV EDX, pv8
        MOVUPS XMM1, [EAX]              
        MOVUPS XMM2, [EBX]
        MOVUPS XMM3, [ECX]
        MOVUPS XMM4, [EDX]
        MOV EAX, pv1
        MOVUPS XMM0, [EAX]

        MULPS XMM1, XMM0
        MOVUPS[v1], XMM1
        MULPS XMM2, XMM0
        MOVUPS[v2], XMM2
        MULPS XMM3, XMM0
        MOVUPS[v3], XMM3
        MULPS XMM4, XMM0
        MOVUPS[v4], XMM4


        MOV EAX, pv5
        MOV EBX, pv6
        MOV ECX, pv7
        MOV EDX, pv8
        MOVUPS XMM1, [EAX]
        MOVUPS XMM2, [EBX]
        MOVUPS XMM3, [ECX]
        MOVUPS XMM4, [EDX]
        MOV EAX, pv2
        MOVUPS XMM0, [EAX]
        MULPS XMM1, XMM0
        MOVUPS[v5], XMM1
        MULPS XMM2, XMM0
        MOVUPS[v6], XMM2
        MULPS XMM3, XMM0
        MOVUPS[v7], XMM3
        MULPS XMM4, XMM0
        MOVUPS[v8], XMM4


        MOV EAX, pv5
        MOV EBX, pv6
        MOV ECX, pv7
        MOV EDX, pv8
        MOVUPS XMM1, [EAX]
        MOVUPS XMM2, [EBX]
        MOVUPS XMM3, [ECX]
        MOVUPS XMM4, [EDX]
        MOV EAX, pv3
        MOVUPS XMM0, [EAX]
        MULPS XMM1, XMM0
        MOVUPS[v9], XMM1
        MULPS XMM2, XMM0
        MOVUPS[v10], XMM2
        MULPS XMM3, XMM0
        MOVUPS[v11], XMM3
        MULPS XMM4, XMM0
        MOVUPS[v12], XMM4


        MOV EAX, pv5
        MOV EBX, pv6
        MOV ECX, pv7
        MOV EDX, pv8
        MOVUPS XMM1, [EAX]
        MOVUPS XMM2, [EBX]
        MOVUPS XMM3, [ECX]
        MOVUPS XMM4, [EDX]
        MOV EAX, pv4
        MOVUPS XMM0, [EAX]
        MULPS XMM1, XMM0
        MOVUPS[v13], XMM1
        MULPS XMM2, XMM0
        MOVUPS[v14], XMM2
        MULPS XMM3, XMM0
        MOVUPS[v15], XMM3
        MULPS XMM4, XMM0
        MOVUPS[v16], XMM4
    }

    ret._M[0][0] = Sum(v1);
    ret._M[0][1] = Sum(v2);
    ret._M[0][2] = Sum(v3);
    ret._M[0][3] = Sum(v4);

    ret._M[1][0] = Sum(v5);
    ret._M[1][1] = Sum(v6);
    ret._M[1][2] = Sum(v7);
    ret._M[1][3] = Sum(v8);

    ret._M[2][0] = Sum(v9);
    ret._M[2][1] = Sum(v10);
    ret._M[2][2] = Sum(v11);
    ret._M[2][3] = Sum(v12);

    ret._M[3][0] = Sum(v13);
    ret._M[3][1] = Sum(v14);
    ret._M[3][2] = Sum(v15);
    ret._M[3][3] = Sum(v16);

    return;
}
 
 
 

 

#include <Windows.h>
#include <iostream>
#include <math.h>
#include "sse.h"

using namespace std;



void main(int argc, char **argv)
{

    Matrix m1,m2,mRet;
    m1.Identity();
    m2.Identity();
    float time;
    time = GetTickCount();
    
    for (int t = 0; t < 100000000; t++)
    {
        SSE_MatrixMultiplyMatrix(m1,m2,mRet);
    }

    time = GetTickCount() - time;
    _asm {
    int 3
    }
    return;
}

 

以上是关于sse矩阵乘法 应该是1毫秒纯运算1000次的主要内容,如果未能解决你的问题,请参考以下文章

如何进一步优化矩阵乘法的性能?

使用 SSE 错误 __m128 到 *float 转换的矩阵乘法?

快速幂矩阵

数组乘法与 sse 内在函数乘法的时序?

sse2浮点乘法

使用矢量化 C++ 的矩阵乘法