矩阵乘法
Posted 爨爨爨好
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了矩阵乘法相关的知识,希望对你有一定的参考价值。
▶ 计算矩阵乘法 A(m×n) × B(n×p) = C(m×p) 的过程。
▶ 原始矩阵乘法,一个线程计算结果矩阵中的一个元素。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Matrix sizes are deliberately NOT multiples of THREAD so the tail case is exercised.
#define M (1024 + 13)
#define N (1024 + 31)
#define P (1024 + 7)
#define THREAD 16
#define SEED (unsigned int)clock()
#define CEIL(x,y) ((( x ) - 1 ) / ( y ) + 1)
// Convert a clock_t delta to milliseconds portably: CLOCKS_PER_SEC is 1000 on
// Windows but 1000000 on POSIX, so the raw delta is not milliseconds in general.
#define CLOCK_MS(t) ((float)(t) * 1000.0f / (float)CLOCKS_PER_SEC)

// Abort the program if a host allocation returned NULL.
void checkNULL(void *input)
{
    if (input == NULL)
    {
        printf("\n\tfind a NULL!");
        exit(1);
    }
    return;
}

// Abort the program if a CUDA runtime call failed, printing the error string
// so the failure is diagnosable (the bare "find a cudaError!" said nothing).
void checkCudaError(cudaError input)
{
    if (input != cudaSuccess)
    {
        printf("\n\tfind a cudaError! %s", cudaGetErrorString(input));
        exit(1);
    }
    return;
}

// Naive matrix multiply A(M×N) × B(N×P) = C(M×P): one thread per output
// element, wrapped in a grid-stride loop so ANY grid size covers the matrix.
__global__ void times(float *a, float *b, float *c)
{
    int idx0 = blockIdx.x * blockDim.x + threadIdx.x;  // thread's base column in B and C
    int idy  = blockIdx.y * blockDim.y + threadIdx.y;  // row in A and C

    for (; idy < M; idy += gridDim.y * blockDim.y)
    {
        // BUG FIX: the column index must restart from the thread's base column
        // for every row; the original reused a single idx, which stayed >= P
        // after the first row, skipping all later rows' columns whenever the
        // grid was smaller than the output matrix.
        for (int idx = idx0; idx < P; idx += gridDim.x * blockDim.x)
        {
            float temp = 0.0f;
            for (int k = 0; k < N; k++)
                temp += a[idy * N + k] * b[k * P + idx];
            c[idy * P + idx] = temp;
        }
    }
    return;
}

int main()
{
    int i, j, k, flag;
    float *a, *b, *c, *dev_a, *dev_b, *dev_c, temp, elapsedTime;
    clock_t time, cputime;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    printf("\n\tMatrix size: A(%d, %d) × B(%d, %d) == C(%d, %d)", M, N, N, P, M, P);
    printf("\n\tStart!");

    a = (float *)malloc(sizeof(float) * M * N);
    b = (float *)malloc(sizeof(float) * N * P);
    // calloc, not malloc: the CPU reference pass below accumulates with +=,
    // which read uninitialized memory in the original (undefined behavior).
    c = (float *)calloc((size_t)M * P, sizeof(float));
    checkNULL(a);
    checkNULL(b);
    checkNULL(c);
    checkCudaError(cudaMalloc((float **)&dev_a, sizeof(float) * M * N));
    checkCudaError(cudaMalloc((float **)&dev_b, sizeof(float) * N * P));
    checkCudaError(cudaMalloc((float **)&dev_c, sizeof(float) * M * P));

    // Fill A and B with uniform random values in [0, 1].
    time = clock();
    srand(SEED);
    for (i = 0; i < M * N; i++)
        a[i] = (float)rand() / RAND_MAX;
    for (i = 0; i < N * P; i++)
        b[i] = (float)rand() / RAND_MAX;
    printf("\n\tInitialized! Time:\t%6.2f ms", CLOCK_MS(clock() - time));

    // CPU reference multiply (timed for the speedup comparison).
    time = clock();
    for (i = 0; i < M; i++)
    {
        for (j = 0; j < P; j++)
        {
            for (k = 0, temp = 0.0f; k < N; k++)
                temp += a[i * N + k] * b[k * P + j];
            c[i * P + j] = temp;
        }
    }
    cputime = clock() - time;
    printf("\n\tCPU cost time:\t%6.2f ms", CLOCK_MS(cputime));

    // GPU multiply, timed both by the host clock and by CUDA events.
    time = clock();
    cudaEventRecord(start, 0);

    checkCudaError(cudaMemcpy(dev_a, a, sizeof(float) * M * N, cudaMemcpyHostToDevice));
    checkCudaError(cudaMemcpy(dev_b, b, sizeof(float) * N * P, cudaMemcpyHostToDevice));

    times << < dim3(CEIL(P, THREAD), CEIL(M, THREAD), 1), dim3(THREAD, THREAD) >> > (dev_a, dev_b, dev_c);
    // A kernel launch reports configuration errors only via cudaGetLastError().
    checkCudaError(cudaGetLastError());

    // The blocking D2H copy also synchronizes with the kernel, so the host
    // clock below includes the full transfer + compute + transfer pipeline.
    checkCudaError(cudaMemcpy(c, dev_c, sizeof(float) * M * P, cudaMemcpyDeviceToHost));
    cudaDeviceSynchronize();

    time = clock() - time;
    cudaEventRecord(stop, 0);

    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("\n\tGPU cost time by CPU API:\t%6.2f ms\n\tGPU cost time by GPU API:\t%6.2f ms\n\n", CLOCK_MS(time), elapsedTime);
    printf("\n\tAccRatio =\t%3.1f\n\n", (float)cputime / time);

    // Verify: recompute every element on the CPU and compare against the GPU
    // result now stored in c. A small relative tolerance is used instead of
    // exact equality — CPU and GPU may contract multiplies/adds into FMAs
    // differently, so bit-identical floats are not guaranteed.
    for (i = 0, flag = 1; i < M; i++)
    {
        for (j = 0; j < P; j++)
        {
            for (k = 0, temp = 0.0f; k < N; k++)
                temp += a[i * N + k] * b[k * P + j];
            if (fabsf(temp - c[i * P + j]) > 1e-4f * fabsf(temp) + 1e-5f)
            {
                printf("\n\tCalculate error at (%d,%d),\n\t\tc[i] == %f,CPU == %f", i, j, c[i * P + j], temp);
                flag = 0;
            }
        }
    }
    if (flag == 1)
        printf("\n\tCalculate correctly!");

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    free(a);
    free(b);
    free(c);
    getchar();
    return 0;
}
● 输出结果
Matrix size: A(1037, 1055) × B(1055, 1031) == C(1037, 1031) Start! Initialized! Time: 127.00 ms CPU cost time: 4049.00 ms GPU cost time by CPU API: 103.00 ms GPU cost time by GPU API: 103.02 ms AccRatio = 39.3 Calculate correctly!
● 矩阵初始化消耗的时间,在计算超大型矩阵的过程中初始化需要占用时间(因为矩阵元素是在CPU中逐个初始化的)。
● 比较了GPU运算过程的CPU和GPU计时的结果,在如上代码中(在内存拷贝前开始计时,在内存回拷完成后结束计时),两种计时方式差距很小,基本上认为结果相同。
● 输出中同时给出了相同矩阵乘法在CPU中的计算耗时,可见目前GPU效率约为CPU的39倍。
● 若去除内存拷贝时间,则GPU耗时可减少到100ms左右,注意此时若想利用CPU计时,则必须在第二次掐表以前加入 cudaDeviceSynchronize(); 。因为核函数调用的同时,主机代码会继续往下执行,直到同步语句或者亟需GPU运算结果的语句才会停止。在上面的例子中,若不加入同步语句,则计时结果总是为0 ms。
● 可以进一步细致调节线程粒度 THREAD,耗时在小范围内有变化。
▶ 改进一,使用共享内存减少全局内存访问量
// Tiled matrix multiply A(M×N) × B(N×P) = C(M×P) using shared-memory staging.
// Expects a 2D launch of THREAD×THREAD blocks covering CEIL(P,THREAD) ×
// CEIL(M,THREAD); each block walks the N dimension in THREAD-wide tiles so
// every element of A and B is fetched from global memory once per tile pass.
__global__ void times(float *a, float *b, float *c)
{
    __shared__ float tileA[THREAD][THREAD];
    __shared__ float tileB[THREAD][THREAD];

    const int tx  = threadIdx.x;
    const int ty  = threadIdx.y;
    const int col = blockIdx.x * blockDim.x + tx;  // column in B and C
    const int row = blockIdx.y * blockDim.y + ty;  // row in A and C
    float acc = 0.0f;

    for (int tile = 0; tile < CEIL(N, THREAD); tile++)
    {
        const int aCol = tile * THREAD + tx;  // column of A this thread stages
        const int bRow = tile * THREAD + ty;  // row of B this thread stages
        // Out-of-range lanes stage zeros so the inner product is unaffected.
        tileA[ty][tx] = (row < M && aCol < N) ? a[row * N + aCol] : 0.0f;
        tileB[ty][tx] = (col < P && bRow < N) ? b[bRow * P + col] : 0.0f;
        __syncthreads();  // whole tile staged before any thread reads it

        for (int t = 0; t < THREAD; t++)
            acc += tileA[ty][t] * tileB[t][tx];
        __syncthreads();  // all reads done before the next tile overwrites
    }
    if (col < P && row < M && row * P + col < M * P)
        c[row * P + col] = acc;
    return;
}
● 输出结果
Matrix size: A(1037, 1055) × B(1055, 1031) == C(1037, 1031) Start! Initialized! Time: 123.00 ms CPU cost time: 4044.00 ms GPU cost time by CPU API: 147.00 ms GPU cost time by GPU API: 146.75 ms AccRatio = 27.5 Calculate correctly!
▶改进二,共享内存存储方式调整
// Tiled matrix multiply A(M×N) × B(N×P) = C(M×P), variant of the shared-memory
// kernel in which the A tile is stored TRANSPOSED: the thread that reads
// a[row][aCol] writes it at tileA[tx][ty] instead of tileA[ty][tx], so the
// inner product walks tileA down a column rather than along a row.
__global__ void times(float *a, float *b, float *c)
{
    __shared__ float tileA[THREAD][THREAD];
    __shared__ float tileB[THREAD][THREAD];

    const int tx  = threadIdx.x;
    const int ty  = threadIdx.y;
    const int col = blockIdx.x * blockDim.x + tx;  // column in B and C
    const int row = blockIdx.y * blockDim.y + ty;  // row in A and C
    float acc = 0.0f;

    for (int tile = 0; tile < CEIL(N, THREAD); tile++)
    {
        const int aCol = tile * THREAD + tx;  // column of A this thread stages
        const int bRow = tile * THREAD + ty;  // row of B this thread stages
        // Transposed store of the A tile; out-of-range lanes stage zeros.
        tileA[tx][ty] = (row < M && aCol < N) ? a[row * N + aCol] : 0.0f;
        tileB[ty][tx] = (col < P && bRow < N) ? b[bRow * P + col] : 0.0f;
        __syncthreads();  // whole tile staged before any thread reads it

        // Note the swapped indices on tileA relative to the untransposed kernel.
        for (int t = 0; t < THREAD; t++)
            acc += tileA[t][ty] * tileB[t][tx];
        __syncthreads();  // all reads done before the next tile overwrites
    }
    if (col < P && row < M && row * P + col < M * P)
        c[row * P + col] = acc;
    return;
}
● 输出结果
Matrix size: A(1037, 1055) × B(1055, 1031) == C(1037, 1031) Start! Initialized! Time: 121.00 ms CPU cost time: 4059.00 ms GPU cost time by CPU API: 135.00 ms GPU cost time by GPU API: 134.80 ms AccRatio = 30.1 Calculate correctly!
以上是关于矩阵乘法的主要内容,如果未能解决你的问题,请参考以下文章