各种形式的系数矩阵存储方式及其与一个向量的乘法计算

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了各种形式的系数矩阵存储方式及其与一个向量的乘法计算相关的知识,希望对你有一定的参考价值。

先放上完整代码,以后填坑。

  1 #include <stdio.h>
  2 #include <stdlib.h>
  3 #include <malloc.h>
  4 #include <math.h>
  5 #include <time.h>
  6 #include "cuda_runtime.h"
  7 #include "device_launch_parameters.h"
  8 
  9 #define ROW         (256+79)
 10 #define COL         (256+83)
 11 #define EPSILON     0.1         // 系数矩阵非零元素占比
 12 #define NON_ZEROS   ROW * COL   // 非零元素上限(用于分配MAT转ELL时的临时变量大小)
 13 #define THREAD_SIZE 256
 14 #define SEED        1           // (unsigned int)time(MULL)
 15 #define MAX(x,y)    ((x>y)?x:y)
 16 #define CEIL(x,y)   (int)(( x - 1 ) /  y + 1)
 17 #define INT         false        // 使用的数字格式    
 18 
 19 #if INT
 20 typedef int format;
 21 #else
 22 typedef float format;
 23 #endif
 24 
 25 // 矩阵存储格式
 26 typedef struct          // Matrix 顺序格式
 27 {
 28     int     row;        // 行数
 29     int     col;        // 列数
 30     int     count;      // 元素个数(计算时用不到,而是用于转换)
 31     format  *data;      // 实际元素的值
 32 }
 33 MAT;
 34 
 35 typedef struct          // Compressed Sparse Row 格式
 36 {
 37     int     row;        // 行数
 38     int     col;        // 列数
 39     format  *data;      // 实际元素的值
 40     int     *index;     // 元素的列号
 41     int     *ptr;       // 矩阵的行指针 
 42 }
 43 CSR;
 44 
 45 typedef struct          // ELL 格式
 46 {
 47     int     row;        // 行数
 48     int     col;        // 列数
 49     format  *data;      // 实际元素的值
 50     int     *index;     // 元素的列号
 51 }
 52 ELL;
 53 
 54 typedef struct          // Coordinate 格式
 55 {
 56     int     row;        // 行数
 57     int     col;        // 列数
 58     int     count;      // 元素个数
 59     int     *row_index; // 行向量
 60     int     *col_index; // 列向量
 61     format  *data;      // 实际元素的值
 62 }
 63 COO;
 64 
 65 // 通用函数
 66 void checkNULL(const void *input)
 67 {
 68     if (input == NULL)
 69     {
 70         printf("\n\tfind a NULL!");
 71         exit(1);
 72     }
 73     return;
 74 }
 75 
 76 void checkCudaError(cudaError input)
 77 {
 78     if (input != cudaSuccess)
 79     {
 80         printf("\n\tfind a cudaError!");
 81         exit(1);
 82     }
 83     return;
 84 }
 85 
 86 int checkResult(const format * in1, const format * in2, const int length)
 87 // 注意返回值为0(两向量相等)或者“值不等的元素下标加一”(防止0号元素就不相等),返回后若想使用该下标则需要减1
 88 {
 89     int i;
 90     for (i = 0; i < length; i++)
 91     {
 92         if (in1[i] != in2[i])
 93             return i + 1;
 94     }
 95     return 0;
 96 }
 97 
 98 // 打印矩阵
 99 void printMAT(MAT *in)// 打印MAT格式的矩阵
100 {
101     int i;
102     printf("\n\t");
103     for (i = 0; i < in->row * in->col; i++)
104     {
105 #if INT
106         printf("%d ", in->data[i]);
107 #else
108         printf("%.2f ", in->data[i]);
109 #endif
110         if ((i + 1) % in->col == 0)
111             printf("\n\t");
112     }
113     printf("\n");
114     return;
115 }
116 
117 void printCSR(const CSR *in)// 打印CSR格式的矩阵
118 {
119     int i;
120     printf("\n\t%d,%d", in->row, in->col);
121     printf("\n\t");
122     for (i = 0; i < in->ptr[in->row]; i++)
123 #if INT
124         printf("%d ", in->data[i]);
125 #else
126         printf("%.2f ", in->data[i]);
127 #endif
128     printf("\n\t");
129     for (i = 0; i < in->ptr[in->row]; i++)
130         printf("%d ", in->index[i]);
131     printf("\n\t");
132     for (i = 0; i < in->row + 1; i++)
133         printf("%d ", in->ptr[i]);
134     printf("\n\n");
135     return;
136 }
137 
138 void printELL(const ELL *in)// 打印ELL格式的矩阵
139 {
140     int i;
141     printf("\n\t%d,%d", in->row, in->col);
142     printf("\n\t");
143     for (i = 0; i < in->row * in->col; i++)
144     {
145 #if INT
146         printf("%d ", in->data[i]);
147 #else
148         printf("%.2f ", in->data[i]);
149 #endif
150         if ((i + 1) % in->col == 0)
151             printf("\n\t");
152     }
153     printf("\n\t");
154     for (i = 0; i < in->row * in->col; i++)
155     {
156         printf("%d ", in->index[i]);
157         if ((i + 1) % in->col == 0)
158             printf("\n\t");
159     }
160     printf("\n\n");
161     return;
162 }
163 
164 void printCOO(const COO *in)// 打印ELL格式的矩阵
165 {
166     int i;
167     printf("\n\t%d,%d,%d", in->row, in->col, in->count);
168     printf("\n\t");
169     for (i = 0; i < in->count; i++)
170     {
171 #if INT
172         printf("%d ", in->data[i]);
173 #else
174         printf("%.2f ", in->data[i]);
175 #endif
176     }
177     printf("\n\t");
178     for (i = 0; i < in->count; i++)
179     {
180         printf("%d ", in->row_index[i]);
181         printf("%d ", in->col_index[i]);
182     }
183     printf("\n\n");
184     return;
185 }
186 
187 // 矩阵初始化与清理(有问题)
188 MAT *initializeMAT(const int row, const int col, const bool isCPU)// 初始化MAT
189 {
190     MAT *temp;
191     if (isCPU)
192     {
193         checkNULL(temp = (MAT *)malloc(sizeof(MAT)));
194         temp->row = row;
195         temp->col = col;
196         temp->count = 0;
197         checkNULL(temp->data = (format *)malloc(sizeof(format) * row * col));
198     }
199     else
200     {
201         checkCudaError(cudaMalloc((void **)&temp, sizeof(MAT)));
202         temp->row = row;
203         temp->col = col;
204         temp->count = 0;
205         checkCudaError(cudaMalloc((void **)&temp->data, sizeof(format) * row * col));
206     }
207     return temp;
208 }
209 
210 int countMAT(MAT *in)// 统计MAT形式的矩阵中的非零元素个数
211 {
212     if (in == NULL)
213         return -1;
214 
215     int i, out;
216     for (i = out = 0; i < in->row * in->col; i++)
217         out += (in->data[i] == 0);
218     return out;
219 }
220 
221 int freeMAT(MAT *in, const bool isCPU)// 释放MAT
222 {
223     checkNULL(in);
224     if (isCPU)
225     {
226         free(in->data);
227         free(in);
228     }
229     else
230     {
231         cudaFree(in->data);
232         cudaFree(in);
233     }
234     return 0;
235 }
236 
237 CSR * initializeCSR(const int row, const int col, const int count, const bool isCPU)// 初始化CSR
238 {
239     CSR *temp;
240     if (isCPU)
241     {
242         checkNULL(temp = (CSR *)malloc(sizeof(CSR)));
243         temp->row = row;
244         temp->col = col;
245         checkNULL(temp->data = (format *)malloc(sizeof(format) * count));
246         checkNULL(temp->index = (int *)malloc(sizeof(int) * count));
247         checkNULL(temp->ptr = (int *)malloc(sizeof(int) * (row + 1)));
248     }
249     else
250     {
251         checkCudaError(cudaMalloc((void **)&temp, sizeof(CSR)));
252         //temp->row = row;
253         //temp->col = col;
254         checkCudaError(cudaMalloc((void **)&temp->data, sizeof(format) * count));
255         checkCudaError(cudaMalloc((void **)&temp->index, sizeof(int) * count));
256         checkCudaError(cudaMalloc((void **)&temp->ptr, sizeof(int) * (row + 1)));
257     }
258     return temp;
259 }
260 
261 int freeCSR(CSR *in, const bool isCPU)// 释放CSR
262 {
263     checkNULL(in);
264     if (isCPU)
265     {
266         free(in->data);
267         free(in->index);
268         free(in->ptr);
269         free(in);
270     }
271     else
272     {
273         cudaFree(in->data);
274         cudaFree(in->index);
275         cudaFree(in->ptr);
276         cudaFree(in);
277     }
278     return 0;
279 }
280 
281 ELL * initializeELL(const int row, const int col, const bool isCPU)// 初始化ELL
282 {
283     ELL *temp;
284     if (isCPU)
285     {
286         checkNULL(temp = (ELL *)malloc(sizeof(ELL)));
287         temp->row = row;
288         temp->col = col;
289         checkNULL(temp->data = (format *)malloc(sizeof(format) * row * col));
290         checkNULL(temp->index = (int *)malloc(sizeof(int) *row * col));
291     }
292     else
293     {
294         checkCudaError(cudaMalloc((void **)&temp, sizeof(ELL)));
295         //temp->row = row;
296         //temp->col = col;
297         checkCudaError(cudaMalloc((void **)&temp->data, sizeof(format) * row * col));
298         checkCudaError(cudaMalloc((void **)&temp->data, sizeof(int) * row * col));
299     }
300     return temp;
301 }
302 
303 int freeELL(ELL *in, const bool isCPU)// 释放ELL
304 {
305     checkNULL(in);
306     if (isCPU)
307     {
308         free(in->data);
309         free(in->index);
310         free(in);
311     }
312     else
313     {
314         cudaFree(in->data);
315         cudaFree(in->index);
316         cudaFree(in);
317     }
318     return 0;
319 }
320 
321 COO * initializeCOO(const int row, const int col, const int count, const bool isCPU)// 初始化COO
322 {
323     COO * temp;
324     if (isCPU)
325     {
326         checkNULL(temp = (COO *)malloc(sizeof(COO)));
327         temp->row = row;
328         temp->col = col;
329         temp->count = count;
330         checkNULL(temp->data = (format *)malloc(sizeof(format) * count));
331         checkNULL(temp->row_index = (int *)malloc(sizeof(int) * count));
332         checkNULL(temp->col_index = (int *)malloc(sizeof(int) * count));
333     }
334     else
335     {
336         checkCudaError(cudaMalloc((void **)&temp, sizeof(COO)));
337         //temp->row = row;
338         //temp->col = col;
339         //temp->count = count;
340         checkCudaError(cudaMalloc((void **)&temp->data, sizeof(format) * count));
341         checkCudaError(cudaMalloc((void **)&temp->row_index, sizeof(int) * count));
342         checkCudaError(cudaMalloc((void **)&temp->col_index, sizeof(int) * count));
343     }
344     return temp;
345 }
346 
347 int freeCOO(COO *in, const bool isCPU)// 释放COO
348 {
349     checkNULL(in);
350     if (isCPU)
351     {
352         free(in->data);
353         free(in->row_index);
354         free(in->col_index);
355         free(in);
356     }
357     else
358     {
359         cudaFree(in->data);
360         cudaFree(in->row_index);
361         cudaFree(in->col_index);
362         cudaFree(in);
363     }
364     return 0;
365 }
366 
367 // 矩阵格式的相互转化
368 int MATToCSR(const MAT *in, CSR *out)// MAT格式转化为CSR格式
369 {
370     if (in == NULL || out == NULL)
371         return 1;
372 
373     int i = 0, j = 0, k = 0;
374     out->row = in->row;                  // 行数和列数
375     out->col = in->col;
376     out->ptr[k++] = 0;
377     for (; i < in->row * in->col; i++)
378     {
379         if (in->data[i] != 0)                 // 找到非零元
380         {
381             if (j == in->count)         // 在out->data已经填满了的基础上又发现了非零元
382                 return 1;
383             out->data[j] = in->data[i];       // 填充非零元素
384             out->index[j] = i % in->col;  // 填充列号
385             j++;
386         }
387         if ((i + 1) % in->col == 0)       // 到了最后一列,写入行指针号
388             out->ptr[k++] = j;
389     }
390     return 0;
391 }
392 
393 int CSRToMAT(const CSR *in, MAT *out)// CSR转MAT
394 {
395     if (in == NULL || out == NULL)
396         return 1;
397 
398     int i, j;
399     out->row = in->row;
400     out->col = in->col;
401     for (i = 0; i < in->row; i++)
402     {
403         for (j = 0; j < in->col; j++)
404             out->data[i*in->col + j] = 0;
405         for (j = in->ptr[i]; j < in->ptr[i + 1]; j++)
406             out->data[i*in->col + in->index[j]] = in->data[j];
407     }
408     return 0;
409 }
410 
411 int MATToELL(const MAT *in, ELL *out)// MAT转ELL
412 {
413     if (in == NULL || out == NULL)
414         return 1;
415 
416     int i, j, k;
417     format temp_data[NON_ZEROS];
418     int temp_index[NON_ZEROS];
419     for (i = j = k = 0; i < in->row*in->col; i++)// 寻找最长的行,同时把in中每行的元素往左边推
420     {
421         temp_data[i] = 0;
422         temp_index[i] = 0;
423         if (in->data[i] != 0)                           // 找到非零元
424         {
425             temp_data[i / in->col*in->col + j] = in->data[i];   // 存放元素
426             temp_index[i / in->col*in->col + j] = i % in->col;  // 记录所在的列号
427             j++;                                        // j中为改行已经找到的非零元个数
428         }
429         if ((i + 1) % in->col == 0)                     // 找到了行末
430         {
431             k = MAX(j, k);                              // 更新最大非零元素个数
432             j = 0;                                      // 开始下一行之前清空j
433         }
434     }
435     out->row = k;                                       // 找到了原矩阵最大非零元个数
436     out->col = in->row;                                 // 新矩阵尺寸相当于原矩阵的转置
437     for (i = 0; i < in->row*in->col; i++)               // 使用新的列数重构原来的out
438     {
439         temp_data[i - i / in->col*(in->col - k)] = temp_data[i];// 将data和index往前挪
440         temp_index[i - i / in->col*(in->col - k)] = temp_index[i];
441         if (i >= out->row * k)
442         {
443             temp_data[i] = 0;
444             temp_index[i] = 0;
445         }
446     }
447     for (i = 0; i < out->row*out->col; i++)               // 转置
448     {
449         out->data[i] = temp_data[i % out->col * k + i / out->col];
450         out->index[i] = temp_index[i % out->col * k + i / out->col];
451     }
452     for (; i < in->row*in->col; i++)                     // 补0
453     {
454         out->data[i] = 0;
455         out->index[i] = 0;
456     }
457     return 0;
458 }
459 
460 int ELLToMAT(const ELL *in, MAT *out)// ELL转MAT
461 {
462     if (in == NULL || out == NULL)
463         return 1;
464     return 0;
465 }
466 
467 int MATToCOO(const MAT *in, COO *out)// MAT转COO
468 {
469     if (in == NULL || out == NULL)
470         return 1;
471 
472     int i, j;
473     out->row = in->row;
474     out->col = in->col;
475     for (i = j = 0; i < in->row * in->col; i++)
476     {
477         if (in->data[i] != 0)
478         {
479             out->data[j] = in->data[i];
480             out->row_index[j] = i / in->col;
481             out->col_index[j] = i % in->col;
482             j++;
483         }
484     }
485     out->count = j;
486     return 0;
487 }
488 
489 int COOToMAT(const COO *in, MAT *out)// COO转MAT
490 {
491     if (in == NULL || out == NULL)
492         return 1;
493 
494     int i;
495     out->row = in->row;
496     out->col = in->col;
497     for (i = 0; i < in->row * in->col; i++)
498         out->data[i] = 0;
499     for (i = 0; i < in->count; i++)
500         out->data[in->row_index[i] * in->col + in->col_index[i]] = in->data[i];
501     return 0;
502 }
503 
504 int CSRToELL(const CSR *in, ELL *out)// CSR转ELL
505 {
506     if (in == NULL || out == NULL)
507         return 1;
508     return 0;
509 }
510 
511 int ELLToCSR(const ELL *in, CSR *out)// ELL转CSR
512 {
513     if (in == NULL || out == NULL)
514         return 1;
515     return 0;
516 }
517 
518 int CSRToCOO(const CSR *in, COO *out)// CSR转COO
519 {
520     if (in == NULL || out == NULL)
521         return 1;
522     return 0;
523 }
524 
525 int COOToCSR(const COO *in, CSR *out)// COO转CSR
526 {
527     if (in == NULL || out == NULL)
528         return 1;
529     return 0;
530 }
531 
532 int ELLToCOO(const ELL *in, COO *out)// ELL转COO
533 {
534     if (in == NULL || out == NULL)
535         return 1;
536     return 0;
537 }
538 
539 int COOToELL(const COO *in, ELL *out)// COO转ELL
540 {
541     if (in == NULL || out == NULL)
542         return 1;
543 
544     return 0;
545 }
546 
547 // 各种形式的矩阵和一个向量的乘法
548 int dotMATCPU(const MAT *a, const MAT *x, MAT *y)// CPU MAT乘法
549 {
550     if (a == NULL || x == NULL || y == NULL || a->col != x->row)
551         return 1;
552 
553     int i;
554     format sum;
555     y->row = a->row;
556     y->col = x->col;
557     for (i = 0, sum = 0; i < a->row * a->col; i++)
558     {
559         sum += a->data[i] * x->data[i % a->col];
560         if ((i + 1) % a->col == 0)
561         {
562             y->data[i / a->col] = sum;
563             sum = 0;
564         }
565     }
566     return 0;
567 }
568 
569 int dotCSRCPU(const CSR *a, const MAT *x, MAT *y)// CPU CSR乘法
570 {
571     if (a == NULL || x == NULL || y == NULL || a->col != x->row)
572         return 1;
573 
574     int i, j;
575     format sum;
576     y->row = a->row;
577     y->col = x->col;
578     for (i = 0; i < a->row; i++)
579     {
580         for (sum = 0, j = a->ptr[i]; j < a->ptr[i + 1]; j++)
581             sum += a->data[j] * x->data[a->index[j]];
582         y->data[i] = sum;
583     }
584     return 0;
585 }
586 
587 int dotELLCPU(const ELL *a, const MAT *x, MAT *y)// CPU ELL乘法
588 {
589     if (a == NULL || x == NULL || y == NULL || a->col != x->row)
590         return 1;
591 
592     int i, j;
593     format sum;
594     y->row = a->row;
595     y->col = x->col;
596     for (i = 0; i<a->row; i++)
597     {
598         for (sum = 0, j = 0; j < a->col; j++)
599             sum += a->data[i + j*a->row] * x->data[a->index[i + j*a->row]];
600         y->data[i] = sum;
601     }
602     return 0;
603 }
604 
605 int dotCOOCPU(const COO *a, const MAT *x, MAT *y)// CPU COO乘法
606 {
607     if (a == NULL || x == NULL || y == NULL || a->col != x->row)
608         return 1;
609 
610     int i;
611     y->row = a->row;
612     y->col = x->col;
613     for (i = 0; i < a->row; i++)
614         y->data[i] = 0;
615     for (i = 0; i<a->count; i++)
616         y->data[a->row_index[i]] += a->data[i] * x->data[a->col_index[i]];
617     return 0;
618 }
619 
620 __global__ void dotMATGPU(const MAT *a, const MAT *x, MAT *y)// GPU MAT乘法
621 {
622     int id = blockIdx.x * blockDim.x + threadIdx.x;
623     if (id == 0)
624     {
625         y->row = a->row;
626         y->col = x->col;
627     }
628     if (id < a->row)
629     {
630         int i;
631         format sum = 0;
632         for (i = 0; i < a->col; i++)
633             sum += a->data[id * a->col + i] * x->data[i];
634         y->data[id] = sum;
635     }
636     return;
637 }
638 
639 __global__ void dotCSRGPU(const CSR *a, const MAT *x, MAT *y)// GPU CSR乘法
640 {
641     int id = blockIdx.x * blockDim.x + threadIdx.x;
642     if (id == 0)
643     {
644         y->row = a->row;
645         y->col = x->col;
646     }
647     if (id < a->row)
648     {
649         int j;
650         format sum;
651         for (sum = 0, j = a->ptr[id]; j < a->ptr[id + 1]; j++)
652             sum += a->data[j] * x->data[a->index[j]];
653         y->data[id] = sum;
654     }
655     return;
656 }
657 
658 __global__ void dotELLGPU(const ELL *a, const MAT *x, MAT *y)// GPU ELL乘法
659 {
660     int id = blockIdx.x * blockDim.x + threadIdx.x;
661     if (id == 0)
662     {
663         y->row = a->row;
664         y->col = x->col;
665     }
666     if (id < a->row)
667     {
668         int j;
669         format sum;
670         for (sum = 0, j = 0; j < a->col; j++)
671             sum += a->data[id + j*a->row] * x->data[a->index[id + j*a->row]];
672         y->data[id] = sum;
673     }
674     return;
675 }
676 
677 __global__ void dotCOOGPU(const COO *a, const MAT *x, MAT *y)// GPU COO乘法
678 {
679     int id = blockIdx.x * blockDim.x + threadIdx.x;
680     if (id == 0)
681     {
682         y->row = a->row;
683         y->col = x->col;
684     }
685     if (id < a->count)
686         atomicAdd(&(y->data[a->row_index[id]]), a->data[id] * x->data[a->col_index[id]]);
687     return;
688 }
689 
690 int test()// 测试矩阵打印和CPU计算的函数
691 {
692     int i;
693     MAT a, c, x, y;
694     CSR b;
695     ELL d;
696     COO e;
697 
698     a.row = 4, a.col = 5;
699     for (i = 0; i < a.row * a.col; i++)
700         a.data[i] = 0;
701     a.data[0] = 3, a.data[2] = 1, a.data[4] = 5, a.data[11] = 2;
702     a.data[12] = 4, a.data[13] = 1, a.data[15] = 1, a.data[18] = 1;
703 
704     x.row = a.col, x.col = 1;
705     for (i = 0; i < x.row; i++)
706         x.data[i] = i;
707 
708     printMAT(&a);           // MAT型
709 
710     MATToCSR(&a, &b);       // MAT转CSR
711     printCSR(&b);
712 
713     CSRToMAT(&b, &c);       // CSR转回MAT型
714     printMAT(&c);
715 
716     MATToELL(&a, &d);       // ELL型
717     printELL(&d);
718 
719     MATToCOO(&a, &e);       // MAT转COO
720     printCOO(&e);
721 
722     COOToMAT(&e, &c);       // COO转回MAT
723     printMAT(&c);
724 
725     dotMATCPU(&a, &x, &y);  // MAT乘法
726     printMAT(&y);
727 
728     dotCSRCPU(&b, &x, &y);  // CSR乘法
729     printMAT(&y);
730 
731     dotELLCPU(&d, &x, &y);  // ELL乘法
732     printMAT(&y);
733 
734     dotCOOCPU(&e, &x, &y);  // COO乘法
735     printMAT(&y);
736 
737     getchar();
738     return 0;
739 }
740 
741 int main()
742 {
743     int i;
744     MAT *h_aMAT, *d_aMAT, *h_xMAT, *d_xMAT, *h_yMAT_1, *h_yMAT_2, *d_yMAT;
745     CSR *h_aCSR, *d_aCSR;
746     ELL *h_aELL, *d_aELL;
747     COO *h_aCOO, *d_aCOO;
748 
749     clock_t timeCPU;
750     cudaEvent_t startGPU, stopGPU;
751     float timeGPU;
752     cudaEventCreate(&startGPU);
753     cudaEventCreate(&stopGPU);
754 
755     printf("\n\tStart! Matrix dimension:\t%d × %d", ROW, COL);
756 
757     h_aMAT = initializeMAT(ROW, COL, 1);
758     h_xMAT = initializeMAT(COL, 1, 1);
759     h_yMAT_1 = initializeMAT(ROW, 1, 1);
760     h_yMAT_2 = initializeMAT(ROW, 1, 1);
761 
762     // 初始化
763     timeCPU = clock();
764     srand(SEED);
765 #if INT
766     for (i = 0; i < COL * ROW; i++)
767         h_aMAT->data[i] = ((float)rand() < RAND_MAX * EPSILON) ? (rand() - RAND_MAX / 2) : 0;
768     h_aMAT->count = countMAT(h_aMAT);
769     for (i = 0; i < COL; i++)
770         h_xMAT->data[i] = (rand() - RAND_MAX / 2);
771     //h_xMAT->count = countMAT(h_xMAT);
772 #else
773     for (i = 0; i < COL * ROW; i++)
774         h_aMAT->data[i] = ((float)rand() < RAND_MAX * EPSILON) ? ((float)rand() / RAND_MAX) : 0;
775     h_aMAT->count = countMAT(h_aMAT);
776     for (i = 0; i < COL; i++)
777         h_xMAT->data[i] = ((float)rand() / RAND_MAX);
778     //h_xMAT->count = countMAT(h_xMAT);
779 #endif
780     printf("\n\tInitialized! Time:\t\t%8.3f ms\n", (format)(clock() - timeCPU));
781 
782     //CPU计算部分
783     //MAT 格式
784     timeCPU = clock();
785     dotMATCPU(h_aMAT, h_xMAT, h_yMAT_1);
786     timeCPU = clock() - timeCPU;
787     printf("\n\tdotMATCPU time:\t%8.3f ms\n", (float)timeCPU);
788 
789     // CSR格式                    
790     h_aCSR = initializeCSR(ROW, COL, h_aMAT->count, 1);
791     MATToCSR(h_aMAT, h_aCSR);
792     timeCPU = clock();
793     dotCSRCPU(h_aCSR, h_xMAT, h_yMAT_2);
794     timeCPU = clock() - timeCPU;
795     printf("\n\tdotCSRCPU time:\t%8.3f ms\n", (float)timeCPU);
796     if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW))
797         printf("\n\tError at i = %d\n\tcpu_out1[i] = %10f, h_yMAT_2[i] = %10f\n",
798             i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]);
799     else
800         printf("\n\tdotCSRCPU correct!\n");
801 
802     // ELL格式
803     h_aELL = initializeELL(ROW, COL, 1);
804     MATToELL(h_aMAT, h_aELL);
805     timeCPU = clock();
806     dotELLCPU(h_aELL, h_xMAT, h_yMAT_2);
807     timeCPU = clock() - timeCPU;
808     printf("\n\tdotCELLPU time:\t%8.3f ms\n", (float)timeCPU);
809     if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW))
810         printf("\n\tError at i = %d\n\tcpu_out1[i] = %10f, h_yMAT_2[i] = %10f\n",
811             i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]);
812     else
813         printf("\n\tdotELLCPU correct!\n");
814 
815     // COO格式
816     h_aCOO = initializeCOO(ROW, COL, h_aMAT->count, 1);
817     MATToCOO(h_aMAT, h_aCOO);
818     timeCPU = clock();
819     dotCOOCPU(h_aCOO, h_xMAT, h_yMAT_2);
820     timeCPU = clock() - timeCPU;
821     printf("\n\tdotCOOCPU time:\t%8.3f ms\n", (float)timeCPU);
822     if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW))
823         printf("\n\tError at i = %d\n\tcpu_out1[i] = %10f, h_yMAT_2[i] = %10f\n",
824             i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]);
825     else
826         printf("\n\tdotCOOCPU correct!\n");
827 
828     // GPU计算部分
829     d_aMAT = initializeMAT(ROW, COL, 0);
830     d_xMAT = initializeMAT(ROW, COL, 0);
831     d_yMAT = initializeMAT(ROW, COL, 0);
832     d_aCSR = initializeCSR(ROW, COL, h_aMAT->count, 0);
833     d_aELL = initializeELL(ROW, COL, 0);
834     d_aCOO = initializeCOO(ROW, COL, h_aMAT->count, 0);
835 
836     cudaMemcpy(d_aMAT, h_aMAT, sizeof(MAT), cudaMemcpyHostToDevice);
837     cudaMemcpy(d_xMAT, h_xMAT, sizeof(format) * COL, cudaMemcpyHostToDevice);
838     cudaMemcpy(d_aCSR, h_aCSR, sizeof(CSR), cudaMemcpyHostToDevice);
839     cudaMemcpy(d_aELL, h_aELL, sizeof(CSR), cudaMemcpyHostToDevice);
840     cudaMemcpy(d_aCOO, h_aCOO, sizeof(CSR), cudaMemcpyHostToDevice);
841 
842     // MAT格式
843     cudaMemset(d_yMAT, 0, sizeof(MAT));
844     cudaEventRecord(startGPU, 0);
845     dotMATGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (d_aMAT, d_xMAT, d_yMAT);
846     cudaMemcpy(h_yMAT_2, d_yMAT, sizeof(format) * ROW, cudaMemcpyDeviceToHost);
847     cudaDeviceSynchronize();
848     cudaEventRecord(stopGPU, 0);
849     cudaEventSynchronize(stopGPU);
850     cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);
851     printf("\n\tdotMATGPU time:\t%8.3f ms\n", timeGPU);
852     if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW))
853         printf("\n\tError at i = %d\n\tcpu_out[i] = %10f, gpu_out[i] = %10f\n",
854             i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]);
855     else
856         printf("\n\tdotMATGPU correct!\n");
857 
858     // CSR格式
859     cudaMemset(d_yMAT, 0, sizeof(MAT));
860     cudaEventRecord(startGPU, 0);
861     dotCSRGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (d_aCSR, d_xMAT, d_yMAT);
862     cudaMemcpy(h_yMAT_2, d_yMAT, sizeof(format) * ROW, cudaMemcpyDeviceToHost);
863     cudaDeviceSynchronize();
864     cudaEventRecord(stopGPU, 0);
865     cudaEventSynchronize(stopGPU);
866     cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);
867     printf("\n\tdotCSRGPU time:\t%8.3f ms\n", timeGPU);
868     if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW))
869         printf("\n\tError at i = %d\n\tcpu_out[i] = %10f, gpu_out[i] = %10f\n",
870             i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]);
871     else
872         printf("\n\tdotCSRGPU correct!\n");
873 
874     // ELL格式
875     cudaMemset(d_yMAT, 0, sizeof(MAT));
876     cudaEventRecord(startGPU, 0);
877     dotELLGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (d_aELL, d_xMAT, d_yMAT);
878     cudaMemcpy(h_yMAT_2, d_yMAT, sizeof(format) * ROW, cudaMemcpyDeviceToHost);
879     cudaDeviceSynchronize();
880     cudaEventRecord(stopGPU, 0);
881     cudaEventSynchronize(stopGPU);
882     cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);
883     printf("\n\tdotELLGPU time:\t%8.3f ms\n", timeGPU);
884     if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW))
885         printf("\n\tError at i = %d\n\tcpu_out[i] = %10f, gpu_out[i] = %10f\n",
886             i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]);
887     else
888         printf("\n\tdotELLGPU correct!\n");
889 
890     // COO格式
891     cudaMemset(d_yMAT, 0, sizeof(MAT));
892     cudaEventRecord(startGPU, 0);
893     dotCOOGPU << <CEIL(h_aMAT->count, THREAD_SIZE), THREAD_SIZE >> > (d_aCOO, d_xMAT, d_yMAT);
894     cudaMemcpy(h_yMAT_2, d_yMAT, sizeof(format) * ROW, cudaMemcpyDeviceToHost);
895     cudaDeviceSynchronize();
896     cudaEventRecord(stopGPU, 0);
897     cudaEventSynchronize(stopGPU);
898     cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);
899     printf("\n\tdotCOOGPU time:\t%8.3f ms\n", timeGPU);
900     if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW))
901         printf("\n\tError at i = %d\n\tcpu_out[i] = %10f, gpu_out[i] = %10f\n",
902             i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]);
903     else
904         printf("\n\tdotCOOGPU correct!\n");
905 
906     freeMAT(h_aMAT, 1);
907     freeMAT(h_xMAT, 1);
908     freeMAT(h_yMAT_1, 1);
909     freeMAT(h_yMAT_2, 1);
910     freeCSR(h_aCSR, 1);
911     freeELL(h_aELL, 1);
912     freeCOO(h_aCOO, 1);
913     freeMAT(d_aMAT, 0); 
914     freeMAT(d_xMAT, 0); 
915     freeMAT(d_yMAT, 0); 
916     freeCSR(d_aCSR, 0); 
917     freeELL(d_aELL, 0); 
918     freeCOO(d_aCOO, 0); 
919     getchar();
920     return 0;
921 }

 

以上是关于各种形式的系数矩阵存储方式及其与一个向量的乘法计算的主要内容,如果未能解决你的问题,请参考以下文章

最小二乘法--多特征(矩阵形式)

1.1 基本算法和记号

线性回归——最小二乘法

python与数据分析Numpy数值计算基础——补充

numpy数组与矩阵运算

向量矩阵乘法、浮点向量、二进制矩阵