各种形式的系数矩阵存储方式及其与一个向量的乘法计算
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了各种形式的系数矩阵存储方式及其与一个向量的乘法计算相关的知识,希望对你有一定的参考价值。
先放上完整代码,以后填坑。
1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <malloc.h> 4 #include <math.h> 5 #include <time.h> 6 #include "cuda_runtime.h" 7 #include "device_launch_parameters.h" 8 9 #define ROW (256+79) 10 #define COL (256+83) 11 #define EPSILON 0.1 // 系数矩阵非零元素占比 12 #define NON_ZEROS ROW * COL // 非零元素上限(用于分配MAT转ELL时的临时变量大小) 13 #define THREAD_SIZE 256 14 #define SEED 1 // (unsigned int)time(MULL) 15 #define MAX(x,y) ((x>y)?x:y) 16 #define CEIL(x,y) (int)(( x - 1 ) / y + 1) 17 #define INT false // 使用的数字格式 18 19 #if INT 20 typedef int format; 21 #else 22 typedef float format; 23 #endif 24 25 // 矩阵存储格式 26 typedef struct // Matrix 顺序格式 27 { 28 int row; // 行数 29 int col; // 列数 30 int count; // 元素个数(计算时用不到,而是用于转换) 31 format *data; // 实际元素的值 32 } 33 MAT; 34 35 typedef struct // Compressed Sparse Row 格式 36 { 37 int row; // 行数 38 int col; // 列数 39 format *data; // 实际元素的值 40 int *index; // 元素的列号 41 int *ptr; // 矩阵的行指针 42 } 43 CSR; 44 45 typedef struct // ELL 格式 46 { 47 int row; // 行数 48 int col; // 列数 49 format *data; // 实际元素的值 50 int *index; // 元素的列号 51 } 52 ELL; 53 54 typedef struct // Coordinate 格式 55 { 56 int row; // 行数 57 int col; // 列数 58 int count; // 元素个数 59 int *row_index; // 行向量 60 int *col_index; // 列向量 61 format *data; // 实际元素的值 62 } 63 COO; 64 65 // 通用函数 66 void checkNULL(const void *input) 67 { 68 if (input == NULL) 69 { 70 printf("\n\tfind a NULL!"); 71 exit(1); 72 } 73 return; 74 } 75 76 void checkCudaError(cudaError input) 77 { 78 if (input != cudaSuccess) 79 { 80 printf("\n\tfind a cudaError!"); 81 exit(1); 82 } 83 return; 84 } 85 86 int checkResult(const format * in1, const format * in2, const int length) 87 // 注意返回值为0(两向量相等)或者“值不等的元素下标加一”(防止0号元素就不相等),返回后若想使用该下标则需要减1 88 { 89 int i; 90 for (i = 0; i < length; i++) 91 { 92 if (in1[i] != in2[i]) 93 return i + 1; 94 } 95 return 0; 96 } 97 98 // 打印矩阵 99 void printMAT(MAT *in)// 打印MAT格式的矩阵 100 { 101 int i; 102 printf("\n\t"); 103 for (i = 0; i < in->row * in->col; i++) 104 { 105 #if INT 106 printf("%d ", in->data[i]); 107 #else 108 printf("%.2f ", in->data[i]); 109 #endif 110 if ((i + 1) % in->col == 0) 111 printf("\n\t"); 112 } 113 printf("\n"); 114 return; 115 } 116 117 void printCSR(const CSR *in)// 打印CSR格式的矩阵 118 { 119 int i; 120 printf("\n\t%d,%d", in->row, in->col); 121 printf("\n\t"); 122 for (i = 0; i < in->ptr[in->row]; i++) 123 #if INT 124 printf("%d ", in->data[i]); 125 #else 126 printf("%.2f ", in->data[i]); 127 #endif 128 printf("\n\t"); 129 for (i = 0; i < in->ptr[in->row]; i++) 130 printf("%d ", in->index[i]); 131 printf("\n\t"); 132 for (i = 0; i < in->row + 1; i++) 133 printf("%d ", in->ptr[i]); 134 printf("\n\n"); 135 return; 136 } 137 138 void printELL(const ELL *in)// 打印ELL格式的矩阵 139 { 140 int i; 141 printf("\n\t%d,%d", in->row, in->col); 142 printf("\n\t"); 143 for (i = 0; i < in->row * in->col; i++) 144 { 145 #if INT 146 printf("%d ", in->data[i]); 147 #else 148 printf("%.2f ", in->data[i]); 149 #endif 150 if ((i + 1) % in->col == 0) 151 printf("\n\t"); 152 } 153 printf("\n\t"); 154 for (i = 0; i < in->row * in->col; i++) 155 { 156 printf("%d ", in->index[i]); 157 if ((i + 1) % in->col == 0) 158 printf("\n\t"); 159 } 160 printf("\n\n"); 161 return; 162 } 163 164 void printCOO(const COO *in)// 打印ELL格式的矩阵 165 { 166 int i; 167 printf("\n\t%d,%d,%d", in->row, in->col, in->count); 168 printf("\n\t"); 169 for (i = 0; i < in->count; i++) 170 { 171 #if INT 172 printf("%d ", in->data[i]); 173 #else 174 printf("%.2f ", in->data[i]); 175 #endif 176 } 177 printf("\n\t"); 178 for (i = 0; i < in->count; i++) 179 { 180 printf("%d ", in->row_index[i]); 181 printf("%d ", in->col_index[i]); 182 } 183 printf("\n\n"); 184 return; 185 } 186 187 // 矩阵初始化与清理(有问题) 188 MAT *initializeMAT(const int row, const int col, const bool isCPU)// 初始化MAT 189 { 190 MAT *temp; 191 if (isCPU) 192 { 193 checkNULL(temp = (MAT *)malloc(sizeof(MAT))); 194 temp->row = row; 195 temp->col = col; 196 temp->count = 0; 197 checkNULL(temp->data = (format *)malloc(sizeof(format) * row * col)); 198 } 199 else 200 { 201 checkCudaError(cudaMalloc((void **)&temp, sizeof(MAT))); 202 temp->row = row; 203 temp->col = col; 204 temp->count = 0; 205 checkCudaError(cudaMalloc((void **)&temp->data, sizeof(format) * row * col)); 206 } 207 return temp; 208 } 209 210 int countMAT(MAT *in)// 统计MAT形式的矩阵中的非零元素个数 211 { 212 if (in == NULL) 213 return -1; 214 215 int i, out; 216 for (i = out = 0; i < in->row * in->col; i++) 217 out += (in->data[i] == 0); 218 return out; 219 } 220 221 int freeMAT(MAT *in, const bool isCPU)// 释放MAT 222 { 223 checkNULL(in); 224 if (isCPU) 225 { 226 free(in->data); 227 free(in); 228 } 229 else 230 { 231 cudaFree(in->data); 232 cudaFree(in); 233 } 234 return 0; 235 } 236 237 CSR * initializeCSR(const int row, const int col, const int count, const bool isCPU)// 初始化CSR 238 { 239 CSR *temp; 240 if (isCPU) 241 { 242 checkNULL(temp = (CSR *)malloc(sizeof(CSR))); 243 temp->row = row; 244 temp->col = col; 245 checkNULL(temp->data = (format *)malloc(sizeof(format) * count)); 246 checkNULL(temp->index = (int *)malloc(sizeof(int) * count)); 247 checkNULL(temp->ptr = (int *)malloc(sizeof(int) * (row + 1))); 248 } 249 else 250 { 251 checkCudaError(cudaMalloc((void **)&temp, sizeof(CSR))); 252 //temp->row = row; 253 //temp->col = col; 254 checkCudaError(cudaMalloc((void **)&temp->data, sizeof(format) * count)); 255 checkCudaError(cudaMalloc((void **)&temp->index, sizeof(int) * count)); 256 checkCudaError(cudaMalloc((void **)&temp->ptr, sizeof(int) * (row + 1))); 257 } 258 return temp; 259 } 260 261 int freeCSR(CSR *in, const bool isCPU)// 释放CSR 262 { 263 checkNULL(in); 264 if (isCPU) 265 { 266 free(in->data); 267 free(in->index); 268 free(in->ptr); 269 free(in); 270 } 271 else 272 { 273 cudaFree(in->data); 274 cudaFree(in->index); 275 cudaFree(in->ptr); 276 cudaFree(in); 277 } 278 return 0; 279 } 280 281 ELL * initializeELL(const int row, const int col, const bool isCPU)// 初始化ELL 282 { 283 ELL *temp; 284 if (isCPU) 285 { 286 checkNULL(temp = (ELL *)malloc(sizeof(ELL))); 287 temp->row = row; 288 temp->col = col; 289 checkNULL(temp->data = (format *)malloc(sizeof(format) * row * col)); 290 checkNULL(temp->index = (int *)malloc(sizeof(int) *row * col)); 291 } 292 else 293 { 294 checkCudaError(cudaMalloc((void **)&temp, sizeof(ELL))); 295 //temp->row = row; 296 //temp->col = col; 297 checkCudaError(cudaMalloc((void **)&temp->data, sizeof(format) * row * col)); 298 checkCudaError(cudaMalloc((void **)&temp->data, sizeof(int) * row * col)); 299 } 300 return temp; 301 } 302 303 int freeELL(ELL *in, const bool isCPU)// 释放ELL 304 { 305 checkNULL(in); 306 if (isCPU) 307 { 308 free(in->data); 309 free(in->index); 310 free(in); 311 } 312 else 313 { 314 cudaFree(in->data); 315 cudaFree(in->index); 316 cudaFree(in); 317 } 318 return 0; 319 } 320 321 COO * initializeCOO(const int row, const int col, const int count, const bool isCPU)// 初始化COO 322 { 323 COO * temp; 324 if (isCPU) 325 { 326 checkNULL(temp = (COO *)malloc(sizeof(COO))); 327 temp->row = row; 328 temp->col = col; 329 temp->count = count; 330 checkNULL(temp->data = (format *)malloc(sizeof(format) * count)); 331 checkNULL(temp->row_index = (int *)malloc(sizeof(int) * count)); 332 checkNULL(temp->col_index = (int *)malloc(sizeof(int) * count)); 333 } 334 else 335 { 336 checkCudaError(cudaMalloc((void **)&temp, sizeof(COO))); 337 //temp->row = row; 338 //temp->col = col; 339 //temp->count = count; 340 checkCudaError(cudaMalloc((void **)&temp->data, sizeof(format) * count)); 341 checkCudaError(cudaMalloc((void **)&temp->row_index, sizeof(int) * count)); 342 checkCudaError(cudaMalloc((void **)&temp->col_index, sizeof(int) * count)); 343 } 344 return temp; 345 } 346 347 int freeCOO(COO *in, const bool isCPU)// 释放COO 348 { 349 checkNULL(in); 350 if (isCPU) 351 { 352 free(in->data); 353 free(in->row_index); 354 free(in->col_index); 355 free(in); 356 } 357 else 358 { 359 cudaFree(in->data); 360 cudaFree(in->row_index); 361 cudaFree(in->col_index); 362 cudaFree(in); 363 } 364 return 0; 365 } 366 367 // 矩阵格式的相互转化 368 int MATToCSR(const MAT *in, CSR *out)// MAT格式转化为CSR格式 369 { 370 if (in == NULL || out == NULL) 371 return 1; 372 373 int i = 0, j = 0, k = 0; 374 out->row = in->row; // 行数和列数 375 out->col = in->col; 376 out->ptr[k++] = 0; 377 for (; i < in->row * in->col; i++) 378 { 379 if (in->data[i] != 0) // 找到非零元 380 { 381 if (j == in->count) // 在out->data已经填满了的基础上又发现了非零元 382 return 1; 383 out->data[j] = in->data[i]; // 填充非零元素 384 out->index[j] = i % in->col; // 填充列号 385 j++; 386 } 387 if ((i + 1) % in->col == 0) // 到了最后一列,写入行指针号 388 out->ptr[k++] = j; 389 } 390 return 0; 391 } 392 393 int CSRToMAT(const CSR *in, MAT *out)// CSR转MAT 394 { 395 if (in == NULL || out == NULL) 396 return 1; 397 398 int i, j; 399 out->row = in->row; 400 out->col = in->col; 401 for (i = 0; i < in->row; i++) 402 { 403 for (j = 0; j < in->col; j++) 404 out->data[i*in->col + j] = 0; 405 for (j = in->ptr[i]; j < in->ptr[i + 1]; j++) 406 out->data[i*in->col + in->index[j]] = in->data[j]; 407 } 408 return 0; 409 } 410 411 int MATToELL(const MAT *in, ELL *out)// MAT转ELL 412 { 413 if (in == NULL || out == NULL) 414 return 1; 415 416 int i, j, k; 417 format temp_data[NON_ZEROS]; 418 int temp_index[NON_ZEROS]; 419 for (i = j = k = 0; i < in->row*in->col; i++)// 寻找最长的行,同时把in中每行的元素往左边推 420 { 421 temp_data[i] = 0; 422 temp_index[i] = 0; 423 if (in->data[i] != 0) // 找到非零元 424 { 425 temp_data[i / in->col*in->col + j] = in->data[i]; // 存放元素 426 temp_index[i / in->col*in->col + j] = i % in->col; // 记录所在的列号 427 j++; // j中为改行已经找到的非零元个数 428 } 429 if ((i + 1) % in->col == 0) // 找到了行末 430 { 431 k = MAX(j, k); // 更新最大非零元素个数 432 j = 0; // 开始下一行之前清空j 433 } 434 } 435 out->row = k; // 找到了原矩阵最大非零元个数 436 out->col = in->row; // 新矩阵尺寸相当于原矩阵的转置 437 for (i = 0; i < in->row*in->col; i++) // 使用新的列数重构原来的out 438 { 439 temp_data[i - i / in->col*(in->col - k)] = temp_data[i];// 将data和index往前挪 440 temp_index[i - i / in->col*(in->col - k)] = temp_index[i]; 441 if (i >= out->row * k) 442 { 443 temp_data[i] = 0; 444 temp_index[i] = 0; 445 } 446 } 447 for (i = 0; i < out->row*out->col; i++) // 转置 448 { 449 out->data[i] = temp_data[i % out->col * k + i / out->col]; 450 out->index[i] = temp_index[i % out->col * k + i / out->col]; 451 } 452 for (; i < in->row*in->col; i++) // 补0 453 { 454 out->data[i] = 0; 455 out->index[i] = 0; 456 } 457 return 0; 458 } 459 460 int ELLToMAT(const ELL *in, MAT *out)// ELL转MAT 461 { 462 if (in == NULL || out == NULL) 463 return 1; 464 return 0; 465 } 466 467 int MATToCOO(const MAT *in, COO *out)// MAT转COO 468 { 469 if (in == NULL || out == NULL) 470 return 1; 471 472 int i, j; 473 out->row = in->row; 474 out->col = in->col; 475 for (i = j = 0; i < in->row * in->col; i++) 476 { 477 if (in->data[i] != 0) 478 { 479 out->data[j] = in->data[i]; 480 out->row_index[j] = i / in->col; 481 out->col_index[j] = i % in->col; 482 j++; 483 } 484 } 485 out->count = j; 486 return 0; 487 } 488 489 int COOToMAT(const COO *in, MAT *out)// COO转MAT 490 { 491 if (in == NULL || out == NULL) 492 return 1; 493 494 int i; 495 out->row = in->row; 496 out->col = in->col; 497 for (i = 0; i < in->row * in->col; i++) 498 out->data[i] = 0; 499 for (i = 0; i < in->count; i++) 500 out->data[in->row_index[i] * in->col + in->col_index[i]] = in->data[i]; 501 return 0; 502 } 503 504 int CSRToELL(const CSR *in, ELL *out)// CSR转ELL 505 { 506 if (in == NULL || out == NULL) 507 return 1; 508 return 0; 509 } 510 511 int ELLToCSR(const ELL *in, CSR *out)// ELL转CSR 512 { 513 if (in == NULL || out == NULL) 514 return 1; 515 return 0; 516 } 517 518 int CSRToCOO(const CSR *in, COO *out)// CSR转COO 519 { 520 if (in == NULL || out == NULL) 521 return 1; 522 return 0; 523 } 524 525 int COOToCSR(const COO *in, CSR *out)// COO转CSR 526 { 527 if (in == NULL || out == NULL) 528 return 1; 529 return 0; 530 } 531 532 int ELLToCOO(const ELL *in, COO *out)// ELL转COO 533 { 534 if (in == NULL || out == NULL) 535 return 1; 536 return 0; 537 } 538 539 int COOToELL(const COO *in, ELL *out)// COO转ELL 540 { 541 if (in == NULL || out == NULL) 542 return 1; 543 544 return 0; 545 } 546 547 // 各种形式的矩阵和一个向量的乘法 548 int dotMATCPU(const MAT *a, const MAT *x, MAT *y)// CPU MAT乘法 549 { 550 if (a == NULL || x == NULL || y == NULL || a->col != x->row) 551 return 1; 552 553 int i; 554 format sum; 555 y->row = a->row; 556 y->col = x->col; 557 for (i = 0, sum = 0; i < a->row * a->col; i++) 558 { 559 sum += a->data[i] * x->data[i % a->col]; 560 if ((i + 1) % a->col == 0) 561 { 562 y->data[i / a->col] = sum; 563 sum = 0; 564 } 565 } 566 return 0; 567 } 568 569 int dotCSRCPU(const CSR *a, const MAT *x, MAT *y)// CPU CSR乘法 570 { 571 if (a == NULL || x == NULL || y == NULL || a->col != x->row) 572 return 1; 573 574 int i, j; 575 format sum; 576 y->row = a->row; 577 y->col = x->col; 578 for (i = 0; i < a->row; i++) 579 { 580 for (sum = 0, j = a->ptr[i]; j < a->ptr[i + 1]; j++) 581 sum += a->data[j] * x->data[a->index[j]]; 582 y->data[i] = sum; 583 } 584 return 0; 585 } 586 587 int dotELLCPU(const ELL *a, const MAT *x, MAT *y)// CPU ELL乘法 588 { 589 if (a == NULL || x == NULL || y == NULL || a->col != x->row) 590 return 1; 591 592 int i, j; 593 format sum; 594 y->row = a->row; 595 y->col = x->col; 596 for (i = 0; i<a->row; i++) 597 { 598 for (sum = 0, j = 0; j < a->col; j++) 599 sum += a->data[i + j*a->row] * x->data[a->index[i + j*a->row]]; 600 y->data[i] = sum; 601 } 602 return 0; 603 } 604 605 int dotCOOCPU(const COO *a, const MAT *x, MAT *y)// CPU COO乘法 606 { 607 if (a == NULL || x == NULL || y == NULL || a->col != x->row) 608 return 1; 609 610 int i; 611 y->row = a->row; 612 y->col = x->col; 613 for (i = 0; i < a->row; i++) 614 y->data[i] = 0; 615 for (i = 0; i<a->count; i++) 616 y->data[a->row_index[i]] += a->data[i] * x->data[a->col_index[i]]; 617 return 0; 618 } 619 620 __global__ void dotMATGPU(const MAT *a, const MAT *x, MAT *y)// GPU MAT乘法 621 { 622 int id = blockIdx.x * blockDim.x + threadIdx.x; 623 if (id == 0) 624 { 625 y->row = a->row; 626 y->col = x->col; 627 } 628 if (id < a->row) 629 { 630 int i; 631 format sum = 0; 632 for (i = 0; i < a->col; i++) 633 sum += a->data[id * a->col + i] * x->data[i]; 634 y->data[id] = sum; 635 } 636 return; 637 } 638 639 __global__ void dotCSRGPU(const CSR *a, const MAT *x, MAT *y)// GPU CSR乘法 640 { 641 int id = blockIdx.x * blockDim.x + threadIdx.x; 642 if (id == 0) 643 { 644 y->row = a->row; 645 y->col = x->col; 646 } 647 if (id < a->row) 648 { 649 int j; 650 format sum; 651 for (sum = 0, j = a->ptr[id]; j < a->ptr[id + 1]; j++) 652 sum += a->data[j] * x->data[a->index[j]]; 653 y->data[id] = sum; 654 } 655 return; 656 } 657 658 __global__ void dotELLGPU(const ELL *a, const MAT *x, MAT *y)// GPU ELL乘法 659 { 660 int id = blockIdx.x * blockDim.x + threadIdx.x; 661 if (id == 0) 662 { 663 y->row = a->row; 664 y->col = x->col; 665 } 666 if (id < a->row) 667 { 668 int j; 669 format sum; 670 for (sum = 0, j = 0; j < a->col; j++) 671 sum += a->data[id + j*a->row] * x->data[a->index[id + j*a->row]]; 672 y->data[id] = sum; 673 } 674 return; 675 } 676 677 __global__ void dotCOOGPU(const COO *a, const MAT *x, MAT *y)// GPU COO乘法 678 { 679 int id = blockIdx.x * blockDim.x + threadIdx.x; 680 if (id == 0) 681 { 682 y->row = a->row; 683 y->col = x->col; 684 } 685 if (id < a->count) 686 atomicAdd(&(y->data[a->row_index[id]]), a->data[id] * x->data[a->col_index[id]]); 687 return; 688 } 689 690 int test()// 测试矩阵打印和CPU计算的函数 691 { 692 int i; 693 MAT a, c, x, y; 694 CSR b; 695 ELL d; 696 COO e; 697 698 a.row = 4, a.col = 5; 699 for (i = 0; i < a.row * a.col; i++) 700 a.data[i] = 0; 701 a.data[0] = 3, a.data[2] = 1, a.data[4] = 5, a.data[11] = 2; 702 a.data[12] = 4, a.data[13] = 1, a.data[15] = 1, a.data[18] = 1; 703 704 x.row = a.col, x.col = 1; 705 for (i = 0; i < x.row; i++) 706 x.data[i] = i; 707 708 printMAT(&a); // MAT型 709 710 MATToCSR(&a, &b); // MAT转CSR 711 printCSR(&b); 712 713 CSRToMAT(&b, &c); // CSR转回MAT型 714 printMAT(&c); 715 716 MATToELL(&a, &d); // ELL型 717 printELL(&d); 718 719 MATToCOO(&a, &e); // MAT转COO 720 printCOO(&e); 721 722 COOToMAT(&e, &c); // COO转回MAT 723 printMAT(&c); 724 725 dotMATCPU(&a, &x, &y); // MAT乘法 726 printMAT(&y); 727 728 dotCSRCPU(&b, &x, &y); // CSR乘法 729 printMAT(&y); 730 731 dotELLCPU(&d, &x, &y); // ELL乘法 732 printMAT(&y); 733 734 dotCOOCPU(&e, &x, &y); // COO乘法 735 printMAT(&y); 736 737 getchar(); 738 return 0; 739 } 740 741 int main() 742 { 743 int i; 744 MAT *h_aMAT, *d_aMAT, *h_xMAT, *d_xMAT, *h_yMAT_1, *h_yMAT_2, *d_yMAT; 745 CSR *h_aCSR, *d_aCSR; 746 ELL *h_aELL, *d_aELL; 747 COO *h_aCOO, *d_aCOO; 748 749 clock_t timeCPU; 750 cudaEvent_t startGPU, stopGPU; 751 float timeGPU; 752 cudaEventCreate(&startGPU); 753 cudaEventCreate(&stopGPU); 754 755 printf("\n\tStart! Matrix dimension:\t%d × %d", ROW, COL); 756 757 h_aMAT = initializeMAT(ROW, COL, 1); 758 h_xMAT = initializeMAT(COL, 1, 1); 759 h_yMAT_1 = initializeMAT(ROW, 1, 1); 760 h_yMAT_2 = initializeMAT(ROW, 1, 1); 761 762 // 初始化 763 timeCPU = clock(); 764 srand(SEED); 765 #if INT 766 for (i = 0; i < COL * ROW; i++) 767 h_aMAT->data[i] = ((float)rand() < RAND_MAX * EPSILON) ? (rand() - RAND_MAX / 2) : 0; 768 h_aMAT->count = countMAT(h_aMAT); 769 for (i = 0; i < COL; i++) 770 h_xMAT->data[i] = (rand() - RAND_MAX / 2); 771 //h_xMAT->count = countMAT(h_xMAT); 772 #else 773 for (i = 0; i < COL * ROW; i++) 774 h_aMAT->data[i] = ((float)rand() < RAND_MAX * EPSILON) ? ((float)rand() / RAND_MAX) : 0; 775 h_aMAT->count = countMAT(h_aMAT); 776 for (i = 0; i < COL; i++) 777 h_xMAT->data[i] = ((float)rand() / RAND_MAX); 778 //h_xMAT->count = countMAT(h_xMAT); 779 #endif 780 printf("\n\tInitialized! Time:\t\t%8.3f ms\n", (format)(clock() - timeCPU)); 781 782 //CPU计算部分 783 //MAT 格式 784 timeCPU = clock(); 785 dotMATCPU(h_aMAT, h_xMAT, h_yMAT_1); 786 timeCPU = clock() - timeCPU; 787 printf("\n\tdotMATCPU time:\t%8.3f ms\n", (float)timeCPU); 788 789 // CSR格式 790 h_aCSR = initializeCSR(ROW, COL, h_aMAT->count, 1); 791 MATToCSR(h_aMAT, h_aCSR); 792 timeCPU = clock(); 793 dotCSRCPU(h_aCSR, h_xMAT, h_yMAT_2); 794 timeCPU = clock() - timeCPU; 795 printf("\n\tdotCSRCPU time:\t%8.3f ms\n", (float)timeCPU); 796 if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW)) 797 printf("\n\tError at i = %d\n\tcpu_out1[i] = %10f, h_yMAT_2[i] = %10f\n", 798 i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]); 799 else 800 printf("\n\tdotCSRCPU correct!\n"); 801 802 // ELL格式 803 h_aELL = initializeELL(ROW, COL, 1); 804 MATToELL(h_aMAT, h_aELL); 805 timeCPU = clock(); 806 dotELLCPU(h_aELL, h_xMAT, h_yMAT_2); 807 timeCPU = clock() - timeCPU; 808 printf("\n\tdotCELLPU time:\t%8.3f ms\n", (float)timeCPU); 809 if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW)) 810 printf("\n\tError at i = %d\n\tcpu_out1[i] = %10f, h_yMAT_2[i] = %10f\n", 811 i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]); 812 else 813 printf("\n\tdotELLCPU correct!\n"); 814 815 // COO格式 816 h_aCOO = initializeCOO(ROW, COL, h_aMAT->count, 1); 817 MATToCOO(h_aMAT, h_aCOO); 818 timeCPU = clock(); 819 dotCOOCPU(h_aCOO, h_xMAT, h_yMAT_2); 820 timeCPU = clock() - timeCPU; 821 printf("\n\tdotCOOCPU time:\t%8.3f ms\n", (float)timeCPU); 822 if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW)) 823 printf("\n\tError at i = %d\n\tcpu_out1[i] = %10f, h_yMAT_2[i] = %10f\n", 824 i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]); 825 else 826 printf("\n\tdotCOOCPU correct!\n"); 827 828 // GPU计算部分 829 d_aMAT = initializeMAT(ROW, COL, 0); 830 d_xMAT = initializeMAT(ROW, COL, 0); 831 d_yMAT = initializeMAT(ROW, COL, 0); 832 d_aCSR = initializeCSR(ROW, COL, h_aMAT->count, 0); 833 d_aELL = initializeELL(ROW, COL, 0); 834 d_aCOO = initializeCOO(ROW, COL, h_aMAT->count, 0); 835 836 cudaMemcpy(d_aMAT, h_aMAT, sizeof(MAT), cudaMemcpyHostToDevice); 837 cudaMemcpy(d_xMAT, h_xMAT, sizeof(format) * COL, cudaMemcpyHostToDevice); 838 cudaMemcpy(d_aCSR, h_aCSR, sizeof(CSR), cudaMemcpyHostToDevice); 839 cudaMemcpy(d_aELL, h_aELL, sizeof(CSR), cudaMemcpyHostToDevice); 840 cudaMemcpy(d_aCOO, h_aCOO, sizeof(CSR), cudaMemcpyHostToDevice); 841 842 // MAT格式 843 cudaMemset(d_yMAT, 0, sizeof(MAT)); 844 cudaEventRecord(startGPU, 0); 845 dotMATGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (d_aMAT, d_xMAT, d_yMAT); 846 cudaMemcpy(h_yMAT_2, d_yMAT, sizeof(format) * ROW, cudaMemcpyDeviceToHost); 847 cudaDeviceSynchronize(); 848 cudaEventRecord(stopGPU, 0); 849 cudaEventSynchronize(stopGPU); 850 cudaEventElapsedTime(&timeGPU, startGPU, stopGPU); 851 printf("\n\tdotMATGPU time:\t%8.3f ms\n", timeGPU); 852 if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW)) 853 printf("\n\tError at i = %d\n\tcpu_out[i] = %10f, gpu_out[i] = %10f\n", 854 i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]); 855 else 856 printf("\n\tdotMATGPU correct!\n"); 857 858 // CSR格式 859 cudaMemset(d_yMAT, 0, sizeof(MAT)); 860 cudaEventRecord(startGPU, 0); 861 dotCSRGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (d_aCSR, d_xMAT, d_yMAT); 862 cudaMemcpy(h_yMAT_2, d_yMAT, sizeof(format) * ROW, cudaMemcpyDeviceToHost); 863 cudaDeviceSynchronize(); 864 cudaEventRecord(stopGPU, 0); 865 cudaEventSynchronize(stopGPU); 866 cudaEventElapsedTime(&timeGPU, startGPU, stopGPU); 867 printf("\n\tdotCSRGPU time:\t%8.3f ms\n", timeGPU); 868 if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW)) 869 printf("\n\tError at i = %d\n\tcpu_out[i] = %10f, gpu_out[i] = %10f\n", 870 i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]); 871 else 872 printf("\n\tdotCSRGPU correct!\n"); 873 874 // ELL格式 875 cudaMemset(d_yMAT, 0, sizeof(MAT)); 876 cudaEventRecord(startGPU, 0); 877 dotELLGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (d_aELL, d_xMAT, d_yMAT); 878 cudaMemcpy(h_yMAT_2, d_yMAT, sizeof(format) * ROW, cudaMemcpyDeviceToHost); 879 cudaDeviceSynchronize(); 880 cudaEventRecord(stopGPU, 0); 881 cudaEventSynchronize(stopGPU); 882 cudaEventElapsedTime(&timeGPU, startGPU, stopGPU); 883 printf("\n\tdotELLGPU time:\t%8.3f ms\n", timeGPU); 884 if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW)) 885 printf("\n\tError at i = %d\n\tcpu_out[i] = %10f, gpu_out[i] = %10f\n", 886 i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]); 887 else 888 printf("\n\tdotELLGPU correct!\n"); 889 890 // COO格式 891 cudaMemset(d_yMAT, 0, sizeof(MAT)); 892 cudaEventRecord(startGPU, 0); 893 dotCOOGPU << <CEIL(h_aMAT->count, THREAD_SIZE), THREAD_SIZE >> > (d_aCOO, d_xMAT, d_yMAT); 894 cudaMemcpy(h_yMAT_2, d_yMAT, sizeof(format) * ROW, cudaMemcpyDeviceToHost); 895 cudaDeviceSynchronize(); 896 cudaEventRecord(stopGPU, 0); 897 cudaEventSynchronize(stopGPU); 898 cudaEventElapsedTime(&timeGPU, startGPU, stopGPU); 899 printf("\n\tdotCOOGPU time:\t%8.3f ms\n", timeGPU); 900 if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW)) 901 printf("\n\tError at i = %d\n\tcpu_out[i] = %10f, gpu_out[i] = %10f\n", 902 i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]); 903 else 904 printf("\n\tdotCOOGPU correct!\n"); 905 906 freeMAT(h_aMAT, 1); 907 freeMAT(h_xMAT, 1); 908 freeMAT(h_yMAT_1, 1); 909 freeMAT(h_yMAT_2, 1); 910 freeCSR(h_aCSR, 1); 911 freeELL(h_aELL, 1); 912 freeCOO(h_aCOO, 1); 913 freeMAT(d_aMAT, 0); 914 freeMAT(d_xMAT, 0); 915 freeMAT(d_yMAT, 0); 916 freeCSR(d_aCSR, 0); 917 freeELL(d_aELL, 0); 918 freeCOO(d_aCOO, 0); 919 getchar(); 920 return 0; 921 }
以上是关于各种形式的系数矩阵存储方式及其与一个向量的乘法计算的主要内容,如果未能解决你的问题,请参考以下文章