稀疏矩阵及其压缩格式
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了稀疏矩阵及其压缩格式相关的知识,希望对你有一定的参考价值。
参考技术A 一般情况下,稀疏矩阵指的是元素大部分是0的矩阵(有些资料定义非零元素不超过5%的矩阵,为稀疏矩阵), 矩阵的稀疏性可以用一个分数来量化,即矩阵中零元素的个数除以矩阵中元素的总数。存储稀疏矩阵时只描述其非零元素的值及所在位置, tensorflow的sparse_tensor类型还会存储稀疏矩阵的形状.存储稀疏矩阵时常用的有如下三种压缩格式:
这种存储格式比较简单易懂,每一个元素需要用一个三元组来表示,分别是(行号,列号,数值),对应上图右边的一列。这种方式简单,但是记录单信息多(行列),每个三元组自己可以定位,因此空间不是最优。
这是经常用的一种,我们会经常在一些标准的线性代数库或者数值运算库中看到此方式存储;CSR是比较标准的一种,也需要三类数据来表达:数值,列号,以及行偏移。CSR不是三元组,而是整体的编码方式。数值和列号与COO一致,表示一个元素以及其列号,行偏移表示某一行的第一个元素在values里面的起始偏移位置。如上图中,第一行的第一个元素1在values中是第0个, 所以是0偏移,第二行元素第一个元素2是2偏移,第三行第一个元素5是4偏移,第4行第一个元素6是7偏移。在行偏移的最后补上矩阵总的元素个数,本例中一共是9个非零元素。
CSC是和CSR相对应的一种方式,即按列压缩的意思。
以上图中矩阵为例:
Column Offsets:[0 2 5 7 9]
Row Indices:[0 2 0 1 3 1 2 2 3]
Values: [1 5 7 2 6 8 3 9 4]
Values中的元素要按列写, 跟COO和CSR不同, 指定了Values的元素顺序之后就可以写Row Indices了, 然后根据每一列第一个元素在Values中的位置确定偏移量Column Offsets. 如第一列第一个元素1是0偏移, 第二列第一个元素7是2偏移, 第三列第一个元素8是5偏移, 第四列第一个元素9是7偏移, 共9个元素.
稀疏矩阵 part 5
? 目前为止能跑的所有代码及其结果(2019年2月24日),之后添加:DIA 乘法 GPU 版;其他维度的乘法(矩阵乘矩阵);其他稀疏矩阵格式之间的相互转化
1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <memory.h> 4 #include <malloc.h> 5 #include <math.h> 6 #include <time.h> 7 #include "cuda_runtime.h" 8 #include "device_launch_parameters.h" 9 10 #define ROW (8192) 11 #define COL (8192) 12 #define RATIO 0.1 // 系数矩阵非零元素占比 13 #define EPSILON 0.0001 // 浮点数比较 14 #define THREAD_SIZE 256 15 #define SEED 1 // (unsigned int)time(NULL) 16 #define MAX(x,y) (((x)>(y))?(x):(y)) 17 #define CEIL(x,y) (int)(( x - 1 ) / y + 1) 18 #define INT // 数字格式用 int 还是 float 19 20 #ifdef INT 21 typedef int format; 22 #else 23 typedef float format; 24 #endif 25 26 // 矩阵存储格式 27 typedef struct // 顺序格式 28 { 29 int row; // 行数 30 int col; // 列数 31 int count; // 非零元个数(用于转换,不用于计算) 32 format *data; // 元素的值 33 } 34 MAT; 35 36 typedef struct // Compressed Sparse Row 格式 37 { 38 int row; // 行数 39 int col; // 列数 40 format *data; // 元素的值 41 int *index; // 元素的列号 42 int *ptr; // 每行首元在 index 中的下标,最后一个元素的值等于矩阵非零元个数 43 } 44 CSR; 45 46 typedef struct // Compressed Sparse Row 格式 47 { 48 int row; // 行数 49 int col; // 列数 50 format *data; // 元素的值 51 int *index; // 元素的行号 52 int *ptr; // 每列首元在 index 中的下标,最后一个元素的值等于矩阵非零元个数 53 } 54 CSC; 55 56 57 typedef struct // ELLPACK 格式 58 { 59 int row; // 行数 60 int col; // 列数,相当于MAT格式的行数 61 int colOrigin; // 原列数,相当于MAT格式的列数 62 format *data; // 元素的值 63 int *index; // 元素在MAT格式中的列号 64 } 65 ELL; 66 67 typedef struct // Coordinate 格式 68 { 69 int row; // 行数 70 int col; // 列数 71 int count; // 非零元个数 72 int *rowIndex; // 行向量 73 int *colIndex; // 列向量 74 format *data; // 元素的值 75 } 76 COO; 77 78 typedef struct // Diagonal 格式 79 { 80 int row; // 行数 81 int col; // 列数 82 int colOrigin; // 原列数 83 format *data; // 元素的值 84 int *index; // 原矩阵各对角线是否非零 85 } 86 DIA; 87 88 // 全局指针 89 __managed__ MAT *aMAT, *xMAT, *yMATRef, *yMATCal; 90 __managed__ CSR *aCSR; 91 __managed__ ELL *aELL; 92 __managed__ COO *aCOO; 93 94 // 通用函数 95 __host__ __device__ inline void checkNULL(const void *input) // 有点问题,设备函数无法使用 exit(1) 来推出 96 { 97 if (input == NULL) 98 printf(" find a NULL!"); 99 return; 100 } 101 102 __host__ inline void checkCudaError(cudaError input) 103 { 104 if (input != cudaSuccess) 105 { 106 printf(" find a cudaError!"); 107 exit(1); 108 } 109 return; 110 } 111 112 int checkEqual(const format * in1, const format * in2, const int length)// 注意两向量相等时返 0,否则返回“值不等的元素下标加一” 113 { 114 int i; 115 for (i = 0; i < length; i++) 116 { 117 #ifdef INT 118 if (in1[i] != in2[i]) 119 #else 120 if (fabs(in1[i] - in2[i]) > EPSILON) 121 #endif 122 { 123 printf(" Error at i = %d in1[i] = %10f, in2[i] = %10f ", i, (float)in1[i], (float)in2[i]); 124 return i + 1; 125 } 126 } 127 printf(" Check output, passed. "); 128 return 0; 129 } 130 131 // 打印矩阵 132 void print(const char* info, const MAT *in)// 打印MAT格式的矩阵 133 { 134 printf("%s MAT, row = %d, col = %d, count = %d", info, in->row, in->col, in->count); 135 printf(" data = "); 136 for (int i = 0; i < in->row * in->col; i++) 137 { 138 #ifdef INT 139 printf("%d ", in->data[i]); 140 #else 141 printf("%.2f ", in->data[i]); 142 #endif 143 if ((i + 1) % in->col == 0) 144 printf(" "); 145 } 146 printf(" "); 147 return; 148 } 149 150 void print(const char* info, const CSR *in)// 打印CSR格式的矩阵 151 { 152 printf("%s CSR, row = %d, col = %d", info, in->row, in->col); 153 printf(" data = "); 154 for (int i = 0; i < in->ptr[in->row]; i++) 155 #ifdef INT 156 printf("%d ", in->data[i]); 157 #else 158 printf("%.2f ", in->data[i]); 159 #endif 160 printf(" index = "); 161 for (int i = 0; i < in->ptr[in->row]; i++) 162 printf("%d ", in->index[i]); 163 printf(" ptr = "); 164 for (int i = 0; i < in->row + 1; i++) 165 printf("%d ", in->ptr[i]); 166 printf(" "); 167 return; 168 } 169 170 void print(const char* info, const ELL *in)// 打印ELL格式的矩阵 171 { 172 int i; 173 printf("%s ELL, row = %d, col = %d, colOrigin = %d", info, in->row, in->col, in->colOrigin); 174 printf(" data = "); 175 for (i = 0; i < in->row * in->col; i++) 176 { 177 #ifdef INT 178 printf("%d ", in->data[i]); 179 #else 180 printf("%.2f ", in->data[i]); 181 #endif 182 if ((i + 1) % in->col == 0) 183 printf(" "); 184 } 185 printf("index = "); 186 for (i = 0; i < in->row * in->col; i++) 187 { 188 printf("%d ", in->index[i]); 189 if ((i + 1) % in->col == 0) 190 printf(" "); 191 } 192 printf(" "); 193 return; 194 } 195 196 void print(const char* info, const COO *in)// 打印COO格式的矩阵 197 { 198 int i; 199 printf("%s COO, row = %d, col = %d, count = %d", info, in->row, in->col, in->count); 200 printf(" (rowIndex, colIndex, data) = "); 201 for (i = 0; i < in->count; i++) 202 { 203 #ifdef INT 204 printf("(%d,%d,%d) ", in->rowIndex[i], in->colIndex[i], in->data[i]); 205 #else 206 printf("(%d,%d,%.2f) ", in->rowIndex[i], in->colIndex[i], in->data[i]); 207 #endif 208 } 209 printf(" "); 210 return; 211 } 212 213 void print(const char* info, const DIA *in)// 打印DIA格式的矩阵 214 { 215 int i; 216 printf("%s DIA, row = %d, col = %d, colOrigin = %d", info, in->row, in->col, in->colOrigin,in->colOrigin); 217 printf(" data = "); 218 int * inverseIndex = (int *)malloc(sizeof(int) * in->col); 219 for (int i = 0, j = 0; i < in->row + in->col - 1; i++) 220 { 221 if (in->index[i] == 1) 222 { 223 inverseIndex[j] = i; 224 j++; 225 } 226 } 227 for (i = 0; i < in->row * in->col; i++) 228 { 229 if (i / in->col < in->row - 1 - inverseIndex[i % in->col] || i / in->col > inverseIndex[in->col - 1] - inverseIndex[i % in->col]) 230 printf("* "); 231 else 232 #ifdef INT 233 printf("%d ", in->data[i]); 234 #else 235 printf("%.2f ", in->data[i]); 236 #endif 237 if ((i + 1) % in->col == 0) 238 printf(" "); 239 } 240 printf("index = "); 241 for (i = 0; i < in->row + in->col - 1; i++) 242 printf("%d ", in->index[i]); 243 printf(" "); 244 free(inverseIndex); 245 return; 246 } 247 248 // 矩阵初始化与清理 249 MAT *initializeMAT(const int row, const int col, const int count = 0)// 初始化MAT,注意非零元默认为 0 250 { 251 MAT *temp; 252 checkCudaError(cudaMallocManaged((void **)&temp, sizeof(MAT))); 253 temp->row = row; 254 temp->col = col; 255 temp->count = count; 256 checkCudaError(cudaMallocManaged((void **)&temp->data, sizeof(format) * row * col)); 257 return temp; 258 } 259 260 // 统计MAT形式的矩阵中的非零元素个数 261 #define COUNT_MAT(in) 262 { 263 checkNULL(in); 264 int i, zero; 265 for (i = zero = 0; i < in->row * in->col; i++) 266 zero += !in->data[i]; 267 in->count = in->row * in->col - zero; 268 } 269 270 int freeMatrix(MAT *in)// 释放MAT 271 { 272 checkNULL(in); 273 cudaFree(in->data); 274 cudaFree(in); 275 return 0; 276 } 277 278 CSR * initializeCSR(const int row, const int col, const int count)// 初始化CSR,要求给出 count 279 { 280 CSR *temp; 281 checkCudaError(cudaMallocManaged((void **)&temp, sizeof(CSR))); 282 temp->row = row; 283 temp->col = col; 284 checkCudaError(cudaMallocManaged((void **)&temp->data, sizeof(format) * count)); 285 checkCudaError(cudaMallocManaged((void **)&temp->index, sizeof(int) * count)); 286 checkCudaError(cudaMallocManaged((void **)&temp->ptr, sizeof(int) * (row + 1))); 287 return temp; 288 } 289 290 int freeMatrix(CSR *in)// 释放CSR 291 { 292 checkNULL(in); 293 cudaFree(in->data); 294 cudaFree(in->index); 295 cudaFree(in->ptr); 296 cudaFree(in); 297 return 0; 298 } 299 300 ELL * initializeELL(const int row, const int col, const int colOrigin)// 初始化ELL,要求给出原列数 301 { 302 ELL *temp; 303 checkCudaError(cudaMallocManaged((void **)&temp, sizeof(ELL))); 304 temp->row = row; 305 temp->col = col; 306 temp->colOrigin = colOrigin; 307 checkCudaError(cudaMallocManaged((void **)&temp->data, sizeof(format) * row * col)); 308 checkCudaError(cudaMallocManaged((void **)&temp->index, sizeof(int) * row * col)); 309 return temp; 310 } 311 312 int freeMatrix(ELL *in)// 释放ELL 313 { 314 cudaFree(in->data); 315 cudaFree(in->index); 316 cudaFree(in); 317 return 0; 318 } 319 320 COO * initializeCOO(const int row, const int col, const int count)// 初始化COO,要求给出 count 321 { 322 COO * temp; 323 checkCudaError(cudaMallocManaged((void **)&temp, sizeof(COO))); 324 temp->row = row; 325 temp->col = col; 326 temp->count = count; 327 checkCudaError(cudaMallocManaged((void **)&temp->data, sizeof(format) * count)); 328 checkCudaError(cudaMallocManaged((void **)&temp->rowIndex, sizeof(int) * count)); 329 checkCudaError(cudaMallocManaged((void **)&temp->colIndex, sizeof(int) * count)); 330 return temp; 331 } 332 333 int freeMatrix(COO *in)// 释放COO 334 { 335 checkNULL(in); 336 cudaFree(in->data); 337 cudaFree(in->rowIndex); 338 cudaFree(in->colIndex); 339 cudaFree(in); 340 return 0; 341 } 342 343 DIA * initializeDIA(const int row, const int col, const int colOrigin)// 初始化DIA,要求给出原行数、原列数 344 { 345 DIA * temp; 346 checkCudaError(cudaMallocManaged((void **)&temp, sizeof(DIA))); 347 temp->row = row; 348 temp->col = col; 349 temp->colOrigin = colOrigin; 350 checkCudaError(cudaMallocManaged((void **)&temp->data, sizeof(format) * row * col)); 351 checkCudaError(cudaMallocManaged((void **)&temp->index, sizeof(format) * (row + col - 1))); 352 return temp; 353 } 354 355 int freeMatrix(DIA *in)// 释放DIA 356 { 357 checkNULL(in); 358 cudaFree(in->data); 359 cudaFree(in->index); 360 cudaFree(in); 361 return 0; 362 } 363 364 // 矩阵格式的相互转化 365 CSR * MATToCSR(const MAT *in) // MAT 转 CSR 366 { 367 checkNULL(in); 368 CSR * out = initializeCSR(in->row, in->col, in->count); 369 checkNULL(out); 370 371 out->ptr[0] = 0; 372 for (int i = 0, j = 0, k = 1; i < in->row * in->col; i++) // i 遍历 in->data 373 { 374 if (in->data[i] != 0) // 找到非零元 375 { 376 if (j == in->count) // 在 out->data 已经填满了的基础上又发现了非零元,错误 377 return NULL; 378 out->data[j] = in->data[i]; // 填充非零元素 379 out->index[j] = i % in->col; // 填充列号 380 j++; 381 } 382 if ((i + 1) % in->col == 0) // 到了最后一列,写入行指针号 383 out->ptr[k++] = j; 384 } 385 return out; 386 } 387 388 MAT * CSRToMAT(const CSR *in) // CSR转MAT 389 { 390 checkNULL(in); 391 MAT *out = initializeMAT(in->row, in->col, in->ptr[in->row]); 392 checkNULL(out); 393 394 memset(out->data, 0, sizeof(format) * in->row * in->col); 395 for (int i = 0; i < in->row; i++) // i 遍历行 396 { 397 for (int j = in->ptr[i]; j < in->ptr[i + 1]; j++) // j 遍历列 398 out->data[i * in->col + in->index[j]] = in->data[j]; 399 } 400 return out; 401 } 402 403 ELL * MATToELL(const MAT *in)// MAT转ELL 404 { 405 checkNULL(in); 406 407 int i, j, maxElement; 408 for (i = j = maxElement = 0; i < in->row * in->col; i++) // i 遍历 in->data,j 记录该行非零元素数,maxElement 记录一行非零元素最大值 409 { 410 if (in->data[i] != 0) // 找到非零元 411 j++; 412 if ((i + 1) % in->col == 0) // 行末,更新 maxElement 413 { 414 maxElement = MAX(j, maxElement); 415 j = 0; // 开始下一行之前清空 j 416 } 417 } 418 format* temp_data=(format *)malloc(sizeof(format) * in->row * maxElement); // 临时数组,将列数压缩到 maxElement 419 checkNULL(temp_data); 420 int* temp_index = (int *)malloc(sizeof(int) * in->row * maxElement); 421 checkNULL(temp_index); 422 memset(temp_data, 0, sizeof(format) * in->row * maxElement); 423 memset(temp_index, 0, sizeof(int) * in->row * maxElement); 424 for (i = j = 0; i < in->row * in->col; i++) // i 遍历 in->data,j 记录该行非零元素数,把 in 中每行的元素往左边推 425 { 426 if (in->data[i] != 0) // 找到非零元 427 { 428 temp_data[i / in->col * maxElement + j] = in->data[i]; // 存放元素 429 temp_index[i / in->col * maxElement + j] = i % in->col; // 记录所在的列号 430 j++; 431 } 432 if ((i + 1) % in->col == 0) // 行末,将剩余位置的下标记作 -1,即无效元素 433 { 434 for (j += i / in->col * in->col; j < maxElement * (i / in->col + 1); j++) // 使得 j 指向本行最后一个非零元素之后的元素,再开始填充 435 temp_index[j] = -1; 436 j = 0; // 开始下一行之前清空 j 437 } 438 } 439 ELL *out = initializeELL(maxElement, in->row, in->col); // 最终输出,如果不转置的话不要这部分 440 checkNULL(out); 441 for (i = 0; i < out->row * out->col; i++) // 将 temp_data 和 temp_index 转置以提高缓存利用 442 { 443 out->data[i] = temp_data[i % out->col * out->row + i / out->col]; 444 out->index[i] = temp_index[i % out->col * out->row + i / out->col]; 445 } 446 free(temp_data); 447 free(temp_index); 448 return out; 449 } 450 451 MAT * ELLToMAT(const ELL *in) // ELL转MAT 452 { 453 checkNULL(in); 454 MAT *out = initializeMAT(in->col, in->colOrigin); 455 checkNULL(out); 456 457 for (int i = 0; i < in->row * in->col; i++) // i 遍历 out->data 458 { 459 if (in->index[i] < 0) // 注意跳过无效元素 460 continue; 461 out->data[i % in->col * in->colOrigin + in->index[i]] = in->data[i]; 462 } 463 COUNT_MAT(out); 464 return out; 465 } 466 467 COO * MATToCOO(const MAT *in) // MAT转COO 468 { 469 checkNULL(in); 470 COO *out = initializeCOO(in->row, in->col, in->count); 471 472 for (int i=0, j = 0; i < in->row * in->col; i++) 473 { 474 #ifdef INT 475 if (in->data[i] != 0) 476 #else 477 if (fabs(in->data[i]) > EPSILON) 478 #endif 479 { 480 out->data[j] = in->data[i]; 481 out->rowIndex[j] = i / in->col; 482 out->colIndex[j] = i % in->col; 483 j++; 484 } 485 } 486 return out; 487 } 488 489 MAT * COOToMAT(const COO *in) // COO转MAT 490 { 491 checkNULL(in); 492 MAT *out = initializeMAT(in->row, in->col, in->count); 493 checkNULL(out); 494 495 for (int i = 0; i < in->row * in->col; i++) 496 out->data[i] = 0; 497 for (int i = 0; i < in->count; i++) 498 out->data[in->rowIndex[i] * in->col + in->colIndex[i]] = in->data[i]; 499 return out; 500 } 501 502 DIA * MATToDIA(const MAT *in) // MAT转DIA 503 { 504 checkNULL(in); 505 506 int *index = (int *)malloc(sizeof(int)*(in->row + in->col - 1)); 507 for (int diff = in->row - 1; diff > 0; diff--) // 左侧零对角线情况 508 { 509 int flagNonZero = 0; 510 for (int i = 0; i < in->col && i + diff < in->row; i++) // i 沿着对角线方向遍历 in->data,flagNonZero 记录该对角线是否全部为零元 511 { 512 #ifdef INT 513 if (in->data[(i + diff) * in->col + i] != 0) 514 #else 515 if (fabs(in->data[(i + diff) * in->col + i]) > EPSILON) 516 #endif 517 flagNonZero = 1; 518 } 519 index[in->row - 1 - diff] = flagNonZero; // 标记该对角线上有非零元 520 } 521 for (int diff = in->col - 1; diff >= 0; diff--) // 右侧零对角线情况 522 { 523 int flagNonZero = 0; 524 for (int j = 0; j < in->row && j + diff < in->col; j++) 525 { 526 #ifdef INT 527 if (in->data[j * in->col + j + diff] != 0) 528 #else 529 if (fabs(in->data[j * in->col + j + diff]) > EPSILON) 530 #endif 531 flagNonZero = 1; 532 } 533 index[in->row - 1 + diff] = flagNonZero; // 标记该对角线上有非零元 534 } 535 int *prefixSumIndex = (int *)malloc(sizeof(int)*(in->row + in->col - 1)); 536 prefixSumIndex[0] = index[0]; 537 for (int i = 1; i < in->row + in->col - 1; i++) // 闭前缀和,prefixSumIndex[i] 表示原矩阵第 0 ~ i 条对角线中共有多少条非零对角线(含) 538 prefixSumIndex[i] = prefixSumIndex[i-1] + index[i]; // index[in->row + in->col -2] 表示原矩阵非零对角线条数,等于 DIA 矩阵列数 539 DIA *out = initializeDIA(in->row, prefixSumIndex[in->row + in->col - 2], in->col); 540 checkNULL(out); 541 542 memset(out->data, 0, sizeof(int)*out->row * out->col); 543 for (int i = 0; i < in->row + in->col - 1; i++) 544 out->index[i] = index[i]; // index 搬进 out 545 for (int i = 0; i < in->row; i++) // i,j 遍历原矩阵,将元素搬进 out 546 { 547 for (int j = 0; j < in->col; j++) 548 { 549 int temp = j - i + in->row - 1; 550 if (index[temp] == 0) 551 continue; 552 out->data[i * out->col + (temp > 0 ? prefixSumIndex[temp - 1] : 0)] = in->data[i * in->col + j]; // 第 row - 1 行第 0 列元素 temp == 0,单独处理 553 } 554 } 555 free(index); 556 free(prefixSumIndex); 557 return out; 558 } 559 560 MAT * DIAToMAT(const DIA *in) // DIA转MAT 561 { 562 checkNULL(in); 563 MAT *out = initializeMAT(in->row, in->colOrigin); 564 checkNULL(out); 565 566 int * inverseIndex = (int *)malloc(sizeof(int) * in->col); 567 for (int i = 0, j = 0; i < in->row + in->col - 1; i++) // 求一个 index 的逆,即 DIA 中第 i 列对应原矩阵第 inverseIndex[i] 对角线 568 { // 原矩阵对角线编号 (row-1, 0) 为第 0 条,(0, 0) 为第 row - 1 条,(col-1, 0) 为第 row + col - 2 条 569 if (in->index[i] == 1) 570 { 571 inverseIndex[j] = i; 572 j++; 573 } 574 } 575 for (int i = 0; i < in->row; i++) // i 遍历 in->data 行,j 遍历 in->data 列 576 { 577 for (int j = 0; j < in->col; j++) 578 { 579 if (i < in->row - 1 - inverseIndex[j] || i > inverseIndex[in->col - 1] - inverseIndex[j]) // 跳过两边呈三角形的无效元素 580 continue; 581 out->data[i * in->col + inverseIndex[j] - in->row + 1] = in->data[i * in->col + j]; // 利用 inverseIndex 来找钙元素在原距震中的位置 582 } 583 } 584 free(inverseIndex); 585 return out; 586 } 587 588 // 各种形式的矩阵和一个向量的乘法 589 int dotCPU(const MAT *a, const MAT *x, MAT *y) // CPU MAT 乘法 590 { 591 checkNULL(a); checkNULL(x); checkNULL(y); 592 if (a->col != x->row) 593 { 594 printf("dotMATCPU dimension mismatch! "); 595 return 1; 596 } 597 598 y->row = a->row; 599 y->col = x->col; 600 for (int i = 0; i < a->row; i++) 601 { 602 format sum = 0; 603 for (int j = 0; j < a->col; j++) 604 sum += a->data[i * a->col + j] * x->data[j]; 605 y->data[i] = sum; 606 } 607 COUNT_MAT(y); 608 return 0; 609 } 610 611 int dotCPU(const CSR *a, const MAT *x, MAT *y) // CPU CSR 乘法 612 { 613 checkNULL(a); checkNULL(x); checkNULL(y); 614 if (a->col != x->row) 615 { 616 printf("dotCSRCPU dimension mismatch! "); 617 return 1; 618 } 619 620 y->row = a->row; 621 y->col = x->col; 622 for (int i = 0; i < a->row; i++) // i 遍历 ptr,j 遍历行内数据,A 中为 0 的元素不参加乘法 623 { 624 format sum = 0; 625 for (int j = a->ptr[i]; j < a->ptr[i + 1]; j++) 626 sum += a->data[j] * x->data[a->index[j]]; 627 y->data[i] = sum; 628 } 629 COUNT_MAT(y); 630 return 0; 631 } 632 633 int dotCPU(const ELL *a, const MAT *x, MAT *y) // CPU ELL乘法 634 { 635 checkNULL(a); checkNULL(x); checkNULL(y); 636 if (a->colOrigin != x->row) 637 { 638 printf("dotELLCPU dimension mismatch! "); 639 return 1; 640 } 641 642 y->row = a->col; 643 y->col = x->col; 644 for (int i = 0; i<a->col; i++) 645 { 646 format sum = 0; 647 for (int j = 0; j < a->row; j++) 648 { 649 int temp = a->index[j * a->col + i]; 650 if (temp < 0) // 跳过无效元素 651 continue; 652 sum += a->data[j * a->col + i] * x->data[temp]; 653 } 654 y->data[i] = sum; 655 } 656 COUNT_MAT(y); 657 return 0; 658 } 659 660 int dotCPU(const COO *a, const MAT *x, MAT *y)// CPU COO乘法 661 { 662 checkNULL(a); checkNULL(x); checkNULL(y); 663 if (a->col != x->row) 664 { 665 printf("dotCOOCPU null! "); 666 return 1; 667 } 668 669 y->row = a->row; 670 y->col = x->col; 671 for (int i = 0; i<a->count; i++) 672 y->data[a->rowIndex[i]] += a->data[i] * x->data[a->colIndex[i]]; 673 COUNT_MAT(y); 674 return 0; 675 } 676 677 int dotCPU(const DIA *a, const MAT *x, MAT *y)// CPU DIA乘法 678 { 679 checkNULL(a); checkNULL(x); checkNULL(y); 680 if (a->colOrigin != x->row) 681 { 682 printf("dotDIACPU null! "); 683 return 1; 684 } 685 y->row = a->row; 686 y->col = x->col; 687 int * inverseIndex = (int *)malloc(sizeof(int) * a->col); 688 for (int i = 0, j = 0; i < a->row + a->col - 1; i++) 689 { 690 if (a->index[i] == 1) 691 { 692 inverseIndex[j] = i; 693 j++; 694 } 695 } 696 for (int i = 0; i < a->row; i++) 697 { 698 format sum = 0; 699 for (int j = 0; j < a->col; j++) 700 { 701 if (i < a->row - 1 - inverseIndex[j] || i > inverseIndex[a->col - 1] - inverseIndex[j]) 702 continue; 703 sum += a->data[i * a->col + j] * x->data[i + inverseIndex[j] - a->row + 1]; 704 } 705 y->data[i] = sum; 706 } 707 COUNT_MAT(y); 708 free(inverseIndex); 709 return 0; 710 } 711 712 __global__ void dotGPU(const MAT *a, const MAT *x, MAT *y)// GPU MAT乘法 713 { 714 int id = blockIdx.x * blockDim.x + threadIdx.x; 715 if (id < a->row) 716 { 717 format sum = 0; 718 for (int i = 0; i < a->col; i++) 719 sum += a->data[id * a->col + i] * x->data[i]; 720 y->data[id] = sum; 721 } 722 if (id == 0) 723 { 724 y->row = a->row; 725 y->col = x->col; 726 COUNT_MAT(y); 727 } 728 return; 729 } 730 731 __global__ void dotGPU(const CSR *a, const MAT *x, MAT *y)// GPU CSR乘法 732 { 733 int id = blockIdx.x * blockDim.x + threadIdx.x; 734 if (id < a->row) 735 { 736 format sum = 0; 737 for (int j = a->ptr[id]; j < a->ptr[id + 1]; j++) 738 sum += a->data[j] * x->data[a->index[j]]; 739 y->data[id] = sum; 740 } 741 if (id == 0) 742 { 743 y->row = a->row; 744 y->col = x->col; 745 COUNT_MAT(y); 746 } 747 return; 748 } 749 750 __global__ void dotGPU(const ELL *a, const MAT *x, MAT *y)// GPU ELL乘法 751 { 752 int id = blockIdx.x * blockDim.x + threadIdx.x; 753 if (id < a->col) 754 { 755 format sum = 0; 756 for (int j = 0; j < a->row; j++) 757 sum += a->data[id + j * a->col] * (a->index[id + j * a->col] < 0 ? 0 : x->data[a->index[id + j * a->col]]); 758 y->data[id] = sum; 759 } 760 if (id == 0) 761 { 762 y->row = a->col; 763 y->col = x->col; 764 COUNT_MAT(y); 765 } 766 return; 767 } 768 769 __global__ void dotGPU(const COO *a, const MAT *x, MAT *y)// GPU COO乘法 770 { 771 int id = blockIdx.x * blockDim.x + threadIdx.x; 772 if (id < a->count) 773 atomicAdd(&(y->data[a->rowIndex[id]]), a->data[id] * x->data[a->colIndex[id]]); 774 if (id == 0) 775 { 776 y->row = a->row; 777 y->col = x->col; 778 COUNT_MAT(y); 779 } 780 return; 781 } 782 783 int test()// 测试矩阵打印和CPU计算的函数 784 { 785 const int row = 4, col = 5; 786 787 MAT* a = initializeMAT(row, col); 788 a->data[0] = 3, a->data[2] = 1, a->data[4] = 5, a->data[11] = 2; 789 a->data[12] = 4, a->data[13] = 1, a->data[15] = 1, a->data[18] = 1; 790 COUNT_MAT(a); 791 792 MAT* x = initializeMAT(col, 1); 793 for (int i = 0; i < x->row; i++) 794 x->data[i] = i + 1; 795 COUNT_MAT(x); 796 797 MAT* y = initializeMAT(row, 1); 798 COUNT_MAT(y); 799 800 print("MAT a =", a); 801 print("MAT x =", x); 802 dotCPU(a, x, y); 803 print("MAT y = a * x = ", y); 804 805 CSR* b = MATToCSR(a); 806 print("CSR b = a = ", b); 807 memset(y->data, 0, sizeof(format) * y->row * y->col); 808 dotCPU(b, x, y); 809 print("MAT y = b * x = ", y); 810 MAT* c = CSRToMAT(b); 811 print("MAT c = b =", c); 812 freeMatrix(b); 813 814 ELL* d = MATToELL(a); 815 print("ELL d = a =", d); 816 memset(y->data, 0, sizeof(format) * y->row * y->col); 817 dotCPU(d, x, y); 818 print("MAT y = d * x =", y); 819 c = ELLToMAT(d); 820 print("MAT c = d =", c); 821 freeMatrix(d); 822 823 COO* e = MATToCOO(a); 824 print("ELL e = a = ", e); 825 memset(y->data, 0, sizeof(format) * y->row * y->col); 826 dotCPU(e, x, y); 827 print("MAT y = e * x = ", y); 828 c = COOToMAT(e); 829 print("MAT c = e =", c); 830 freeMatrix(e); 831 832 DIA* f = MATToDIA(a); 833 print("DIA f = a = ", f); 834 memset(y->data, 0, sizeof(format) * y->row * y->col); 835 dotCPU(f, x, y); 836 print("MAT y = f * x = ", y); 837 c = DIAToMAT(f); 838 print("MAT c = f =", c); 839 freeMatrix(f); 840 841 freeMatrix(a); 842 freeMatrix(x); 843 freeMatrix(y); 844 freeMatrix(c); 845 return 0; 846 } 847 848 int main() 849 { 850 test(); 851 852 clock_t timeCPU; 853 cudaEvent_t startGPU, stopGPU; 854 float timeGPU; 855 cudaEventCreate(&startGPU); 856 cudaEventCreate(&stopGPU); 857 858 printf(" Start. Matrix dimension: %d × %d", ROW, COL); 859 860 // 初始化 861 timeCPU = clock(); 862 aMAT = initializeMAT(ROW, COL); 863 xMAT = initializeMAT(COL, 1); 864 yMATRef = initializeMAT(ROW, 1); 865 yMATCal = initializeMAT(ROW, 1); 866 867 srand(SEED); 868 #ifdef INT 869 for (int i = 0; i < COL * ROW; i++) 870 aMAT->data[i] = ((float)rand() < RAND_MAX * RATIO) ? (rand() - RAND_MAX / 2) % 32 : 0; 871 COUNT_MAT(aMAT); 872 int count = aMAT->count; 873 for (int i = 0; i < COL; i++) 874 xMAT->data[i] = i % 32; 875 COUNT_MAT(xMAT); 876 #else 877 for (int i = 0; i < COL * ROW; i++) 878 aMAT->data[i] = ((float)rand() < RAND_MAX * RATIO) ? ((float)rand() / RAND_MAX) : 0; 879 aMAT->count = countMAT(aMAT); 880 for (int i = 0; i < COL; i++) 881 xMAT->data[i] = ((float)rand() / RAND_MAX); 882 xMAT->count = countMAT(xMAT); 883 #endif 884 printf(" Initialized. Time: %d ms ", clock() - timeCPU); 885 886 //CPU计算部分 887 //MAT 格式 888 timeCPU = clock(); 889 dotCPU(aMAT, xMAT, yMATRef); 890 timeCPU = clock() - timeCPU; 891 printf(" dotMATCPU time: %8.3f ms ", (float)timeCPU); 892 893 // CSR格式 894 aCSR = MATToCSR(aMAT); 895 timeCPU = clock(); 896 dotCPU(aCSR, xMAT, yMATCal); 897 timeCPU = clock() - timeCPU; 898 printf(" dotCSRCPU time: %8.3f ms ", (float)timeCPU); 899 checkEqual(yMATRef->data, yMATCal->data, ROW); 900 901 // ELL格式 902 aELL = MATToELL(aMAT); 903 timeCPU = clock(); 904 dotCPU(aELL, xMAT, yMATCal); 905 timeCPU = clock() - timeCPU; 906 printf(" dotELLCPU time: %8.3f ms ", (float)timeCPU); 907 checkEqual(yMATRef->data, yMATCal->data, ROW); 908 909 // COO格式 910 aCOO = MATToCOO(aMAT); 911 timeCPU = clock(); 912 dotCPU(aCOO, xMAT, yMATCal); 913 timeCPU = clock() - timeCPU; 914 printf(" dotCOOCPU time: %8.3f ms ", (float)timeCPU); 915 checkEqual(yMATRef->data, yMATCal->data, ROW); 916 917 // GPU计算部分 918 // MAT格式 919 cudaMemset(yMATCal->data, 0, sizeof(format) * yMATCal->row * yMATCal->col); 920 yMATCal->count = 0; 921 cudaEventRecord(startGPU, 0); 922 dotGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (aMAT, xMAT, yMATCal); 923 cudaDeviceSynchronize(); 924 cudaEventRecord(stopGPU, 0); 925 cudaEventSynchronize(stopGPU); 926 cudaEventElapsedTime(&timeGPU, startGPU, stopGPU); 927 printf(" dotMATGPU time: %8.3f ms ", timeGPU); 928 checkEqual(yMATRef->data, yMATCal->data, ROW); 929 freeMatrix(aMAT); 930 931 // CSR格式 932 cudaMemset(yMATCal->data, 0, sizeof(format) * yMATCal->row * yMATCal->col); 933 yMATCal->count = 0; 934 cudaEventRecord(startGPU, 0); 935 dotGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (aCSR, xMAT, yMATCal); 936 cudaDeviceSynchronize(); 937 cudaEventRecord(stopGPU, 0); 938 cudaEventSynchronize(stopGPU); 939 cudaEventElapsedTime(&timeGPU, startGPU, stopGPU); 940 printf(" dotCSRGPU time: %8.3f ms ", timeGPU); 941 checkEqual(yMATRef->data, yMATCal->data, ROW); 942 freeMatrix(aCSR); 943 944 // ELL格式 945 cudaMemset(yMATCal->data, 0, sizeof(format) * yMATCal->row * yMATCal->col); 946 yMATCal->count = 0; 947 cudaEventRecord(startGPU, 0); 948 dotGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (aELL, xMAT, yMATCal); 949 cudaDeviceSynchronize(); 950 cudaEventRecord(stopGPU, 0); 951 cudaEventSynchronize(stopGPU); 952 cudaEventElapsedTime(&timeGPU, startGPU, stopGPU); 953 printf(" dotELLGPU time: %8.3f ms ", timeGPU); 954 checkEqual(yMATRef->data, yMATCal->data, ROW); 955 freeMatrix(aELL); 956 957 // COO格式 958 cudaMemset(yMATCal->data, 0, sizeof(format) * yMATCal->row * yMATCal->col); 959 yMATCal->count = 0; 960 cudaEventRecord(startGPU, 0); 961 dotGPU << <CEIL(count, THREAD_SIZE), THREAD_SIZE >> > (aCOO, xMAT, yMATCal); 962 cudaDeviceSynchronize(); 963 cudaEventRecord(stopGPU, 0); 964 cudaEventSynchronize(stopGPU); 965 cudaEventElapsedTime(&timeGPU, startGPU, stopGPU); 966 printf(" dotCOOGPU time: %8.3f ms ", timeGPU); 967 checkEqual(yMATRef->data, yMATCal->data, ROW); 968 freeMatrix(aCOO); 969 970 // 清理内存 971 freeMatrix(xMAT); 972 freeMatrix(yMATRef); 973 freeMatrix(yMATCal); 974 975 getchar(); 976 return 0; 977 }
● 输出结果
MAT a = MAT, row = 4, col = 5, count = 8 3 0 1 0 5 0 0 0 0 0 0 2 4 1 0 1 0 0 1 0 MAT x = MAT, row = 5, col = 1, count = 5 1 2 3 4 5 MAT y = a * x = MAT, row = 4, col = 1, count = 3 31 0 20 5 CSR b = a = CSR, row = 4, col = 5 3 1 5 2 4 1 1 1 0 2 4 1 2 3 0 3 0 3 3 6 8 MAT y = b * x = MAT, row = 4, col = 1, count = 3 31 0 20 5 MAT c = b = MAT, row = 4, col = 5, count = 8 3 0 1 0 5 0 0 0 0 0 0 2 4 1 0 1 0 0 1 0 ELL d = a = ELL, row = 3, col = 4, colOrigin = 5 3 0 2 1 1 0 4 1 5 0 1 0 0 0 1 0 2 0 2 3 4 -1 3 0 MAT y = d * x = MAT, row = 4, col = 1, count = 3 31 0 20 5 MAT c = d = MAT, row = 4, col = 5, count = 7 3 0 1 0 5 0 0 0 0 0 0 2 4 1 0 0 0 0 1 0 ELL e = a = COO, row = 4, col = 5, count = 8 (0,0,3) (0,2,1) (0,4,5) (2,1,2) (2,2,4) (2,3,1) (3,0,1) (3,3,1) MAT y = e * x MAT, row = 4, col = 1, count = 3 31 0 20 5 MAT c = e = MAT, row = 4, col = 5, count = 8 3 0 1 0 5 0 0 0 0 0 0 2 4 1 0 1 0 0 1 0 Start. Matrix dimension: 8192 × 8192 Initialized. Time: 0.000 ms dotMATCPU time: 304.000 ms dotCSRCPU time: 3.000 ms Check output, passed. dotCELLPU time: 103.000 ms Check output, passed. dotCOOCPU time: 11.000 ms Check output, passed. dotMATGPU time: 5.133 ms Check output, passed. dotCSRGPU time: 2.263 ms Check output, passed. dotELLGPU time: 1.253 ms Check output, passed. dotCOOGPU time: 4.754 ms Check output, passed.
以上是关于稀疏矩阵及其压缩格式的主要内容,如果未能解决你的问题,请参考以下文章