Running MNIST on CUDA, with speedup
My earlier CUDA version of this was, surprisingly, slower than the CPU.
After several rounds of attempts, I came away with two lessons:
1. CUDA kernels do not need to be enormous. With one big monolithic kernel, the block and thread counts become hard to tune (my earlier version never even used multiple blocks), and data management turns into a mess. Splitting the work into fine-grained kernels does not seem to cost much.
2. Training has to be done in mini-batches. The most important effect is that batching greatly increases the available data parallelism (both points are illustrated in the small sketch right after this list).
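To make the two points concrete, here is a minimal, self-contained sketch (the kernel name, the sizes, and the zero-initialized buffers are purely illustrative, not taken from the full program below): each thread handles one (sample, output) pair, so the kernel body stays tiny and the grid grows with the batch size, which is exactly where the extra data parallelism comes from.

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// One thread per (batch sample b, output element i): the kernel stays small,
// and total parallelism is batch_size * rows, so bigger batches mean more
// threads in flight.
__global__ void k_batched_matvec(const float *w, const float *in, float *out,
                                 size_t rows, size_t cols, size_t batch_size)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    size_t b = gidx / rows;   // which sample in the batch
    size_t i = gidx % rows;   // which output element of that sample
    if(b >= batch_size) return;

    float t = 0;
    for(size_t j = 0; j < cols; j++)
        t += w[i * cols + j] * in[b * cols + j];
    out[b * rows + i] = t;
}

int main()
{
    // Illustrative sizes only: 256 samples, 784 inputs, 100 outputs.
    const size_t batch = 256, cols = 784, rows = 100;

    float *w = NULL, *in = NULL, *out = NULL;
    cudaMalloc((void **)&w,   rows  * cols * sizeof(float));
    cudaMalloc((void **)&in,  batch * cols * sizeof(float));
    cudaMalloc((void **)&out, batch * rows * sizeof(float));
    cudaMemset(w,  0, rows  * cols * sizeof(float));
    cudaMemset(in, 0, batch * cols * sizeof(float));

    // Launch configuration: enough threads to cover every (b, i) pair,
    // rounded up to whole blocks.
    size_t total   = batch * rows;
    size_t threads = 256;
    size_t blocks  = (total + threads - 1) / threads;
    k_batched_matvec<<<blocks, threads>>>(w, in, out, rows, cols, batch);
    cudaDeviceSynchronize();

    printf("launched %zu blocks x %zu threads for %zu (sample, output) pairs\n",
           blocks, threads, total);

    cudaFree(w); cudaFree(in); cudaFree(out);
    return 0;
}
```

In the full listing below, the same pattern shows up as k_mul_to, k_t_and_mul_to, k_cast_add_to and the two k_grad kernels, each paired with the get_thread_cnt / ceil helpers that pick the launch configuration.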
With the code below, the GPU version reaches close to a 20x speedup over the CPU version, and the code is not complicated.
1 #include <iostream> 2 #include <cstdlib> 3 #include <cassert> 4 #include <string> 5 #include <cstring> 6 #include <fstream> 7 #include <vector> 8 #include <memory> 9 #include <cstdlib> 10 #include <cmath> 11 #include <ctime> 12 using namespace std; 13 14 #define CPU 0 15 #define GPU 1 16 17 #ifndef GPU 18 #define __host__ 19 #define __device__ 20 #define expf exp 21 #define logf log 22 #endif 23 24 #ifdef GPU 25 #include <cuda_runtime.h> 26 #include <math_functions.h> 27 void CheckCudaReturnCode(cudaError_t code, const char *fileName, int lineNo) 28 { 29 if(code == cudaSuccess) return; 30 cerr << "Cuda call failed at " << fileName << ":" << lineNo 31 << " " << cudaGetErrorString(code) << endl; 32 exit(-1); 33 } 34 #define CK(x) CheckCudaReturnCode((x), __FILE__, __LINE__) 35 36 37 inline size_t get_thread_cnt(size_t size) 38 { 39 size = min(size, 1024UL); 40 41 // ceil(size / 32) 42 size += 31; 43 size &= ~31; 44 return size; 45 } 46 47 inline size_t ceil(size_t x, size_t y) 48 { 49 return (x + y - 1) / y; 50 } 51 #endif 52 53 // http://www.cnblogs.com/yeahgis/archive/2012/07/13/2590485.html 54 // 高斯分布的随机数,均值为0,方差为1 55 double gaussrand() 56 { 57 static double V1, V2, S; 58 static int phase = 0; 59 double X; 60 61 if ( phase == 0 ) { 62 do { 63 double U1 = (double)rand() / RAND_MAX; 64 double U2 = (double)rand() / RAND_MAX; 65 66 V1 = 2 * U1 - 1; 67 V2 = 2 * U2 - 1; 68 S = V1 * V1 + V2 * V2; 69 } while(S >= 1 || S == 0); 70 71 X = V1 * sqrt(-2 * log(S) / S); 72 } else 73 X = V2 * sqrt(-2 * log(S) / S); 74 75 phase = 1 - phase; 76 77 return X; 78 } 79 80 float gaussrand_f() 81 { 82 return float(gaussrand() * 0.01); 83 } 84 85 template<size_t IS_GPU> 86 struct Matrix {}; 87 88 template<> 89 struct Matrix<CPU> 90 { 91 size_t row, col, size; 92 float *data; 93 94 Matrix(size_t _row, size_t _col) : 95 row(_row), col(_col), size(_row * _col) 96 { 97 assert(row > 0 && col > 0); 98 data = (float *)malloc(size * sizeof(float)); 99 assert(data); 100 memset(data, 0, row * col * sizeof(float)); 101 } 102 103 Matrix(size_t _row, size_t _col, float *_data) : 104 row(_row), col(_col), size(_row * _col), data(_data) 105 { 106 assert(row > 0 && col > 0 && data); 107 } 108 109 Matrix splice(size_t from, size_t to) 110 { 111 assert(from < to && to <= row); 112 size_t offset = from * col; 113 114 Matrix<CPU> ret(to - from, col, data + offset); 115 return ret; 116 } 117 118 string shape() const { 119 char buf[100]; 120 sprintf(buf, "%d x %d", (int)row, (int)col); 121 return buf; 122 } 123 124 void free() 125 { 126 assert(data); 127 ::free(data); 128 data = NULL; 129 } 130 131 void init(float v) 132 { 133 for(int i = 0;i < row;i++) { 134 for(int j = 0;j < col;j++) { 135 (*this)[i][j] = v; 136 } 137 } 138 } 139 140 void init_gauss() 141 { 142 for(int i = 0;i < row;i++) { 143 for(int j = 0;j < col;j++) { 144 (*this)[i][j] = gaussrand_f(); 145 } 146 } 147 } 148 149 float *operator[](size_t i) { 150 assert(i < row); 151 return data + i * col; 152 } 153 154 const float *operator[] (size_t i) const { 155 assert(i < row); 156 return data + i * col; 157 } 158 159 void assert_eq(const Matrix& rhs) 160 { 161 assert(row == rhs.row); 162 assert(col == rhs.col); 163 for(int i = 0;i < row;i++) { 164 for(int j = 0;j < col;j++) { 165 float x = (*this)[i][j]; 166 float y = rhs[i][j]; 167 float d = x - y; 168 if(d < 0) d = -d; 169 if(d > 1E-7) { 170 cerr << "[" << i << "," << j << "] delta=" << d << " x=" << x << " y=" << y << endl; 171 exit(-1); 172 } 173 } 174 } 175 } 176 177 size_t max_idx() const { 178 assert(row == 1); 179 
size_t ret = 0; 180 for(int i = 1;i < col;i++) { 181 if((*this)[0][i] > (*this)[0][ret]) { 182 ret = i; 183 } 184 } 185 return ret; 186 } 187 188 float sum() const { 189 float ret = 0; 190 for(int i = 0;i < row;i++) { 191 for(int j = 0;j < col;j++) { 192 ret += (*this)[i][j]; 193 } 194 } 195 return ret; 196 } 197 198 void mul_to(const Matrix& in, Matrix& out) const { 199 // out x in 200 // b x in 201 // b x out 202 assert(row == out.col && col == in.col && in.row == out.row); 203 for(int b = 0;b < in.row;b++) { 204 for(int i = 0;i < row;i++) { 205 out[b][i] = 0; 206 for(int j = 0;j < col;j++) { 207 out[b][i] += (*this)[i][j] * in[b][j]; 208 } 209 } 210 } 211 } 212 213 Matrix t() const { 214 Matrix ret(col, row); 215 for(int i = 0;i < row;i++) { 216 for(int j = 0;j < col;j++) { 217 ret[j][i] = (*this)[i][j]; 218 } 219 } 220 return ret; 221 } 222 223 void t_and_mul_to(const Matrix& in, Matrix& out) const { 224 // out x in 225 // b x out 226 // b x in 227 assert(row == in.col && col == out.col && in.row == out.row); 228 for(int b = 0;b < in.row;b++) { 229 for(int j = 0;j < col;j++) { 230 out[b][j] = 0; 231 for(int i = 0;i < row;i++) { 232 out[b][j] += (*this)[i][j] * in[b][i]; 233 } 234 } 235 } 236 } 237 238 void cast_add_to(Matrix& out) const { 239 // 1 x out 240 // b x out 241 assert(row == 1 && col == out.col); 242 for(int b = 0;b < out.row;b++) { 243 for(int j = 0;j < col;j++) { 244 out[b][j] += (*this)[0][j]; 245 } 246 } 247 } 248 249 void grad(float f, Matrix& delta) { 250 // 1 x out 251 // b x out 252 assert(row == 1 && col == delta.col); 253 for(int j = 0;j < col;j++) { 254 float sum = 0; 255 for(int b = 0;b < delta.row;b++) { 256 sum += delta[b][j]; 257 } 258 sum *= f; 259 (*this)[0][j] += sum; 260 } 261 } 262 263 void grad(float f, const Matrix& delta, const Matrix& prev_a) { 264 // out x in 265 // b x out 266 // b x in 267 assert(row == delta.col && col == prev_a.col); 268 for(int i = 0;i < row;i++) { 269 for(int j = 0;j < col;j++) { 270 float sum = 0; 271 for(int b = 0;b < delta.row;b++) { 272 sum += delta[b][i] * prev_a[b][j]; 273 } 274 sum *= f; 275 (*this)[i][j] += sum; 276 } 277 } 278 } 279 280 Matrix to_cpu() const { 281 Matrix ret(row, col); 282 memcpy(ret.data, data, size * sizeof(float)); 283 return ret; 284 } 285 286 #ifdef GPU 287 Matrix<GPU> to_gpu() const; 288 #endif 289 }; 290 291 ostream& operator<<(ostream& out, const Matrix<CPU>& v) 292 { 293 out << "[(" << v.row << " x " << v.col << ") " << endl; 294 for(int i = 0;i < v.row;i++) { 295 out << "\\t"; 296 for(int j = 0;j < v.col;j++) { 297 if(j > 0) cout << ","; 298 out << v[i][j]; 299 } 300 out << endl; 301 } 302 out << "]"; 303 return out; 304 } 305 306 #ifdef GPU 307 class SmartGPUMem 308 { 309 private: 310 float *m_gpu; 311 312 public: 313 SmartGPUMem() 314 { 315 CK(cudaMalloc((void **)&m_gpu, sizeof *m_gpu)); 316 assert(m_gpu); 317 } 318 319 float *gpu() { return m_gpu; } 320 float cpu() 321 { 322 float t; 323 CK(cudaMemcpy(&t, m_gpu, sizeof t, cudaMemcpyDeviceToHost)); 324 return t; 325 } 326 327 ~SmartGPUMem() 328 { 329 CK(cudaFree(m_gpu)); 330 } 331 }; 332 333 __global__ void k_sum(const float *data, size_t row, size_t col, float *out) { 334 float t = 0; 335 for(int i = 0;i < row;i++) { 336 for(int j = 0;j < col;j++) { 337 t += data[i * col + j]; 338 } 339 } 340 *out = t; 341 } 342 343 __global__ void k_max_idx(const float *data, size_t size, float *out) { 344 int t = 0; 345 for(int i = 1;i < size;i++) { 346 if(data[i] > data[t]) { 347 t = i; 348 } 349 } 350 *out = t; 351 } 352 353 __global__ void 
k_mul_to(const float * w, const float *in, float *out, size_t row, size_t col, size_t batch_size) 354 { 355 size_t gidx = blockDim.x * blockIdx.x + threadIdx.x; 356 int b = gidx / row; 357 int i = gidx % row; 358 359 if(b >= batch_size) return; 360 361 float t = 0; 362 for(int j = 0;j < col;j++) { 363 t += w[i * col + j] * in[b * col + j]; 364 } 365 out[b * row + i] = t; 366 } 367 368 __global__ void k_t_and_mul_to(const float *w, const float *in, float *out, size_t row, size_t col, size_t batch_size) 369 { 370 size_t gidx = blockDim.x * blockIdx.x + threadIdx.x; 371 int b = gidx / col; 372 int j = gidx % col; 373 374 if(b >= batch_size) return; 375 376 float t = 0; 377 for(int i = 0;i < row;i++) { 378 t += w[i * col + j] * in[b * row + i]; 379 } 380 out[b * col + j] = t; 381 } 382 383 __global__ void k_cast_add_to(const float* bx, float *out, size_t col, size_t batch_size) 384 { 385 size_t gidx = blockDim.x * blockIdx.x + threadIdx.x; 386 int b = gidx / col; 387 int j = gidx % col; 388 389 if(b >= batch_size) return; 390 out[b * col + j] += bx[j]; 391 } 392 393 __global__ void k_grad(float *b, float f, const float *delta, size_t col, size_t batch_size) 394 { 395 size_t j = blockDim.x * blockIdx.x + threadIdx.x; 396 if(j >= col) return; 397 398 float sum = 0; 399 for(int b = 0;b < batch_size;b++) { 400 sum += delta[b * col + j]; 401 } 402 sum *= f; 403 b[j] += sum; 404 } 405 406 __global__ void k_grad(float *w, float f, const float *delta, const float *prev_a, size_t row, size_t col, size_t batch_size) 407 { 408 size_t gidx = blockDim.x * blockIdx.x + threadIdx.x; 409 size_t i = gidx / col; 410 size_t j = gidx % col; 411 if(i >= row) return; 412 413 float sum = 0; 414 for(int b = 0;b < batch_size;b++) { 415 sum += delta[b * row + i] * prev_a[b * col + j]; 416 } 417 sum *= f; 418 w[i * col + j] += sum; 419 } 420 421 template<> 422 struct Matrix<GPU> 423 { 424 size_t row, col, size; 425 float *data; 426 427 Matrix(size_t _row, size_t _col) : 428 row(_row), col(_col), size(_row * _col) 429 { 430 assert(row > 0 && col > 0); 431 CK(cudaMalloc((void **)&data, size * sizeof(float))); 432 assert(data); 433 434 float *tmp = (float*)malloc(size * sizeof(float)); 435 assert(tmp); 436 memset(tmp, 0, size * sizeof(float)); 437 CK(cudaMemcpy(data, tmp, size * sizeof(float), cudaMemcpyHostToDevice)); 438 ::free(tmp); 439 } 440 441 Matrix(size_t _row, size_t _col, float *_data) : 442 row(_row), col(_col), size(_row * _col), data(_data) 443 { 444 assert(row > 0 && col > 0 && data); 445 } 446 447 Matrix splice(size_t from, size_t to) 448 { 449 assert(from < to && to <= row); 450 size_t offset = from * col; 451 452 Matrix ret(to - from, col, data + offset); 453 return ret; 454 } 455 456 string shape() const { 457 char buf[100]; 458 sprintf(buf, "%d x %d", (int)row, (int)col); 459 return buf; 460 } 461 462 void free() 463 { 464 assert(data); 465 CK(cudaFree(data)); 466 data = NULL; 467 } 468 469 void init(float v) 470 { 471 float *tmp = (float*)malloc(size * sizeof(float)); 472 assert(tmp); 473 for(int i = 0;i < size;i++) { 474 tmp[i] = v; 475 } 476 CK(cudaMemcpy(data, tmp, size * sizeof(float), cudaMemcpyHostToDevice)); 477 ::free(tmp); 478 } 479 480 void init_gauss() 481 { 482 float *tmp = (float*)malloc(size * sizeof(float)); 483 assert(tmp); 484 for(int i = 0;i < size;i++) { 485 tmp[i] = gaussrand_f(); 486 } 487 CK(cudaMemcpy(data, tmp, size * sizeof(float), cudaMemcpyHostToDevice)); 488 ::free(tmp); 489 } 490 491 __device__ float *operator[](size_t i) { 492 assert(i < row); 493 return data + i * col; 494 
} 495 496 __device__ const float *operator[] (size_t i) const { 497 assert(i < row); 498 return data + i * col; 499 } 500 501 size_t max_idx() const { 502 assert(row == 1); 503 SmartGPUMem mem; 504 k_max_idx<<<1,1>>>(data, col, mem.gpu()); 505 return mem.cpu(); 506 } 507 508 float sum() const { 509 SmartGPUMem mem; 510 k_sum<<<1,1>>>(data, row, col, mem.gpu()); 511 return mem.cpu(); 512 } 513 514 void mul_to(const Matrix& in, Matrix& out) const { 515 // out x in 516 // b x in 517 // b x out 518 assert(row == out.col && col == in.col && in.row == out.row); 519 size_t thread_cnt = get_thread_cnt(in.row * row); 520 size_t block_cnt = ceil(in.row * row, thread_cnt); 521 522 k_mul_to<<<block_cnt, thread_cnt>>>(data, in.data, out.data, row, col, in.row); 523 524 /* 525 for(int b = 0;b < in.row;b++) { 526 for(int i = 0;i < row;i++) { 527 out[b][i] = 0; 528 for(int j = 0;j < col;j++) { 529 out[b][i] += (*this)[i][j] * in[b][j]; 530 } 531 } 532 } 533 */ 534 } 535 536 537 void t_and_mul_to(const Matrix& in, Matrix& out) const { 538 // out x in 539 // b x out 540 // b x in 541 assert(row == in.col && col == out.col && in.row == out.row); 542 size_t thread_cnt = get_thread_cnt(in.row * col); 543 size_t block_cnt = ceil(in.row * col, thread_cnt); 544 545 k_t_and_mul_to<<<block_cnt, thread_cnt>>>(data, in.data, out.data, row, col, in.row); 546 /* 547 for(int b = 0;b < in.row;b++) { 548 for(int j = 0;j < col;j++) { 549 out[b][j] = 0; 550 for(int i = 0;i < row;i++) { 551 out[b][j] += (*this)[i][j] * in[b][i]; 552 } 553 } 554 } 555 */ 556 } 557 558 void cast_add_to(Matrix& out) const { 559 // 1 x out 560 // b x out 561 assert(row == 1 && col == out.col); 562 size_t thread_cnt = get_thread_cnt(out.row * col); 563 size_t block_cnt = ceil(out.row * col, thread_cnt); 564 565 k_cast_add_to<<<block_cnt, thread_cnt>>>(data, out.data, col, out.row); 566 /* 567 for(int b = 0;b < out.row;b++) { 568 for(int j = 0;j < col;j++) { 569 out[b][j] += (*this)[0][j]; 570 } 571 } 572 */ 573 } 574 575 void grad(float f, const Matrix& delta) { 576 // 1 x out 577 // b x out 578 assert(row == 1 && col == delta.col); 579 size_t thread_cnt = get_thread_cnt(col); 580 size_t block_cnt = ceil(col, thread_cnt); 581 582 k_grad<<<block_cnt, thread_cnt>>>(data, f, delta.data, col, delta.row); 583 /* 584 for(int j = 0;j < col;j++) { 585 float sum = 0; 586 for(int b = 0;b < delta.row;b++) { 587 sum += delta[b][j]; 588 } 589 sum *= f; 590 (*this)[0][j] += sum; 591 } 592 */ 593 } 594 595 void grad(float f, const Matrix& delta, const Matrix& prev_a) { 596 // out x in 597 // b x out 598 // b x in 599 assert(row == delta.col && col == prev_a.col); 600 size_t thread_cnt = get_thread_cnt(row * col); 601 size_t block_cnt = ceil(row * col, thread_cnt); 602 603 k_grad<<<block_cnt, thread_cnt>>>(data, f, delta.data, prev_a.data, row, col, delta.row); 604 /* 605 for(int i = 0;i < row;i++) { 606 for(int j = 0;j < col;j++) { 607 float sum = 0; 608 for(int b = 0;b < delta.row;b++) { 609 sum += delta[b][i] * prev_a[b][j]; 610 } 611 sum *= f; 612 (*this)[i][j] += sum; 613 } 614 } 615 */ 616 } 617 618 Matrix<CPU> to_cpu() const { 619 Matrix<CPU> ret(row, col); 620 CK(cudaMemcpy(ret.data, data, size * sizeof(float), cudaMemcpyDeviceToHost)); 621 return ret; 622 } 623 }; 624 625 Matrix<GPU> Matrix<CPU>::to_gpu() const { 626 Matrix<GPU> ret(row, col); 627 CK(cudaMemcpy(ret.data, data, size * sizeof(float), cudaMemcpyHostToDevice)); 628 return ret; 629 } 630 #endif 631 632 template<size_t IS_GPU> 633 struct Softmax{}; 634 635 __host__ __device__ float 
softmax_calc(const float *x, const float *y, size_t size) 636 { 637 /* 638 log( exp(x_j) / \\sum exp(x_k) ) 639 = x_j - log \\sum exp(x_k) 640 = x_j - (max + log \\sum exp(x_k - max)) 641 */ 642 float maxX = x[0]; 643 for(int i = 0;i < size;i++) { 644 if(x[i] > maxX) { 645 maxX = x[i]; 646 } 647 } 648 649 float xSum = 0; 650 for(int i = 0;i < size;i++) { 651 xSum += expf(x[i] - maxX); 652 } 653 654 float ret = 0; 655 for(int i = 0;i < size;i++) { 656 ret += y[i] * (x[i] - (maxX + logf(xSum))); 657 } 658 659 return -ret; 660 } 661 662 __host__ __device__ void softmax_propagate_delta(const float *x, const float *y, float *z, size_t size, float f) 663 { 664 /* 665 - d \\sum y_j * log( exp(x_j) / \\sum exp(x_k) ) 666 = - d \\sum y_j * x_j - d \\sum y_j log (\\sum exp(x_k) ) 667 = - y_i + \\sum (y_j * exp(x_i) / \\sum exp(x_k)) 668 = - y_i + exp(x_i) (\\sum y_j) / (\\sum exp(x_k)) 669 */ 670 671 float maxX = x[0]; 672 for(int i = 0;i < size;i++) { 673 if(x[i] > maxX) { 674 maxX = x[i]; 675 } 676 } 677 678 // y - exp(x) sum_of_y / sum_of_exp(x) 679 float sumOfY = 0; 680 float sumOfX = 0; 681 for(int i = 0;i < size;i++) { 682 z[i] = expf(x[i] - maxX); 683 sumOfY += y[i]; 684 sumOfX += z[i]; 685 } 686 687 float yx = sumOfY/sumOfX; 688 for(int i = 0;i < size;i++) { 689 z[i] = yx * z[i] - y[i]; 690 } 691 692 for(int i = 0;i < size;i++) { 693 z[i] *= f; 694 } 695 } 696 697 template<> 698 struct Softmax<CPU> 699 { 700 // - \\sum y_j * log( exp(x_j) / \\sum exp(x_k) ) 701 void calc(const Matrix<CPU> &x, const Matrix<CPU> &y, Matrix<CPU>& out) const 702 { 703 assert(x.row == y.row && x.col == y.col); 704 assert(out.row == 1 && out.col == x.row); 705 for(int i = 0;i < x.row;i++) { 706 out[0][i] = softmax_calc(x[i], y[i], x.col); 707 } 708 } 709 710 void propagate_delta(const Matrix<CPU> &x, const Matrix<CPU> &y, Matrix<CPU>& z) const 711 { 712 assert(x.row == y.row && x.col == y.col); 713 assert(x.row == z.row && x.col == z.col); 714 715 for(int i = 0;i < x.row;i++) { 716 softmax_propagate_delta(x[i], y[i], z[i], x.col, 1.0/x.row); 717 } 718 } 719 }; 720 721 #ifdef GPU 722 __global__ void k_calc(const float *x, const float *y, float *out, size_t row, size_t col) 723 { 724 size_t gidx = blockDim.x * blockIdx.x + threadIdx.x; 725 if(gidx >= row) return; 726 size_t i = gidx; 727 728 out[i] = softmax_calc(x + i * col, y + i * col, col); 729 } 730 731 __global__ void k_propagate_delta(const float *x, const float *y, float *z, size_t row, size_t col, float f) 732 { 733 size_t gidx = blockDim.x * blockIdx.x + threadIdx.x; 734 if(gidx >= row) return; 735 size_t i = gidx; 736 737 softmax_propagate_delta(x + col * i, y + col * i, z + col * i, col, f); 738 } 739 740 template<> 741 struct Softmax<GPU> 742 { 743 void calc(const Matrix<GPU> &x, const Matrix<GPU> &y, Matrix<GPU>& out) const 744 { 745 assert(x.row == y.row && x.col == y.col); 746 assert(out.row == 1 && out.col == x.row); 747 748 size_t thread_cnt = get_thread_cnt(x.row); 749 size_t block_cnt = ceil(x.row, thread_cnt); 750 k_calc<<<block_cnt, thread_cnt>>>(x.data, y.data, out.data, x.row, x.col); 751 /* 752 for(int i = 0;i < x.row;i++) { 753 out[0][i] = softmax_calc(x[i], y[i], x.col); 754 } 755 */ 756 } 757 758 void propagate_delta(const Matrix<GPU> &x, const Matrix<GPU> &y, Matrix<GPU>& z) const 759 { 760 assert(x.row == y.row && x.col == y.col); 761 assert(x.row == z.row && x.col == z.col); 762 763 size_t thread_cnt = get_thread_cnt(x.row); 764 size_t block_cnt = ceil(x.row, thread_cnt); 765 k_propagate_delta<<<block_cnt, thread_cnt>>>(x.data, y.data, 
z.data, x.row, x.col, 1.0/x.row); 766 /* 767 for(int i = 0;i < x.row;i++) { 768 softmax_propagate_delta(x[i], y[i], z[i], x.col, 1.0/x.row); 769 } 770 */ 771 } 772 }; 773 #endif 774 775 template<size_t IS_GPU> 776 struct Relu{}; 777 778 template<> 779 struct Relu<CPU> 780 { 781 void forward(const Matrix<CPU>& x, Matrix<CPU> &y) const 782 { 783 assert(x.row == y.row && x.col == y.col); 784 for(int i = 0;i < x.row;i++) { 785 for(int j = 0;j < x.col;j++) { 786 float t = x[i][j]; 787 y[i][j] = t > 0 ? t : 0; 788 } 789 } 790 } 791 792 void derive_and_dot_into(const Matrix<CPU>& x, Matrix<CPU> &y) const 793 { 794 assert(x.row == y.row && x.col == y.col); 795 for(int i = 0;i < x.row;i++) { 796 for(int j = 0;j < x.col;j++) { 797 float t = x[i][j]; 798 y[i][j] *= t > 0 ? 1 : 0; 799 } 800 } 801 } 802 }; 803 804 #ifdef GPU 805 __global__ void k_forward(const float *x, float *y, size_t row, size_t col) 806 { 807 size_t gidx = blockDim.x * blockIdx.x + threadIdx.x; 808 int i = gidx / col; 809 int j = gidx % col; 810 if(i >= row) return; 811 812 float t = x[i * col + j]; 813 y[i * col + j] = t > 0 ? t : 0; 814 } 815 816 __global__ void k_derive_and_dot_into(const float *x, float *y, size_t row, size_t col) 817 { 818 size_t gidx = blockDim.x * blockIdx.x + threadIdx.x; 819 int i = gidx / col; 820 int j = gidx % col; 821 if(i >= row) return; 822 823 float t = x[i * col + j]; 824 y[i * col + j] *= t > 0 ? 1 : 0; 825 } 826 827 828 template<> 829 struct Relu<GPU> 830 { 831 void forward(const Matrix<GPU>& x, Matrix<GPU> &y) const 832 { 833 assert(x.row == y.row && x.col == y.col); 834 size_t thread_cnt = get_thread_cnt(x.row * x.col); 835 size_t block_cnt = ceil(x.row * x.col, thread_cnt); 836 k_forward<<<block_cnt, thread_cnt>>>(x.data, y.data, x.row, x.col); 837 /* 838 for(int i = 0;i < x.row;i++) { 839 for(int j = 0;j < x.col;j++) { 840 float t = x[i][j]; 841 y[i][j] = t > 0 ? t : 0; 842 } 843 } 844 */ 845 } 846 847 void derive_and_dot_into(const Matrix<GPU>& x, Matrix<GPU> &y) const 848 { 849 assert(x.row == y.row && x.col == y.col); 850 size_t thread_cnt = get_thread_cnt(x.row * x.col); 851 size_t block_cnt = ceil(x.row * x.col, thread_cnt); 852 k_derive_and_dot_into<<<block_cnt, thread_cnt>>>(x.data, y.data, x.row, x.col); 853 /* 854 for(int i = 0;i < x.row;i++) { 855 for(int j = 0;j < x.col;j++) { 856 float t = x[i][j]; 857 y[i][j] *= t > 0 ? 
1 : 0; 858 } 859 } 860 */ 861 } 862 }; 863 #endif 864 865 template<size_t IS_GPU> 866 struct Layer 867 { 868 size_t in_size, out_size, batch_size; 869 Matrix<IS_GPU> w, b; 870 Relu<IS_GPU> relu; 871 872 // z = w * in + b 873 // a = relu(z) 874 Matrix<IS_GPU> z, a, delta; 875 876 Layer(size_t _in_size, size_t _out_size, size_t _batch_size=1) : 877 in_size(_in_size), out_size(_out_size), batch_size(_batch_size), 878 w(_out_size, _in_size), b(1, _out_size), 879 z(_batch_size, _out_size), a(_batch_size, _out_size), 880 delta(_batch_size, _out_size) 881 { 882 assert(in_size > 0); 883 assert(out_size > 0); 884 assert(batch_size > 0); 885 w.init_gauss(); 886 b.init_gauss(); 887 } 888 889 void reset_batch_size(size_t _batch_size) 890 { 891 assert(_batch_size > 0); 892 if(_batch_size == batch_size) return; 893 batch_size = _batch_size; 894 895 z.free(); 896 a.free(); 897 delta.free(); 898 z = Matrix<IS_GPU>(batch_size, out_size); 899 a = Matrix<IS_GPU>(batch_size, out_size); 900 delta = Matrix<IS_GPU>(batch_size, out_size); 901 } 902 903 void calc(const Matrix<IS_GPU>& in) 904 { 905 assert(in.row == batch_size); 906 assert(in.col == in_size); 907 908 // out x in 909 // batch x in 910 // batch x out 911 w.mul_to(in, z); 912 913 // 1 x out 914 // batch x out 915 b.cast_add_to(z); 916 917 // batch x out 918 // batch x out 919 relu.forward(z, a); 920 } 921 922 void propagate_delta(Matrix<IS_GPU>& out) 923 { 924 assert(out.row == batch_size); 925 assert(out.col == in_size); 926 927 // out x in 928 // batch x out 929 // batch x in 930 w.t_and_mul_to(delta, out); 931 } 932 933 void update_parameters(float alpha, const Matrix<IS_GPU> &prev_a) 934 { 935 assert(prev_a.row == batch_size); 936 assert(prev_a.col == in_size); 937 938 // 1 x out 939 // 1 x out 940 b.grad(-alpha, delta); 941 942 // out x in 943 // batch x out 944 // batch x in 945 w.grad(-alpha, delta, prev_a); 946 } 947 }; 948 949 int MsbInt(char buf[]) 950 { 951 int base = 1; 952 int ret = 0; 953 for(int i = 3;i >= 0;i--) { 954 ret += (unsigned char)buf[i] * base; 955 base *= 256; 956 } 957 return ret; 958 } 959 960 vector<int> ReadMnistLabels(string fileName) 961 { 962 vector<int> ret; 963 ifstream ifs(fileName.c_str(), ios::binary); 964 char buf[1000]; 965 966 // MAGIC 967 ifs.read(buf, 4); 968 int magic = MsbInt(buf); 969 assert(magic == 0x00000801); 970 971 // num of images 972 ifs.read(buf, 4); 973 int nImages = MsbInt(buf); 974 975 while(nImages--) { 976 ret.push_back(ifs.get()); 977 } 978 979 return ret; 980 } 981 982 Matrix<CPU> ReadMnistData(const string& fileName) 983 { 984 ifstream ifs(fileName.c_str(), ios::binary); 985 char buf[1000]; 986 987 // MAGIC 988 ifs.read(buf, 4); 989 int magic = MsbInt(buf); 990 assert(magic == 0x00000803); 991 992 // num of images 993 ifs.read(buf, 4); 994 int nImages = MsbInt(buf); 995 996 int row, col; 997 ifs.read(buf, 4); 998 row = MsbInt(buf); 999 ifs.read(buf, 4); 1000 col = MsbInt(buf); 1001 assert(row == 28 && col == 28); 1002 1003 size_t size = row * col; 1004 1005 Matrix<CPU> ret(nImages, size); 1006 1007 for(int k = 0;k < nImages;k++) { 1008 for(int i = 0;i < row * col;i++) { 1009 ret[k][i] = ifs.get() / 256.0; // 归一化 1010 } 1011 } 1012 1013 return ret; 1014 } 1015 1016 Matrix<CPU> translateLabels(const vector<int>& labels, int k=10) 1017 { 1018 Matrix<CPU> ret(labels.size(), k); 1019 1020 for(int i = 0;i < labels.size();i++) { 1021 assert(labels[i] >= 0 && labels[i] < k); 1022 ret[i][labels[i]] = 1; 1023 } 1024 return ret; 1025 } 1026 1027 template<size_t IS_GPU> 1028 void 
reset_model_batch_size(vector<Layer<IS_GPU> >&model, size_t batch_size) 1029 { 1030 for(int i = 0;i < model.size();i++) { 1031 model[i].reset_batch_size(batch_size); 1032 } 1033 } 1034 1035 template<size_t IS_GPU> 1036 vector<bool> forward( 1037 vector<Layer<IS_GPU> >& model, 1038 const Matrix<IS_GPU>& in, 1039 const Matrix<IS_GPU>& label, 1040 const vector<int>& raw_label, 1041 const Softmax<IS_GPU>& s, 1042 float& error) 1043 { 1044 size_t batch_size = in.row; 1045 assert(model[0].batch_size == batch_size); 1046 assert(label.row == batch_size); 1047 assert(raw_label.size() == batch_size); 1048 1049 size_t nLayer = model.size(); 1050 1051 for(int j = 0;j < nLayer;j++) { 1052 Layer<IS_GPU> &layer = model[j]; 1053 if(j == 0) { 1054 layer.calc(in); 1055 } else { 1056 layer.calc(model[j-1].a); 1057 } 1058 } 1059 1060 Layer<IS_GPU> &lastLayer = model[nLayer - 1]; 1061 Matrix<IS_GPU> error2(1, batch_size); 1062 s.calc(lastLayer.a, label, error2); 1063 error = error2.sum() / batch_size; 1064 error2.free(); 1065 1066 Matrix<CPU> cpu_a = lastLayer.a.to_cpu(); 1067 vector<bool> ret(batch_size); 1068 for(int i = 0;i < batch_size;i++) { 1069 ret[i] = cpu_a.splice(i,i + 1).max_idx() == raw_label[i]; 1070 } 1071 cpu_a.free(); 1072 return ret; 1073 } 1074 1075 template<size_t IS_GPU> 1076 void run(Matrix<IS_GPU> &train_data, 1077 Matrix<IS_GPU> &train_label, 1078 vector<int> &raw_train_label, 1079 1080 Matrix<IS_GPU> &test_data, 1081 Matrix<IS_GPU> &test_label, 1082 vector<int> &raw_test_label, 1083 1084 vector<Layer<IS_GPU> >& model, 1085 size_t batch_size 1086 ) 1087 { 1088 clock_t start = clock(); 1089 1090 size_t M = train_data.row; 1091 size_t T = test_data.row; 1092 1093 Softmax<IS_GPU> s; 1094 float avg_error = 0; 1095 float error; 1096 for(int i = 0;i < M;i += batch_size) { 1097 size_t this_batch_size = std::min(batch_size, M - i); 1098 reset_model_batch_size(model, this_batch_size); 1099 Matrix<IS_GPU> this_batch_train_data = train_data.splice(i, i + this_batch_size); 1100 Matrix<IS_GPU> this_batch_train_label = train_label.splice(i, i + this_batch_size); 1101 vector<int> this_batch_raw_train_label(raw_train_label.begin() + i, raw_train_label.begin() + i + this_batch_size); 1102 1103 /* 1104 cout << this_batch_train_data.shape() << endl; 1105 cout << this_batch_train_label.shape() << endl; 1106 cout << this_batch_raw_train_label.size() << endl; 1107 */ 1108 1109 forward<IS_GPU>( 1110 model, 1111 this_batch_train_data, 1112 this_batch_train_label, 1113 this_batch_raw_train_label, 1114 s, 1115 error 1116 ); 1117 avg_error += error * this_batch_size; 1118 1119 // backward 1120 for(int j = model.size() - 1;j >= 0;j--) { 1121 Layer<IS_GPU> &layer = model[j]; 1122 if(j == model.size() - 1) { 1123 s.propagate_delta(layer.a, this_batch_train_label, layer.delta); 1124 } else { 1125 model[j + 1].propagate_delta(layer.delta); 1126 } 1127 layer.relu.derive_and_dot_into(layer.a, layer.delta); 1128 } 1129 1130 for(int j = 0;j < model.size();j++) { 1131 model[j].update_parameters(0.001, j == 0 ? 
this_batch_train_data : model[j-1].a); 1132 } 1133 } 1134 avg_error /= M; 1135 1136 clock_t mid = clock(); 1137 cout << "\\ttime=" << ((mid-start)*1.0/CLOCKS_PER_SEC) << " error=" << avg_error << endl; 1138 1139 // 测试 1140 reset_model_batch_size(model, M); 1141 vector<bool> is_good = forward(model, 1142 train_data.splice(0, M), 1143 train_label.splice(0, M), 1144 vector<int>(raw_train_label.begin(), raw_train_label.begin() + M), 1145 s, error); 1146 size_t total = 0, good = 0; 1147 for(int i = 0;i < M;i++) { 1148 if(is_good[i]) good++; 1149 total++; 1150 } 1151 cout << "\\ttrain_accuracy=" << (good*1.0/total) << " "; 1152 1153 total = good = 0; 1154 reset_model_batch_size(model, T); 1155 is_good = forward(model, 1156 test_data.splice(0, T), 1157 test_label.splice(0, T), 1158 vector<int>(raw_test_label.begin(), raw_test_label.begin() + T), 1159 s, error); 1160 for(int i = 0;i < T;i++) { 1161 if(is_good[i]) good++; 1162 total++; 1163 } 1164 cout << "test_accuracy=" << (good*1.0/total) << " "; 1165 1166 clock_t end = clock(); 1167 cout << "total_time=" << (end-start)*1.0/CLOCKS_PER_SEC << endl; 1168 } 1169 1170 void test() 1171 { 1172 /* 1173 0 1 1174 1 2 1175 2 3 1176 */ 1177 Matrix<CPU> a(3, 2); 1178 for(int i = 0;i < a.row;i++) { 1179 for(int j = 0;j < a.col;j++) { 1180 a[i][j] = i + j; 1181 } 1182 } 1183 //Matrix splice(size_t from, size_t to) { 1184 Matrix<CPU> t = a.splice(1,2); 1185 assert(t.row == 1 && t.col == 2 && t[0][0] == 1); 1186 1187 //string shape() const { 1188 assert(a.shape() == "3 x 2"); 1189 1190 //void free() { 1191 t = Matrix<CPU>(1,1); 1192 t.free(); 1193 assert(!t.data); 1194 1195 //void init(float v) { 1196 t = Matrix<CPU>(2,2); 1197 t.init(2); 1198 assert(t[1][1] == 2); 1199 1200 //void init_gauss() { 1201 t = Matrix<CPU>(2,2); 1202 assert(t[1][1] == 0); 1203 t.init_gauss(); 1204 assert(t[1][1] != 0); 1205 1206 //float *operator[](size_t i) { 1207 //const float *operator[] (size_t i) const { 1208 1209 //void assert_eq(const Matrix& rhs) 1210 t = Matrix<CPU>(2,2); 1211 Matrix<CPU> t2(2,2); 1212 t.assert_eq(t2); 1213 1214 //size_t max_idx() const { 1215 assert(a.splice(0,1).max_idx() == 1); 1216 1217 //float sum() const { 1218 assert(a.splice(1,2).sum() == 3); 1219 1220 //void mul_to(const Matrix& in, Matrix& out) const { 1221 /* 1222 0 1 1223 1 2 1224 2 3 1225 1226 x 1227 1228 1 2 1229 3 4 1230 1231 = 1232 1233 2 5 8 1234 4 11 18 1235 */ 1236 Matrix<CPU> b(2, 2); 1237 b[0][0] = 1; b[0][1] = 2; 1238 b[1][0] = 3; b[1][1] = 4; 1239 Matrix<CPU> out(2, 3); 1240 a.mul_to(b, out); 1241 assert(out[0][0] == 2 && out[0][1] == 5 && out[0][2] == 8); 1242 assert(out[1][0] == 4 && out[1][1] == 11 && out[1][2] == 18); 1243 1244 1245 //void t_and_mul_to(const Matrix& in, Matrix& out) const { 1246 /* 1247 0 1 1248 1 2 -> 0 1 2 1249 2 3 1 2 3 1250 1251 x 1252 1253 1 2 3 1254 4 5 6 1255 1256 = 1257 1258 8 14 1259 17 32 1260 */ 1261 //Matrix<CPU> ap = a.t(); 1262 b = Matrix<CPU>(2, 3); 1263 b[0][0] = 1; b[0][1] = 2; b[0][2] = 3; 1264 b[1][0] = 4; b[1][1] = 5; b[1][2] = 6; 1265 out = Matrix<CPU>(2, 2); 1266 a.t_and_mul_to(b, out); 1267 //cout << out << endl; 1268 //Matrix<CPU> out2(2, 2); 1269 //ap.mul_to(b, out2); 1270 //cout << out2 << endl; 1271 //out.assert_eq(out2); 1272 assert(out[0][0] == 8 && out[0][1] == 14); 1273 assert(out[1][0] == 17 && out[1][1] == 32); 1274 1275 //void cast_add_to(Matrix& out) const { 1276 /* 1277 1 2 1278 + 1279 1 2 1280 3 4 1281 = 1282 2 4 1283 4 6 1284 */ 1285 b = Matrix<CPU>(1, 2); 1286 b[0][0] = 1; b[0][1] = 2; 1287 out = Matrix<CPU>(2,2); 1288 
out[0][0] = 1; out[0][1] = 2; 1289 out[1][0] = 3; out[1][1] = 4; 1290 b.cast_add_to(out); 1291 assert(out[0][0] == 2 && out[0][1] == 4); 1292 assert(out[1][0] == 4 && out[1][1] == 6); 1293 1294 //void grad(float f, Matrix& delta) { 1295 /* 1296 1 2 1297 grad with f = -0.5 1298 1 2 1299 3 4 1300 = 1301 -1 -1 1302 */ 1303 b = Matrix<CPU>(1, 2); 1304 b[0][0] = 1; b[0][1] = 2; 1305 out = Matrix<CPU>(2,2); 1306 out[0][0] = 1; out[0][1] = 2; 1307 out[1][0] = 3; out[1][1] = 4; 1308 b.grad(-0.5, out); 1309 assert(b[0][0] == -1 && b[0][1] == -1); 1310 1311 //void grad(float f, const Matrix& delta, const Matrix& prev_a) { 1312 /* 1313 0 1 1314 1 2 1315 2 3 1316 1317 grad with f = -0.5 1318 1319 delta= 1320 1 2 3 -> 1 1321 4 5 6 2 1322 3 1323 1324 prev_a= 1325 7 8 -> 7 1326 9 10 8 1327 1328 = 1329 1330 [0][1] = 1 - 0.5 * (1 * 8 + 4 * 10) = 1 - 0.5 * 48 = 1 - 24 = -23 1331 */ 1332 Matrix<CPU> w(3, 2); 1333 w[0][0] = 0; w[0][1] = 1; 1334 w[1][0] = 1; w[1][1] = 2; 1335 w[2][0] = 2; w[2][1] = 3; 1336 1337 Matrix<CPU> delta(2, 3); 1338 delta[0][0] = 1; delta[0][1] = 2; delta[0][2] = 3; 1339 delta[1][0] = 4; delta[1][1] = 5; delta[1][2] = 6; 1340 1341 Matrix<CPU> prev_a(2, 2); 1342 prev_a[0][0] = 7; prev_a[0][1] = 8; 1343 prev_a[1][0] = 9; prev_a[1][1] = 10; 1344 1345 w.grad(-0.5, delta, prev_a); 1346 //cout << w << endl; 1347 assert(w[0][1] == -23); 1348 1349 //Matrix to_cpu() const { 1350 t = a.to_cpu(); 1351 t.assert_eq(a); 1352 } 1353 1354 int main() 1355 { 1356 cout << "Testing" << endl; 1357 test(); 1358 //return 0; 1359 1360 cout << "Loading data" << endl; 1361 // 读取数据 1362 vector<int> raw_train_label = ReadMnistLabels("mnist/train-labels-idx1-ubyte"); 1363 //assert(raw_train_label.size() == 60000); 1364 Matrix<CPU> cpu_train_data = ReadMnistData("mnist/train-images-idx3-ubyte"); 1365 //assert(cpu_train_data.row == 60000 && cpu_train_data.col == 28 * 28); 1366 Matrix<CPU> cpu_train_label = translateLabels(raw_train_label); 1367 //assert(cpu_train_label.row == 60000 && cpu_train_label.col == 10); 1368 #ifdef GPU 1369 Matrix<GPU> gpu_train_data(cpu_train_data.to_gpu()); 1370 Matrix<GPU> gpu_train_label(cpu_train_label.to_gpu()); 1371 #endif 1372 1373 1374 vector<int> raw_test_label = ReadMnistLabels("mnist/t10k-labels-idx1-ubyte"); 1375 //assert(raw_test_label.size() == 10000); 1376 Matrix<CPU> cpu_test_data = ReadMnistData("mnist/t10k-images-idx3-ubyte"); 1377 //assert(cpu_test_data.row == 10000 && cpu_test_data.col == 28 * 28); 1378 Matrix<CPU> cpu_test_label = translateLabels(raw_test_label); 1379 //assert(cpu_test_label.row == 10000 && cpu_test_label.col == 10); 1380 #ifdef GPU 1381 Matrix<GPU> gpu_test_data(cpu_test_data.to_gpu()); 1382 Matrix<GPU> gpu_test_label(cpu_test_label.to_gpu()); 1383 #endif 1384 1385 size_t n_input = cpu_train_data.col; 1386 size_t n_output = 10; 1387 size_t n_mid = 100; 1388 size_t batch_size = 256; 1389 1390 cout << "Now run" << endl; 1391 srand(1000); 1392 vector<Layer<CPU> > cpu_model; 1393 cpu_model.push_back(Layer<CPU>(n_input, n_mid)); 1394 cpu_model.push_back(Layer<CPU>(n_mid, n_output)); 1395 #ifdef GPU 1396 srand(1000); 1397 vector<Layer<GPU> > gpu_model; 1398 gpu_model.push_back(Layer<GPU>(n_input, n_mid)); 1399 gpu_model.push_back(Layer<GPU>(n_mid, n_output)); 1400 #endif 1401 1402 for(int i = 0; i < 5;i++) { 1403 cout << "cpu-" << (i+1) << endl; 1404 run<CPU>(cpu_train_data, cpu_train_label, raw_train_label, cpu_test_data, cpu_test_label, raw_test_label, cpu_model, batch_size); 1405 #ifdef GPU 1406 cout << "gpu-" << (i+1) << endl; 1407 
run<GPU>(gpu_train_data, gpu_train_label, raw_train_label, gpu_test_data, gpu_test_label, raw_test_label, gpu_model, batch_size); 1408 #endif 1409 } 1410 1411 cout << "Done" << endl; 1412 return 0; 1413 }
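For reference: since the whole thing is a single source file, it can be built with something along the lines of `nvcc -O2 -o mnist mnist.cu` (the file name mnist.cu is just an assumption about how you save it; the GPU path is always compiled because GPU is #define'd at the top, and depending on the CUDA version the `#include <math_functions.h>` line may need to be dropped). The program expects the four MNIST idx files under a `mnist/` directory, as hard-coded in main(), and on each of the five passes it prints the training time, the average error, and the train/test accuracy for the CPU model and then for the GPU model.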