Running MNIST on CUDA, with a real speedup


My earlier CUDA version was, surprisingly, slower than the CPU version.

After several rounds of attempts, I came away with two lessons:

1. CUDA kernels do not need to be written as one huge monolith. Once a kernel grows large, the block and thread counts become awkward to tune (my earlier version never even made use of multiple blocks), and data management gets very complicated. Making the kernels fine-grained does not seem to cost much either (see the sketch below).

2. Training has to be done in batches. The most important effect is that batching massively increases data parallelism (also illustrated in the sketch below).
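
To make both lessons concrete, here is a minimal sketch of the pattern the full listing uses everywhere. The kernel name and launch numbers are made up for illustration (the real equivalent in the listing is k_mul_to): one thread computes one output element, and the grid covers the whole batch, so block and thread counts are derived from the total amount of work rather than baked into the kernel.

    // Sketch only: one thread per (sample, output) element of a batched
    // y = W * x. A batch of size B exposes B times as many independent
    // threads as processing one sample at a time.
    __global__ void k_batched_matvec(const float *w, const float *in, float *out,
                                     size_t out_size, size_t in_size,
                                     size_t batch_size)
    {
        size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
        size_t b = gidx / out_size;   // which sample in the batch
        size_t i = gidx % out_size;   // which output element of that sample
        if(b >= batch_size) return;

        float t = 0;
        for(size_t j = 0;j < in_size;j++) {
            t += w[i * in_size + j] * in[b * in_size + j];
        }
        out[b * out_size + i] = t;
    }

    // The launch configuration follows from the work size, so blocks and
    // threads stay tunable independently of the kernel body:
    //   size_t work = batch_size * out_size;
    //   size_t threads = 256;                       // any multiple of 32
    //   size_t blocks = (work + threads - 1) / threads;
    //   k_batched_matvec<<<blocks, threads>>>(w, in, out,
    //                                         out_size, in_size, batch_size);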

With the code below, the GPU reaches close to a 20x speedup over the CPU, and the code is not complicated.
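
(Build note, assumed rather than taken from the original post: this is a single .cu file, so something like "nvcc -O2 -o mnist mnist.cu" should build it, with the MNIST files under a mnist/ directory as hard-coded in main. Commenting out "#define GPU 1" should yield a CPU-only build that a plain C++ compiler can handle.)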

#include <iostream>
#include <cstdlib>
#include <cassert>
#include <string>
#include <cstring>
#include <fstream>
#include <vector>
#include <memory>
#include <cmath>
#include <ctime>
using namespace std;

#define CPU 0
#define GPU 1

// For a CPU-only build, comment out "#define GPU 1" above; this block then
// stubs out the CUDA qualifiers and maps the float math functions.
#ifndef GPU
#define __host__
#define __device__
#define expf exp
#define logf log
#endif

#ifdef GPU
#include <cuda_runtime.h>
#include <math_functions.h>
void CheckCudaReturnCode(cudaError_t code, const char *fileName, int lineNo)
{
    if(code == cudaSuccess) return;
    cerr << "Cuda call failed at " << fileName << ":" << lineNo 
        << " " << cudaGetErrorString(code) << endl;
    exit(-1);
}
#define CK(x) CheckCudaReturnCode((x), __FILE__, __LINE__)


inline size_t get_thread_cnt(size_t size)
{
    size = min(size, 1024UL);

    // round size up to the next multiple of 32 (the warp size)
    size += 31;
    size &= ~31;
    return size;
}

inline size_t ceil(size_t x, size_t y)
{
    return (x + y - 1) / y;
}
#endif

// http://www.cnblogs.com/yeahgis/archive/2012/07/13/2590485.html
// Gaussian random numbers with mean 0 and variance 1
double gaussrand()
{
    static double V1, V2, S;
    static int phase = 0;
    double X;

    if ( phase == 0 ) {
        do {
            double U1 = (double)rand() / RAND_MAX;
            double U2 = (double)rand() / RAND_MAX;

            V1 = 2 * U1 - 1;
            V2 = 2 * U2 - 1;
            S = V1 * V1 + V2 * V2;
        } while(S >= 1 || S == 0);

        X = V1 * sqrt(-2 * log(S) / S);
    } else
        X = V2 * sqrt(-2 * log(S) / S);

    phase = 1 - phase;

    return X;
}

float gaussrand_f()
{
    return float(gaussrand() * 0.01);
}

template<size_t IS_GPU>
struct Matrix {};

template<>
struct Matrix<CPU>
{
    size_t row, col, size;
    float *data;

    Matrix(size_t _row, size_t _col) :
        row(_row), col(_col), size(_row * _col)
        {
            assert(row > 0 && col > 0);
            data = (float *)malloc(size * sizeof(float));
            assert(data);
            memset(data, 0, row * col * sizeof(float));
        }

    Matrix(size_t _row, size_t _col, float *_data) :
        row(_row), col(_col), size(_row * _col), data(_data)
        {
            assert(row > 0 && col > 0 && data);
        }

    Matrix splice(size_t from, size_t to)
    {
        assert(from < to && to <= row);
        size_t offset = from * col;

        Matrix<CPU> ret(to - from, col, data + offset);
        return ret;
    }

    string shape() const {
        char buf[100];
        sprintf(buf, "%d x %d", (int)row, (int)col);
        return buf;
    }

    void free()
    {
        assert(data);
        ::free(data);
        data = NULL;
    }

    void init(float v)
    {
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                (*this)[i][j] = v;
            }
        }
    }

    void init_gauss()
    {
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                (*this)[i][j] = gaussrand_f();
            }
        }
    }

    float *operator[](size_t i) {
        assert(i < row);
        return data + i * col;
    }

    const float *operator[] (size_t i) const {
        assert(i < row);
        return data + i * col;
    }

    void assert_eq(const Matrix& rhs)
    {
        assert(row == rhs.row);
        assert(col == rhs.col);
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                float x = (*this)[i][j];
                float y = rhs[i][j];
                float d = x - y;
                if(d < 0) d = -d;
                if(d > 1E-7) {
                    cerr << "[" << i << "," << j << "] delta=" << d << " x=" << x << " y=" << y << endl;
                    exit(-1);
                }
            }
        }
    }

    size_t max_idx() const {
        assert(row == 1);
        size_t ret = 0;
        for(int i = 1;i < col;i++) {
            if((*this)[0][i] > (*this)[0][ret]) {
                ret = i;
            }
        }
        return ret;
    }

    float sum() const {
        float ret = 0;
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                ret += (*this)[i][j];
            }
        }
        return ret;
    }

    void mul_to(const Matrix& in, Matrix& out) const {
        // out x in
        // b x in
        // b x out
        assert(row == out.col && col == in.col && in.row == out.row);
        for(int b = 0;b < in.row;b++) {
            for(int i = 0;i < row;i++) {
                out[b][i] = 0;
                for(int j = 0;j < col;j++) {
                    out[b][i] += (*this)[i][j] * in[b][j];
                }
            }
        }
    }

    Matrix t() const {
        Matrix ret(col, row);
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                ret[j][i] = (*this)[i][j];
            }
        }
        return ret;
    }

    void t_and_mul_to(const Matrix& in, Matrix& out) const {
        // out x in
        // b x out
        // b x in
        assert(row == in.col && col == out.col && in.row == out.row);
        for(int b = 0;b < in.row;b++) {
            for(int j = 0;j < col;j++) {
                out[b][j] = 0;
                for(int i = 0;i < row;i++) {
                    out[b][j] += (*this)[i][j] * in[b][i];
                }
            }
        }
    }

    void cast_add_to(Matrix& out) const {
        // 1 x out
        // b x out
        assert(row == 1 && col == out.col);
        for(int b = 0;b < out.row;b++) {
            for(int j = 0;j < col;j++) {
                out[b][j] += (*this)[0][j];
            }
        }
    }

    void grad(float f, const Matrix& delta) {
        // 1 x out
        // b x out
        assert(row == 1 && col == delta.col);
        for(int j = 0;j < col;j++) {
            float sum = 0;
            for(int b = 0;b < delta.row;b++) {
                sum += delta[b][j];
            }
            sum *= f;
            (*this)[0][j] += sum;
        }
    }

    void grad(float f, const Matrix& delta, const Matrix& prev_a) {
        // out x in
        // b x out
        // b x in
        assert(row == delta.col && col == prev_a.col);
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                float sum = 0;
                for(int b = 0;b < delta.row;b++) {
                    sum += delta[b][i] * prev_a[b][j];
                }
                sum *= f;
                (*this)[i][j] += sum;
            }
        }
    }

    Matrix to_cpu() const {
        Matrix ret(row, col);
        memcpy(ret.data, data, size * sizeof(float));
        return ret;
    }

#ifdef GPU
    Matrix<GPU> to_gpu() const;
#endif
};

ostream& operator<<(ostream& out, const Matrix<CPU>& v)
{
    out << "[(" << v.row << " x " << v.col << ") " << endl;
    for(int i = 0;i < v.row;i++) {
        out << "\t";
        for(int j = 0;j < v.col;j++) {
            if(j > 0) out << ",";
            out << v[i][j];
        }
        out << endl;
    }
    out << "]";
    return out;
}

#ifdef GPU
// RAII wrapper around a single device float, used to read back scalar
// results from the reduction kernels.
class SmartGPUMem
{
private:
    float *m_gpu;

public:
    SmartGPUMem()
    {
        CK(cudaMalloc((void **)&m_gpu, sizeof *m_gpu));
        assert(m_gpu);
    }

    float *gpu() { return m_gpu; }
    float cpu()
    {
        float t;
        CK(cudaMemcpy(&t, m_gpu, sizeof t, cudaMemcpyDeviceToHost));
        return t;
    }

    ~SmartGPUMem()
    {
        CK(cudaFree(m_gpu));
    }
};

// Single-threaded reduction kernels; only used for small, infrequent reads.
__global__ void k_sum(const float *data, size_t row, size_t col, float *out) {
    float t = 0;
    for(int i = 0;i < row;i++) {
        for(int j = 0;j < col;j++) {
            t += data[i * col + j];
        }
    }
    *out = t;
}

__global__ void k_max_idx(const float *data, size_t size, float *out) {
    int t = 0;
    for(int i = 1;i < size;i++) {
        if(data[i] > data[t]) {
            t = i;
        }
    }
    *out = t;
}

__global__ void k_mul_to(const float * w, const float *in, float *out, size_t row, size_t col, size_t batch_size)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int b = gidx / row;
    int i = gidx % row;

    if(b >= batch_size) return;

    float t = 0;
    for(int j = 0;j < col;j++) {
        t += w[i * col + j] * in[b * col + j];
    }
    out[b * row + i] = t;
}

__global__ void k_t_and_mul_to(const float *w, const float *in, float *out, size_t row, size_t col, size_t batch_size) 
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int b = gidx / col;
    int j = gidx % col;

    if(b >= batch_size) return;

    float t = 0;
    for(int i = 0;i < row;i++) {
        t += w[i * col + j] * in[b * row + i];
    }
    out[b * col + j] = t;
}

__global__ void k_cast_add_to(const float* bx, float *out, size_t col, size_t batch_size)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int b = gidx / col;
    int j = gidx % col;

    if(b >= batch_size) return;
    out[b * col + j] += bx[j];
}

__global__ void k_grad(float *b, float f, const float *delta, size_t col, size_t batch_size)
{
    size_t j = blockDim.x * blockIdx.x + threadIdx.x;
    if(j >= col) return;

    float sum = 0;
    for(int bi = 0;bi < batch_size;bi++) {
        sum += delta[bi * col + j];
    }
    sum *= f;
    b[j] += sum;
}

__global__ void k_grad(float *w, float f, const float *delta, const float *prev_a, size_t row, size_t col, size_t batch_size)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    size_t i = gidx / col;
    size_t j = gidx % col;
    if(i >= row) return;

    float sum = 0;
    for(int b = 0;b < batch_size;b++) {
        sum += delta[b * row + i] * prev_a[b * col + j];
    }
    sum *= f;
    w[i * col + j] += sum;
}

template<>
struct Matrix<GPU>
{
    size_t row, col, size;
    float *data;

    Matrix(size_t _row, size_t _col) :
        row(_row), col(_col), size(_row * _col)
        {
            assert(row > 0 && col > 0);
            CK(cudaMalloc((void **)&data, size * sizeof(float)));
            assert(data);

            float *tmp = (float*)malloc(size * sizeof(float));
            assert(tmp);
            memset(tmp, 0, size * sizeof(float));
            CK(cudaMemcpy(data, tmp, size * sizeof(float), cudaMemcpyHostToDevice));
            ::free(tmp);
        }

    Matrix(size_t _row, size_t _col, float *_data) :
        row(_row), col(_col), size(_row * _col), data(_data)
        {
            assert(row > 0 && col > 0 && data);
        }

    Matrix splice(size_t from, size_t to)
    {
        assert(from < to && to <= row);
        size_t offset = from * col;

        Matrix ret(to - from, col, data + offset);
        return ret;
    }

    string shape() const {
        char buf[100];
        sprintf(buf, "%d x %d", (int)row, (int)col);
        return buf;
    }

    void free()
    {
        assert(data);
        CK(cudaFree(data));
        data = NULL;
    }

    void init(float v)
    {
        float *tmp = (float*)malloc(size * sizeof(float));
        assert(tmp);
        for(int i = 0;i < size;i++) {
            tmp[i] = v;
        }
        CK(cudaMemcpy(data, tmp, size * sizeof(float), cudaMemcpyHostToDevice));
        ::free(tmp);
    }

    void init_gauss()
    {
        float *tmp = (float*)malloc(size * sizeof(float));
        assert(tmp);
        for(int i = 0;i < size;i++) {
            tmp[i] = gaussrand_f();
        }
        CK(cudaMemcpy(data, tmp, size * sizeof(float), cudaMemcpyHostToDevice));
        ::free(tmp);
    }

    __device__ float *operator[](size_t i) {
        assert(i < row);
        return data + i * col;
    }

    __device__ const float *operator[] (size_t i) const {
        assert(i < row);
        return data + i * col;
    }

    size_t max_idx() const {
        assert(row == 1);
        SmartGPUMem mem;
        k_max_idx<<<1,1>>>(data, col, mem.gpu());
        return mem.cpu();
    }

    float sum() const {
        SmartGPUMem mem;
        k_sum<<<1,1>>>(data, row, col, mem.gpu());
        return mem.cpu();
    }

    void mul_to(const Matrix& in, Matrix& out) const {
        // out x in
        // b x in
        // b x out
        assert(row == out.col && col == in.col && in.row == out.row);
        size_t thread_cnt = get_thread_cnt(in.row * row);
        size_t block_cnt = ceil(in.row * row, thread_cnt);

        k_mul_to<<<block_cnt, thread_cnt>>>(data, in.data, out.data, row, col, in.row);

        /*
        for(int b = 0;b < in.row;b++) {
            for(int i = 0;i < row;i++) {
                out[b][i] = 0;
                for(int j = 0;j < col;j++) {
                    out[b][i] += (*this)[i][j] * in[b][j];
                }
            }
        }
        */
    }


    void t_and_mul_to(const Matrix& in, Matrix& out) const {
        // out x in
        // b x out
        // b x in
        assert(row == in.col && col == out.col && in.row == out.row);
        size_t thread_cnt = get_thread_cnt(in.row * col);
        size_t block_cnt = ceil(in.row * col, thread_cnt);

        k_t_and_mul_to<<<block_cnt, thread_cnt>>>(data, in.data, out.data, row, col, in.row);
        /*
        for(int b = 0;b < in.row;b++) {
            for(int j = 0;j < col;j++) {
                out[b][j] = 0;
                for(int i = 0;i < row;i++) {
                    out[b][j] += (*this)[i][j] * in[b][i];
                }
            }
        }
        */
    }

    void cast_add_to(Matrix& out) const {
        // 1 x out
        // b x out
        assert(row == 1 && col == out.col);
        size_t thread_cnt = get_thread_cnt(out.row * col);
        size_t block_cnt = ceil(out.row * col, thread_cnt);

        k_cast_add_to<<<block_cnt, thread_cnt>>>(data, out.data, col, out.row);
        /*
        for(int b = 0;b < out.row;b++) {
            for(int j = 0;j < col;j++) {
                out[b][j] += (*this)[0][j];
            }
        }
        */
    }

    void grad(float f, const Matrix& delta) {
        // 1 x out
        // b x out
        assert(row == 1 && col == delta.col);
        size_t thread_cnt = get_thread_cnt(col);
        size_t block_cnt = ceil(col, thread_cnt);

        k_grad<<<block_cnt, thread_cnt>>>(data, f, delta.data, col, delta.row);
        /*
        for(int j = 0;j < col;j++) {
            float sum = 0;
            for(int b = 0;b < delta.row;b++) {
                sum += delta[b][j];
            }
            sum *= f;
            (*this)[0][j] += sum;
        }
        */
    }

    void grad(float f, const Matrix& delta, const Matrix& prev_a) {
        // out x in
        // b x out
        // b x in
        assert(row == delta.col && col == prev_a.col);
        size_t thread_cnt = get_thread_cnt(row * col);
        size_t block_cnt = ceil(row * col, thread_cnt);

        k_grad<<<block_cnt, thread_cnt>>>(data, f, delta.data, prev_a.data, row, col, delta.row);
        /*
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                float sum = 0;
                for(int b = 0;b < delta.row;b++) {
                    sum += delta[b][i] * prev_a[b][j];
                }
                sum *= f;
                (*this)[i][j] += sum;
            }
        }
        */
    }

    Matrix<CPU> to_cpu() const {
        Matrix<CPU> ret(row, col);
        CK(cudaMemcpy(ret.data, data, size * sizeof(float), cudaMemcpyDeviceToHost));
        return ret;
    }
};

Matrix<GPU> Matrix<CPU>::to_gpu() const {
    Matrix<GPU> ret(row, col);
    CK(cudaMemcpy(ret.data, data, size * sizeof(float), cudaMemcpyHostToDevice));
    return ret;
}
#endif

template<size_t IS_GPU>
struct Softmax{};

__host__ __device__ float softmax_calc(const float *x, const float *y, size_t size)
{
    /*
      log( exp(x_j) / \sum exp(x_k) )
    = x_j - log \sum exp(x_k)
    = x_j - (max + log \sum exp(x_k - max))
    */
    float maxX = x[0];
    for(int i = 0;i < size;i++) {
        if(x[i] > maxX) {
            maxX = x[i];
        }
    }

    float xSum = 0;
    for(int i = 0;i < size;i++) {
        xSum += expf(x[i] - maxX);
    }

    float ret = 0;
    for(int i = 0;i < size;i++) {
        ret += y[i] * (x[i] - (maxX + logf(xSum)));
    }

    return -ret;
}

__host__ __device__ void softmax_propagate_delta(const float *x, const float *y, float *z, size_t size, float f)
{
    /*
      - d \sum y_j * log( exp(x_j) / \sum exp(x_k) )
    = - d \sum y_j * x_j - d \sum y_j log (\sum exp(x_k) )
    = - y_i + \sum (y_j * exp(x_i) / \sum exp(x_k))
    = - y_i + exp(x_i) (\sum y_j) / (\sum exp(x_k))
    */

    float maxX = x[0];
    for(int i = 0;i < size;i++) {
        if(x[i] > maxX) {
            maxX = x[i];
        }
    }

    // y - exp(x) sum_of_y / sum_of_exp(x)
    float sumOfY = 0;
    float sumOfX = 0;
    for(int i = 0;i < size;i++) {
        z[i] = expf(x[i] - maxX);
        sumOfY += y[i];
        sumOfX += z[i];
    }

    float yx = sumOfY/sumOfX;
    for(int i = 0;i < size;i++) {
        z[i] = yx * z[i] - y[i];
    }

    for(int i = 0;i < size;i++) {
        z[i] *= f;
    }
}

template<>
struct Softmax<CPU>
{
    // - \sum y_j * log( exp(x_j) / \sum exp(x_k) )
    void calc(const Matrix<CPU> &x, const Matrix<CPU> &y, Matrix<CPU>& out) const
    {
        assert(x.row == y.row && x.col == y.col);
        assert(out.row == 1 && out.col == x.row);
        for(int i = 0;i < x.row;i++) {
            out[0][i] = softmax_calc(x[i], y[i], x.col);
        }
    } 

    void propagate_delta(const Matrix<CPU> &x, const Matrix<CPU> &y, Matrix<CPU>& z) const
    {
        assert(x.row == y.row && x.col == y.col);
        assert(x.row == z.row && x.col == z.col);

        for(int i = 0;i < x.row;i++) {
            softmax_propagate_delta(x[i], y[i], z[i], x.col, 1.0/x.row);
        }
    }
};

#ifdef GPU
__global__ void k_calc(const float *x, const float *y, float *out, size_t row, size_t col)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    if(gidx >= row) return;
    size_t i = gidx;

    out[i] = softmax_calc(x + i * col, y + i * col, col);
}

__global__ void k_propagate_delta(const float *x, const float *y, float *z, size_t row, size_t col, float f)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    if(gidx >= row) return;
    size_t i = gidx;

    softmax_propagate_delta(x + col * i, y + col * i, z + col * i, col, f);
}

template<>
struct Softmax<GPU>
{
    void calc(const Matrix<GPU> &x, const Matrix<GPU> &y, Matrix<GPU>& out) const
    {
        assert(x.row == y.row && x.col == y.col);
        assert(out.row == 1 && out.col == x.row);

        size_t thread_cnt = get_thread_cnt(x.row);
        size_t block_cnt = ceil(x.row, thread_cnt);
        k_calc<<<block_cnt, thread_cnt>>>(x.data, y.data, out.data, x.row, x.col);
        /*
        for(int i = 0;i < x.row;i++) {
            out[0][i] = softmax_calc(x[i], y[i], x.col);
        }
        */
    }

    void propagate_delta(const Matrix<GPU> &x, const Matrix<GPU> &y, Matrix<GPU>& z) const
    {
        assert(x.row == y.row && x.col == y.col);
        assert(x.row == z.row && x.col == z.col);

        size_t thread_cnt = get_thread_cnt(x.row);
        size_t block_cnt = ceil(x.row, thread_cnt);
        k_propagate_delta<<<block_cnt, thread_cnt>>>(x.data, y.data, z.data, x.row, x.col, 1.0/x.row);
        /*
        for(int i = 0;i < x.row;i++) {
            softmax_propagate_delta(x[i], y[i], z[i], x.col, 1.0/x.row);
        }
        */
    }
};
#endif

template<size_t IS_GPU>
struct Relu{};

template<>
struct Relu<CPU>
{
    void forward(const Matrix<CPU>& x, Matrix<CPU> &y) const
    {
        assert(x.row == y.row && x.col == y.col);
        for(int i = 0;i < x.row;i++) {
            for(int j = 0;j < x.col;j++) {
                float t = x[i][j];
                y[i][j] = t > 0 ? t : 0;
            }
        }
    }

    void derive_and_dot_into(const Matrix<CPU>& x, Matrix<CPU> &y) const
    {
        assert(x.row == y.row && x.col == y.col);
        for(int i = 0;i < x.row;i++) {
            for(int j = 0;j < x.col;j++) {
                float t = x[i][j];
                y[i][j] *= t > 0 ? 1 : 0;
            }
        }
    }
};

#ifdef GPU
__global__ void k_forward(const float *x, float *y, size_t row, size_t col)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int i = gidx / col;
    int j = gidx % col;
    if(i >= row) return;

    float t = x[i * col + j];
    y[i * col + j] = t > 0 ? t : 0;
}

__global__ void k_derive_and_dot_into(const float *x, float *y, size_t row, size_t col)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int i = gidx / col;
    int j = gidx % col;
    if(i >= row) return;

    float t = x[i * col + j];
    y[i * col + j] *= t > 0 ? 1 : 0;
}


template<>
struct Relu<GPU>
{
    void forward(const Matrix<GPU>& x, Matrix<GPU> &y) const
    {
        assert(x.row == y.row && x.col == y.col);
        size_t thread_cnt = get_thread_cnt(x.row * x.col);
        size_t block_cnt = ceil(x.row * x.col, thread_cnt);
        k_forward<<<block_cnt, thread_cnt>>>(x.data, y.data, x.row, x.col);
        /*
        for(int i = 0;i < x.row;i++) {
            for(int j = 0;j < x.col;j++) {
                float t = x[i][j];
                y[i][j] = t > 0 ? t : 0;
            }
        }
        */
    }

    void derive_and_dot_into(const Matrix<GPU>& x, Matrix<GPU> &y) const
    {
        assert(x.row == y.row && x.col == y.col);
        size_t thread_cnt = get_thread_cnt(x.row * x.col);
        size_t block_cnt = ceil(x.row * x.col, thread_cnt);
        k_derive_and_dot_into<<<block_cnt, thread_cnt>>>(x.data, y.data, x.row, x.col);
        /*
        for(int i = 0;i < x.row;i++) {
            for(int j = 0;j < x.col;j++) {
                float t = x[i][j];
                y[i][j] *= t > 0 ? 1 : 0;
            }
        }
        */
    }
};
#endif

template<size_t IS_GPU>
struct Layer
{
    size_t in_size, out_size, batch_size;
    Matrix<IS_GPU> w, b;
    Relu<IS_GPU> relu;

    // z = w * in + b
    // a = relu(z)
    Matrix<IS_GPU> z, a, delta;

    Layer(size_t _in_size, size_t _out_size, size_t _batch_size=1) :
        in_size(_in_size), out_size(_out_size), batch_size(_batch_size),
        w(_out_size, _in_size), b(1, _out_size),
        z(_batch_size, _out_size), a(_batch_size, _out_size), 
        delta(_batch_size, _out_size)
    {
        assert(in_size > 0);
        assert(out_size > 0);
        assert(batch_size > 0);
        w.init_gauss();
        b.init_gauss();
    }

    void reset_batch_size(size_t _batch_size) 
    {
        assert(_batch_size > 0);
        if(_batch_size == batch_size) return;
        batch_size = _batch_size;

        z.free(); 
        a.free(); 
        delta.free();
        z = Matrix<IS_GPU>(batch_size, out_size);
        a = Matrix<IS_GPU>(batch_size, out_size);
        delta = Matrix<IS_GPU>(batch_size, out_size);
    }

    void calc(const Matrix<IS_GPU>& in)
    {
        assert(in.row == batch_size);
        assert(in.col == in_size);

        // out x in
        // batch x in
        // batch x out
        w.mul_to(in, z);

        // 1 x out
        // batch x out
        b.cast_add_to(z);

        // batch x out
        // batch x out
        relu.forward(z, a);
    }

    void propagate_delta(Matrix<IS_GPU>& out)
    {
        assert(out.row == batch_size);
        assert(out.col == in_size);

        // out x in
        // batch x out
        // batch x in
        w.t_and_mul_to(delta, out);
    }

    void update_parameters(float alpha, const Matrix<IS_GPU> &prev_a)
    {
        assert(prev_a.row == batch_size);
        assert(prev_a.col == in_size);

        // 1 x out
        // 1 x out
        b.grad(-alpha, delta);

        // out x in
        // batch x out
        // batch x in
        w.grad(-alpha, delta, prev_a);
    }
};

int MsbInt(char buf[])
{
    int base = 1;
    int ret = 0;
    for(int i = 3;i >= 0;i--) {
        ret += (unsigned char)buf[i] * base;
        base *= 256;
    }
    return ret;
}

vector<int> ReadMnistLabels(string fileName)
{
    vector<int> ret;
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    assert(magic == 0x00000801);

    // num of images
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);

    while(nImages--) {
        ret.push_back(ifs.get());
    }

    return ret;
}

Matrix<CPU> ReadMnistData(const string& fileName)
{
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    assert(magic == 0x00000803);

    // num of images
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);

    int row, col;
    ifs.read(buf, 4);
    row = MsbInt(buf);
    ifs.read(buf, 4);
    col = MsbInt(buf);
    assert(row == 28 && col == 28);

    size_t size = row * col;

    Matrix<CPU> ret(nImages, size);

    for(int k = 0;k < nImages;k++) {
        for(int i = 0;i < row * col;i++) {
            ret[k][i] = ifs.get() / 256.0; // normalize to [0, 1)
        }
    }

    return ret;
}

Matrix<CPU> translateLabels(const vector<int>& labels, int k=10) 
{
    Matrix<CPU> ret(labels.size(), k);

    for(int i = 0;i < labels.size();i++) {
        assert(labels[i] >= 0 && labels[i] < k);
        ret[i][labels[i]] = 1;
    }
    return ret;
}

template<size_t IS_GPU>
void reset_model_batch_size(vector<Layer<IS_GPU> >&model, size_t batch_size)
{
    for(int i = 0;i < model.size();i++) {
        model[i].reset_batch_size(batch_size);
    }
}

template<size_t IS_GPU>
vector<bool> forward(
    vector<Layer<IS_GPU> >& model,
    const Matrix<IS_GPU>& in, 
    const Matrix<IS_GPU>& label, 
    const vector<int>& raw_label, 
    const Softmax<IS_GPU>& s,
    float& error)
{
    size_t batch_size = in.row;
    assert(model[0].batch_size == batch_size);
    assert(label.row == batch_size);
    assert(raw_label.size() == batch_size);

    size_t nLayer = model.size();

    for(int j = 0;j < nLayer;j++) {
        Layer<IS_GPU> &layer = model[j];
        if(j == 0) {
            layer.calc(in);
        } else {
            layer.calc(model[j-1].a);
        }
    }

    Layer<IS_GPU> &lastLayer = model[nLayer - 1];
    Matrix<IS_GPU> error2(1, batch_size);
    s.calc(lastLayer.a, label, error2);
    error = error2.sum() / batch_size;
    error2.free();

    Matrix<CPU> cpu_a = lastLayer.a.to_cpu();
    vector<bool> ret(batch_size);
    for(int i = 0;i < batch_size;i++) {
        ret[i] = cpu_a.splice(i,i + 1).max_idx() == raw_label[i];
    }
    cpu_a.free();
    return ret;
}

template<size_t IS_GPU>
void run(Matrix<IS_GPU> &train_data,
        Matrix<IS_GPU> &train_label,
        vector<int> &raw_train_label,

        Matrix<IS_GPU> &test_data,
        Matrix<IS_GPU> &test_label,
        vector<int> &raw_test_label,

        vector<Layer<IS_GPU> >& model,
        size_t batch_size
    )
{
    clock_t start = clock();

    size_t M = train_data.row;
    size_t T = test_data.row;

    Softmax<IS_GPU> s;
    float avg_error = 0;
    float error;
    for(int i = 0;i < M;i += batch_size) {
        size_t this_batch_size = std::min(batch_size, M - i);
        reset_model_batch_size(model, this_batch_size);
        Matrix<IS_GPU> this_batch_train_data = train_data.splice(i, i + this_batch_size);
        Matrix<IS_GPU> this_batch_train_label = train_label.splice(i, i + this_batch_size);
        vector<int> this_batch_raw_train_label(raw_train_label.begin() + i, raw_train_label.begin() + i + this_batch_size);

        /*
        cout << this_batch_train_data.shape() << endl;
        cout << this_batch_train_label.shape() << endl;
        cout << this_batch_raw_train_label.size() << endl;
        */

        forward<IS_GPU>(
            model, 
            this_batch_train_data,
            this_batch_train_label,
            this_batch_raw_train_label,
            s,
            error
            );
        avg_error += error * this_batch_size;

        // backward
        for(int j = model.size() - 1;j >= 0;j--) {
            Layer<IS_GPU> &layer = model[j];
            if(j == model.size() - 1) {
                s.propagate_delta(layer.a, this_batch_train_label, layer.delta);
            } else {
                model[j + 1].propagate_delta(layer.delta);
            }
            layer.relu.derive_and_dot_into(layer.a, layer.delta);
        }

        for(int j = 0;j < model.size();j++) {
            model[j].update_parameters(0.001, j == 0 ? this_batch_train_data : model[j-1].a);
        }
    }
    avg_error /= M;

    clock_t mid = clock();
    cout << "\ttime=" << ((mid-start)*1.0/CLOCKS_PER_SEC) << " error=" << avg_error << endl;

    // evaluate on the training and test sets
    reset_model_batch_size(model, M);
    vector<bool> is_good = forward(model, 
        train_data.splice(0, M), 
        train_label.splice(0, M), 
        vector<int>(raw_train_label.begin(), raw_train_label.begin() + M),
        s, error);
    size_t total = 0, good = 0;
    for(int i = 0;i < M;i++) {
        if(is_good[i]) good++;
        total++;
    }
    cout << "\ttrain_accuracy=" << (good*1.0/total) << " ";

    total = good = 0;
    reset_model_batch_size(model, T);
    is_good = forward(model, 
        test_data.splice(0, T),
        test_label.splice(0, T),
        vector<int>(raw_test_label.begin(), raw_test_label.begin() + T),
        s, error);
    for(int i = 0;i < T;i++) {
        if(is_good[i]) good++;
        total++;
    }
    cout << "test_accuracy=" << (good*1.0/total) << " ";

    clock_t end = clock();
    cout << "total_time=" << (end-start)*1.0/CLOCKS_PER_SEC << endl;
}

void test()
{
    /*
    0 1
    1 2
    2 3
    */
    Matrix<CPU> a(3, 2);
    for(int i = 0;i < a.row;i++) {
        for(int j = 0;j < a.col;j++) {
            a[i][j] = i + j;
        }
    }
    //Matrix splice(size_t from, size_t to) {
    Matrix<CPU> t = a.splice(1,2);
    assert(t.row == 1 && t.col == 2 && t[0][0] == 1);

    //string shape() const {
    assert(a.shape() == "3 x 2");

    //void free() {
    t = Matrix<CPU>(1,1);
    t.free();
    assert(!t.data);

    //void init(float v) {
    t = Matrix<CPU>(2,2);
    t.init(2);
    assert(t[1][1] == 2);

    //void init_gauss() {
    t = Matrix<CPU>(2,2);
    assert(t[1][1] == 0);
    t.init_gauss();
    assert(t[1][1] != 0);

    //float *operator[](size_t i) {
    //const float *operator[] (size_t i) const {

    //void assert_eq(const Matrix& rhs)
    t = Matrix<CPU>(2,2);
    Matrix<CPU> t2(2,2);
    t.assert_eq(t2);

    //size_t max_idx() const {
    assert(a.splice(0,1).max_idx() == 1);

    //float sum() const {
    assert(a.splice(1,2).sum() == 3);

    //void mul_to(const Matrix& in, Matrix& out) const {
    /*
    0 1
    1 2
    2 3

     x

    1 2
    3 4

     =

    2 5 8
    4 11 18
    */
    Matrix<CPU> b(2, 2);
    b[0][0] = 1; b[0][1] = 2;
    b[1][0] = 3; b[1][1] = 4;
    Matrix<CPU> out(2, 3);
    a.mul_to(b, out);
    assert(out[0][0] == 2 && out[0][1] == 5 && out[0][2] == 8);
    assert(out[1][0] == 4 && out[1][1] == 11 && out[1][2] == 18);


    //void t_and_mul_to(const Matrix& in, Matrix& out) const {
    /*
    0 1
    1 2 -> 0 1 2
    2 3    1 2 3

     x

    1 2 3
    4 5 6

     =

    8 14
    17 32
    */
    //Matrix<CPU> ap = a.t();
    b = Matrix<CPU>(2, 3);
    b[0][0] = 1; b[0][1] = 2; b[0][2] = 3;
    b[1][0] = 4; b[1][1] = 5; b[1][2] = 6;
    out = Matrix<CPU>(2, 2);
    a.t_and_mul_to(b, out);
    //cout << out << endl;
    //Matrix<CPU> out2(2, 2);
    //ap.mul_to(b, out2);
    //cout << out2 << endl;
    //out.assert_eq(out2);
    assert(out[0][0] == 8 && out[0][1] == 14);
    assert(out[1][0] == 17 && out[1][1] == 32);

    //void cast_add_to(Matrix& out) const {
    /*
    1 2
    +
    1 2
    3 4
    =
    2 4
    4 6
    */
    b = Matrix<CPU>(1, 2);
    b[0][0] = 1; b[0][1] = 2;
    out = Matrix<CPU>(2,2);
    out[0][0] = 1; out[0][1] = 2;
    out[1][0] = 3; out[1][1] = 4;
    b.cast_add_to(out);
    assert(out[0][0] == 2 && out[0][1] == 4);
    assert(out[1][0] == 4 && out[1][1] == 6);

    //void grad(float f, Matrix& delta) {
    /*
    1 2
    grad with f = -0.5
    1 2
    3 4
    =
    -1 -1
    */
    b = Matrix<CPU>(1, 2);
    b[0][0] = 1; b[0][1] = 2;
    out = Matrix<CPU>(2,2);
    out[0][0] = 1; out[0][1] = 2;
    out[1][0] = 3; out[1][1] = 4;
    b.grad(-0.5, out);
    assert(b[0][0] == -1 && b[0][1] == -1);

    //void grad(float f, const Matrix& delta, const Matrix& prev_a) {
    /*
    0 1
    1 2
    2 3

    grad with f = -0.5

    delta=
    1 2 3 -> 1
    4 5 6    2
             3

    prev_a=
    7 8 -> 7
    9 10   8

    =

    [0][1] = 1 - 0.5 * (1 * 8 + 4 * 10) = 1 - 0.5 * 48 = 1 - 24 = -23
    */
    Matrix<CPU> w(3, 2);
    w[0][0] = 0; w[0][1] = 1;
    w[1][0] = 1; w[1][1] = 2;
    w[2][0] = 2; w[2][1] = 3;

    Matrix<CPU> delta(2, 3);
    delta[0][0] = 1; delta[0][1] = 2; delta[0][2] = 3;
    delta[1][0] = 4; delta[1][1] = 5; delta[1][2] = 6;

    Matrix<CPU> prev_a(2, 2);
    prev_a[0][0] = 7; prev_a[0][1] = 8;
    prev_a[1][0] = 9; prev_a[1][1] = 10;

    w.grad(-0.5, delta, prev_a);
    //cout << w << endl;
    assert(w[0][1] == -23);

    //Matrix to_cpu() const {
    t = a.to_cpu();
    t.assert_eq(a);
}

int main()
{
    cout << "Testing" << endl;
    test();
    //return 0;

    cout << "Loading data" << endl;
    // load the MNIST data
    vector<int> raw_train_label = ReadMnistLabels("mnist/train-labels-idx1-ubyte");
    //assert(raw_train_label.size() == 60000);
    Matrix<CPU> cpu_train_data = ReadMnistData("mnist/train-images-idx3-ubyte");
    //assert(cpu_train_data.row == 60000 && cpu_train_data.col == 28 * 28);
    Matrix<CPU> cpu_train_label = translateLabels(raw_train_label);
    //assert(cpu_train_label.row == 60000 && cpu_train_label.col == 10);
#ifdef GPU
    Matrix<GPU> gpu_train_data(cpu_train_data.to_gpu());
    Matrix<GPU> gpu_train_label(cpu_train_label.to_gpu());
#endif


    vector<int> raw_test_label = ReadMnistLabels("mnist/t10k-labels-idx1-ubyte");
    //assert(raw_test_label.size() == 10000);
    Matrix<CPU> cpu_test_data = ReadMnistData("mnist/t10k-images-idx3-ubyte");
    //assert(cpu_test_data.row == 10000 && cpu_test_data.col == 28 * 28);
    Matrix<CPU> cpu_test_label = translateLabels(raw_test_label);
    //assert(cpu_test_label.row == 10000 && cpu_test_label.col == 10);
#ifdef GPU
    Matrix<GPU> gpu_test_data(cpu_test_data.to_gpu());
    Matrix<GPU> gpu_test_label(cpu_test_label.to_gpu());
#endif

    size_t n_input = cpu_train_data.col;
    size_t n_output = 10;
    size_t n_mid = 100;
    size_t batch_size = 256;

    cout << "Now run" << endl;
    srand(1000);
    vector<Layer<CPU> > cpu_model;
    cpu_model.push_back(Layer<CPU>(n_input, n_mid));
    cpu_model.push_back(Layer<CPU>(n_mid, n_output));
#ifdef GPU
    srand(1000);
    vector<Layer<GPU> > gpu_model;
    gpu_model.push_back(Layer<GPU>(n_input, n_mid));
    gpu_model.push_back(Layer<GPU>(n_mid, n_output));
#endif

    for(int i = 0; i < 5;i++) {
        cout << "cpu-" << (i+1) << endl;
        run<CPU>(cpu_train_data, cpu_train_label, raw_train_label, cpu_test_data, cpu_test_label, raw_test_label, cpu_model, batch_size);
        #ifdef GPU
        cout << "gpu-" << (i+1) << endl;
        run<GPU>(gpu_train_data, gpu_train_label, raw_train_label, gpu_test_data, gpu_test_label, raw_test_label, gpu_model, batch_size);
        #endif
    }

    cout << "Done" << endl;
    return 0;
}

 
