A Simple, Direct CUDA Port
I took the MNIST recognition program from the previous post and gave it a simple, direct CUDA port. The result was poor: one epoch went from roughly 5 seconds to about 50 seconds.
That is understandable. I implemented each operation (mul, add, and so on) as its own device function, and because each function knows nothing about the context it runs in, it has to end with a __syncthreads() every single time.
Still, there was something to show for it: this was my first CUDA program, and in the end it basically computes the right thing.
// Correction: the 5 s vs. 50 s comparison was unfair, because the two programs used different network structures. After changing the earlier CPU version to use the same network as the CUDA program, the numbers became 30 vs. 50 -- the CUDA version is still slower.
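To make the __syncthreads() point concrete: in the listing below, Layer::calc calls mul, add and Relu::forward in sequence, and each of those helpers ends with its own block-wide barrier. A fused variant, where every thread owns a disjoint set of output rows, would only need one barrier per layer. This is just a sketch of the idea, not part of the program below; calc_fused is a hypothetical name and it reuses the Matrix/Vector/TIDX/BSIZE helpers from the listing:

```cpp
// Sketch only: a fused version of Layer::calc (z = w*in + b, a = relu(z)).
// Thread i only writes z[i] and a[i], so no barrier is needed between the
// multiply, the bias add and the ReLU -- one __syncthreads() at the end suffices.
template<size_t IN_SIZE, size_t OUT_SIZE>
__device__ inline void calc_fused(Matrix<OUT_SIZE, IN_SIZE> &w, Vector<OUT_SIZE> &b,
                                  Vector<IN_SIZE> &in,
                                  Vector<OUT_SIZE> &z, Vector<OUT_SIZE> &a)
{
    for(int i = TIDX; i < (int)OUT_SIZE; i += BSIZE) {
        float acc = 0;
        for(int j = 0; j < (int)IN_SIZE; j++) {
            acc += w[i][j] * in[j];
        }
        acc += b[i];
        z[i] = acc;
        a[i] = acc >= 0 ? acc : 0;   // ReLU
    }
    __syncthreads();
}
```

The full program follows.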
```cpp
#include <iostream>
#include <cstdlib>
#include <cassert>
#include <string>
#include <cstring>
#include <fstream>
#include <vector>
#include <memory>
#include <cuda_runtime.h>
#include <math_functions.h>
#include <cmath>
#include <ctime>
using namespace std;

void CheckCudaReturnCode(cudaError_t code, const char *fileName, int lineNo)
{
    if(code == cudaSuccess) return;
    cerr << "Cuda call failed at " << fileName << ":" << lineNo
         << " " << cudaGetErrorString(code) << endl;
    exit(-1);
}

#define CK(x) CheckCudaReturnCode((x), __FILE__, __LINE__)

// For simplicity, only one block is used
#define BSIZE (blockDim.x)
#define TIDX (threadIdx.x)

bool InitCUDA()
{
    int count;
    cudaGetDeviceCount(&count);
    if(count == 0) {
        cerr << "There is no cuda device" << endl;
        return false;
    }
    cout << "Total " << count << " cuda devices" << endl;

    int i;
    for(i = 0;i < count;i++) {
        cudaDeviceProp prop;
        if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if(prop.major >= 1) {
                break;
            }
        }
    }

    if(i == count) {
        cerr << "There is no device supporting CUDA 1.x" << endl;
        return false;
    }

    cudaSetDevice(i);
    return true;
}

// http://www.cnblogs.com/yeahgis/archive/2012/07/13/2590485.html
// Gaussian-distributed random number, mean 0, variance 1
double gaussrand()
{
    static double V1, V2, S;
    static int phase = 0;
    double X;

    if ( phase == 0 ) {
        do {
            double U1 = (double)rand() / RAND_MAX;
            double U2 = (double)rand() / RAND_MAX;

            V1 = 2 * U1 - 1;
            V2 = 2 * U2 - 1;
            S = V1 * V1 + V2 * V2;
        } while(S >= 1 || S == 0);

        X = V1 * sqrt(-2 * log(S) / S);
    } else
        X = V2 * sqrt(-2 * log(S) / S);

    phase = 1 - phase;

    return X;
}

// Pad row length up to a multiple of 4 floats
#define ALIGN_FLOAT(x) (((x) + 3) & (~3))

template<size_t ROW, size_t COL>
struct Matrix
{
    const static int row = ROW;
    const static int col = COL;
    float data[ROW][ALIGN_FLOAT(COL)];
    __device__ __host__ inline float* operator[](size_t x)
    {
        assert(x < ROW);
        return data[x];
    }
};

template<size_t SIZE>
struct Vector
{
    const static int size = SIZE;
    float data[SIZE];
    __device__ __host__ inline float &operator[](size_t x)
    {
        assert(x < SIZE);
        return data[x];
    }
};

template<size_t SIZE>
ostream& operator<<(ostream& out, Vector<SIZE>& v)
{
    out << "[(" << v.size << ") ";
    for(int i = 0;i < v.size;i++) {
        if(i > 0) out << ",";
        out << v[i];
    }
    out << "]";
    return out;
}

// Matrix-vector multiplication: out = w * x
template<size_t ROW, size_t COL>
__device__ inline void mul(Matrix<ROW,COL> &w, Vector<COL> &x, Vector<ROW> &out)
{
    /*
    for(int i = 0;i < w.row;i++) {
        out[i] = 0;
        for(int j = 0;j < w.col;j++) {
            out[i] += w[i][j] * x[j];
        }
    }
    */
    for(int i = TIDX;i < w.row;i += BSIZE) {
        out[i] = 0;
        for(int j = 0;j < w.col;j++) {
            out[i] += w[i][j] * x[j];
        }
    }
    __syncthreads();
}

// Element-wise vector product
template<size_t SIZE>
__device__ inline void dot(Vector<SIZE> &x, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i] * y[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i] * y[i];
    }
    __syncthreads();
}

// Transpose w, then multiply by v: out = w^T * v
template<size_t ROW, size_t COL>
__device__ inline void t_and_mul(Matrix<ROW, COL> &w, Vector<ROW> &v, Vector<COL> &out)
{
    /*
    for(int i = 0;i < w.col;i++) {
        out[i] = 0;
        for(int j = 0;j < w.row;j++) {
            out[i] += w[j][i] * v[j];
        }
    }
    */
    for(int i = TIDX;i < w.col;i += BSIZE) {
        out[i] = 0;
        for(int j = 0;j < w.row;j++) {
            out[i] += w[j][i] * v[j];
        }
    }
    __syncthreads();
}

template<size_t SIZE>
__device__ inline void add(Vector<SIZE> &x, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i] + y[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i] + y[i];
    }
    __syncthreads();
}

template<size_t SIZE>
__device__ inline void sub(Vector<SIZE> &x, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i] - y[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i] - y[i];
    }
    __syncthreads();
}

template<size_t SIZE>
__device__ inline void mul(float x, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < y.size;i++) {
        out[i] = x * y[i];
    }
    */
    for(int i = TIDX;i < y.size;i += BSIZE) {
        out[i] = x * y[i];
    }
    __syncthreads();
}

template<size_t SIZE>
__device__ inline void mul(Vector<SIZE> &x, float y, Vector<SIZE> &out)
{
    mul(y, x, out);
}

template<size_t SIZE>
__device__ inline void copy(Vector<SIZE> &x, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i];
    }
    __syncthreads();
}

// Reduce the first `size` entries of the shared buffer into a single sum
__device__ inline float sum_of_shared(int size)
{
    extern __shared__ float shared[];

    __syncthreads();

    if(TIDX == 0) {
        for(int i = 1;i < size;i++) {
            shared[0] += shared[i];
        }
    }
    __syncthreads();

    float ret = shared[0];

    __syncthreads();

    return ret;
}

// Reduce the first `size` entries of the shared buffer into a single maximum
__device__ inline float max_of_shared(int size)
{
    extern __shared__ float shared[];

    __syncthreads();

    if(TIDX == 0) {
        for(int i = 1;i < size;i++) {
            if(shared[0] < shared[i]) {
                shared[0] = shared[i];
            }
        }
    }
    __syncthreads();

    float ret = shared[0];

    __syncthreads();

    return ret;
}


template<size_t SIZE>
__device__ inline float max(Vector<SIZE>& x)
{
    assert(x.size > 0);

    extern __shared__ float shared[];

    if(TIDX < x.size) {
        shared[TIDX] = x[TIDX];
    }

    for(int i = TIDX + BSIZE;i < x.size;i += BSIZE) {
        if(shared[TIDX] < x[i]) {
            shared[TIDX] = x[i];
        }
    }

    return max_of_shared(min(BSIZE, x.size));
}

template<size_t SIZE>
__device__ inline float sum(Vector<SIZE>& x)
{
    assert(x.size > 0);

    extern __shared__ float shared[];

    if(TIDX < x.size) {
        shared[TIDX] = 0;
    }

    for(int i = TIDX;i < x.size;i += BSIZE) {
        shared[TIDX] += x[i];
    }

    return sum_of_shared(min(BSIZE, x.size));
}

// out = x + xs * y
template<size_t SIZE>
__device__ inline void add_with_xs(Vector<SIZE> &x, float xs, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i] + xs * y[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i] + xs * y[i];
    }

    __syncthreads();
}

template<size_t SIZE>
struct Softmax
{
    __device__ static inline float calc(Vector<SIZE>& x, Vector<SIZE>& y)
    {
        // - \sum y_j * log( exp(x_j) / \sum exp(x_k) )
        /*
        log( exp(x_j) / \sum exp(x_k) )
          = x_j - log \sum exp(x_k)
          = x_j - (max + log \sum exp(x_k - max))
        */

        float maxX = max(x);

        /*
        float xSum = 0;
        for(int i = 0;i < x.size;i++) {
            xSum += expf(x[i] - maxX);
        }
        */

        extern __shared__ float shared[];
        if(TIDX < x.size) {
            shared[TIDX] = 0;
        }

        for(int i = TIDX;i < x.size;i += BSIZE) {
            shared[TIDX] += expf(x[i] - maxX);
        }

        float xSum = sum_of_shared(min(BSIZE, x.size));

        /*
        float ret = 0;
        for(int i = 0;i < x.size;i++) {
            ret += y[i] * (x[i] - (maxX + logf(xSum)));
        }
        */
        if(TIDX < x.size) {
            shared[TIDX] = 0;
        }
        for(int i = TIDX;i < x.size;i += BSIZE) {
            shared[TIDX] += y[i] * (x[i] - (maxX + logf(xSum)));
        }

        float ret = sum_of_shared(min(BSIZE, x.size));

        return -ret;
    }


    static inline float calc_host(Vector<SIZE>& x, Vector<SIZE>& y)
    {
        // - \sum y_j * log( exp(x_j) / \sum exp(x_k) )
        /*
        log( exp(x_j) / \sum exp(x_k) )
          = x_j - log \sum exp(x_k)
          = x_j - (max + log \sum exp(x_k - max))
        */

        float maxX = x[0];
        for(int i = 1;i < x.size;i++) {
            if(x[i] > maxX) {
                maxX = x[i];
            }
        }

        float xSum = 0;
        for(int i = 0;i < x.size;i++) {
            xSum += expf(x[i] - maxX);
        }

        float ret = 0;
        for(int i = 0;i < x.size;i++) {
            ret += y[i] * (x[i] - (maxX + logf(xSum)));
        }

        return -ret;
    }

    __device__ static inline void propagate_delta(Vector<SIZE> &x, Vector<SIZE> &y, Vector<SIZE> &out)
    {
        /*
        - d \sum y_j * log( exp(x_j) / \sum exp(x_k) )
        = - d \sum y_j * x_j - d \sum y_j log (\sum exp(x_k) )
        = - y_i + \sum (y_j * exp(x_i) / \sum exp(x_k))
        = - y_i + exp(x_i) (\sum y_j) / (\sum exp(x_k))
        */

        float maxX = max(x);

        // -y + exp(x) sum_of_y / sum_of_exp(x)

        /*
        float sumOfY = 0;
        float sumOfX = 0;

        for(int i = 0;i < x.size;i++) {
            out[i] = expf(x[i] - maxX);
            sumOfY += y[i];
            sumOfX += out[i];
        }
        */

        for(int i = TIDX;i < x.size;i += BSIZE) {
            out[i] = expf(x[i] - maxX);
        }

        float sumOfY = sum(y);
        float sumOfX = sum(out);

        float t = sumOfY/sumOfX;

        mul(t, out, out);
        sub(out, y, out);
    }
};

template<size_t SIZE>
struct Relu
{
    __device__ static inline void forward(Vector<SIZE> &x, Vector<SIZE> &out)
    {
        /*
        for(int i = 0;i < x.size;i++) {
            out[i] = x[i] >= 0 ? x[i] : 0;
        }
        */
        for(int i = TIDX;i < x.size;i += BSIZE) {
            out[i] = x[i] >= 0 ? x[i] : 0;
        }
        __syncthreads();
    }

    __device__ static inline void derive(Vector<SIZE> &x, Vector<SIZE> &out)
    {
        /*
        for(int i = 0;i < x.size;i++) {
            out[i] = x[i] >= 0 ? 1 : 0;
        }
        */
        for(int i = TIDX;i < x.size;i += BSIZE) {
            out[i] = x[i] >= 0 ? 1 : 0;
        }
        __syncthreads();
    }

    __device__ static inline void derive_and_dot_into(Vector<SIZE> &x, Vector<SIZE> &out)
    {
        // out = dot(dx, out)
        /*
        for(int i = 0;i < x.size;i++) {
            out[i] = out[i] * (x[i] >= 0 ? 1 : 0);
        }
        */
        for(int i = TIDX;i < x.size;i += BSIZE) {
            out[i] = out[i] * (x[i] >= 0 ? 1 : 0);
        }
        __syncthreads();
    }
};

// One layer of the NN
// 1. The input does not count as a layer
// 2. A layer's w matrix maps the previous layer's output to this layer, slightly different from Andrew Ng's notation
// 3. Likewise, a layer's b maps the previous layer to this layer, slightly different from Andrew Ng's notation
template <size_t IN_SIZE, size_t OUT_SIZE, typename Activator=Relu<OUT_SIZE> >
struct Layer
{
    typedef Activator ActivatorType;

    // Number of outputs of the previous layer, not counting the bias
    const static int inSize = IN_SIZE;
    // Number of outputs of this layer
    const static int outSize = OUT_SIZE;

    Matrix<OUT_SIZE, IN_SIZE> w;
    Vector<OUT_SIZE> b;
    // Activations saved by the last forward pass
    Vector<OUT_SIZE> a;
    Vector<OUT_SIZE> z;
    // Delta values saved by the last backward pass
    Vector<OUT_SIZE> delta;

    void initWeights()
    {
        for(int i = 0;i < b.size;i++) {
            b[i] = float(gaussrand() * 0.01);
        }
        for(int i = 0;i < w.row;i++) {
            for(int j = 0;j < w.col;j++) {
                w[i][j] = float(gaussrand() * 0.01);
            }
        }
    }

    Layer()
    {
        initWeights();
    }

    __device__ inline void calc(Vector<IN_SIZE> &in)
    {
        // w * in + b
        mul(w, in, z);
        add(z, b, z);
        Activator::forward(z, a);
    }

    __device__ inline void propagate_delta(Vector<IN_SIZE> &out)
    {
        t_and_mul(w, delta, out);
    }

    // alpha is the learning rate
    // prevA is the previous layer's output
    __device__ inline void update_parameters(float alpha, Vector<IN_SIZE> &prevA)
    {
        // b = b - alpha * delta
        add_with_xs(b, -alpha, delta, b);

        /*
        for(int i = 0;i < w.row;i++) {
            for(int j = 0;j < w.col;j++) {
                w[i][j] = w[i][j] - alpha * prevA[j] * delta[i];
            }
        }
        */
        for(int i = TIDX;i < w.row;i += BSIZE) {
            for(int j = 0;j < w.col;j++) {
                w[i][j] = w[i][j] - alpha * prevA[j] * delta[i];
            }
        }
        __syncthreads();
    }
};

// model
#define INPUT_SIZE (28 * 28)
#define OUTPUT_SIZE 10

typedef Layer<INPUT_SIZE, 100, Relu<100> > L1_TYPE;

typedef Layer<100, OUTPUT_SIZE, Relu<OUTPUT_SIZE> > L2_TYPE;

#define LN_TYPE L2_TYPE
#define LN L2

__global__ void forward(
    L1_TYPE &L1, L2_TYPE &L2,
    Vector<INPUT_SIZE> &input, Vector<OUTPUT_SIZE> &output)
{
    L1.calc(input);

    L2.calc(L1.a);

    copy(L2.a, output);
}

template<typename CostFun>
__global__ void backward(
    L1_TYPE &L1, L2_TYPE &L2,
    Vector<INPUT_SIZE> &input, Vector<OUTPUT_SIZE> &y, float alpha)
{
    // Last layer
    CostFun::propagate_delta(LN.a, y, LN.delta);
    LN_TYPE::ActivatorType::derive_and_dot_into(LN.a, LN.delta);

    // Other layers
    L2.propagate_delta(L1.delta);
    L1_TYPE::ActivatorType::derive_and_dot_into(L1.a, L1.delta);

    // Update all w and b
    L1.update_parameters(alpha, input);
    L2.update_parameters(alpha, L1.a);
}

// Read an MSB-first (big-endian) integer
int MsbInt(char buf[], int len=4)
{
    int base = 1;
    int ret = 0;
    for(int i = len - 1;i >= 0;i--) {
        ret += (unsigned char)buf[i] * base;
        base *= 256;
    }
    return ret;
}

vector<int> ReadMnistLabels(string fileName)
{
    vector<int> ret;
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    if(magic != 0x00000801) {
        cerr << "incorrect label file magic number" << endl;
    }

    // num of images
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);

    while(nImages--) {
        ret.push_back(ifs.get());
    }

    return ret;
}

Vector<INPUT_SIZE> * ReadMnistData(string fileName)
{
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    if(magic != 0x00000803) {
        cerr << "incorrect data file magic number" << endl;
    }

    // num of images
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);
    Vector<INPUT_SIZE> * ret = new Vector<INPUT_SIZE>[nImages];

    int row, col;
    ifs.read(buf, 4);
    row = MsbInt(buf);
    ifs.read(buf, 4);
    col = MsbInt(buf);
    if(row * col != INPUT_SIZE) {
        cerr << "incorrect image size" << endl;
    }

    for(int k = 0;k < nImages;k++) {
        Vector<INPUT_SIZE> &image = ret[k];
        for(int i = 0;i < row * col;i++) {
            image[i] = ifs.get() / 256.0; // normalize to [0, 1)
        }
    }

    return ret;
}

// Turn integer labels into one-hot vectors
Vector<OUTPUT_SIZE>* translateLabels(vector<int> &labels, int k=10)
{
    int n = labels.size();
    Vector<OUTPUT_SIZE> * ret = new Vector<OUTPUT_SIZE>[n];

    for(int i = 0;i < labels.size();i++) {
        Vector<OUTPUT_SIZE> &tmp = ret[i];
        memset(&tmp, 0, sizeof(tmp));
        assert(labels[i] >= 0 && labels[i] < k);
        tmp[labels[i]] = 1;
    }
    return ret;
}

int getMaxIdx(Vector<OUTPUT_SIZE>& x)
{
    int maxIdx = 0;
    float maxV = x[0];
    for(int i = 0;i < x.size;i++) {
        if(x[i] > maxV) {
            maxV = x[i];
            maxIdx = i;
        }
    }
    return maxIdx;
}

template <typename T>
void CUDA_ALLOC_AND_COPY(T *&to, T *from, size_t size)
{
    CK(cudaMalloc((void**)&to, size));
    CK(cudaMemcpy(to, from, size, cudaMemcpyHostToDevice));
}

int main()
{
    srand(1000);

    if(!InitCUDA()) {
        return -1;
    }

    L1_TYPE *tmpL1 = new L1_TYPE(), *L1;
    CUDA_ALLOC_AND_COPY(L1, tmpL1, sizeof(*L1));
    delete tmpL1;

    L2_TYPE *tmpL2 = new L2_TYPE(), *L2;
    CUDA_ALLOC_AND_COPY(L2, tmpL2, sizeof(*L2));
    delete tmpL2;

    cout << "Loading data" << endl;

    // Load the data
    vector<int> trainLabels = ReadMnistLabels("mnist/train-labels-idx1-ubyte");
    int nTrain = trainLabels.size();
    Vector<OUTPUT_SIZE>* trainLabels2 = translateLabels(trainLabels);
    Vector<OUTPUT_SIZE>* trainLabels2OnGpu;
    CUDA_ALLOC_AND_COPY(trainLabels2OnGpu, trainLabels2, sizeof(trainLabels2[0]) * nTrain);

    Vector<INPUT_SIZE>* trainData = ReadMnistData("mnist/train-images-idx3-ubyte");
    Vector<INPUT_SIZE>* trainDataOnGpu;
    CUDA_ALLOC_AND_COPY(trainDataOnGpu, trainData, sizeof(trainData[0]) * nTrain);

    vector<int> testLabels = ReadMnistLabels("mnist/t10k-labels-idx1-ubyte");
    int nTest = testLabels.size();

    Vector<INPUT_SIZE>* testData = ReadMnistData("mnist/t10k-images-idx3-ubyte");
    Vector<INPUT_SIZE>* testDataOnGpu;
    CUDA_ALLOC_AND_COPY(testDataOnGpu, testData, sizeof(testData[0]) * nTest);

    int M = nTrain;
    int T = nTest;

    typedef Softmax<OUTPUT_SIZE> CostFun;

    // Start training
    cout << "Start training" << endl;
    clock_t fullStartedAt = clock();

    Vector<OUTPUT_SIZE> *outputOnCuda;
    CK(cudaMalloc((void**)&outputOnCuda, sizeof(*outputOnCuda)));

    Vector<OUTPUT_SIZE> output;

// For simplicity, only a single block is supported
#define N_BLOCK 1
// but multiple threads can be used
#define N_THREAD 1024
#define SHARED_SIZE (sizeof(float) * N_THREAD)

    for(int step = 0;step < 100000;step++) {
        clock_t step_1 = clock();

        float avgError = 0;

        for(int i = 0;i < M;i++) {
            Vector<INPUT_SIZE> &x = trainDataOnGpu[i];
            Vector<OUTPUT_SIZE> &y = trainLabels2OnGpu[i];

            forward<<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, *outputOnCuda);

            CK(cudaMemcpy(&output, outputOnCuda, sizeof(output), cudaMemcpyDeviceToHost));

            Vector<OUTPUT_SIZE> &hostY = trainLabels2[i];
            float error = CostFun::calc_host(output, hostY);
            avgError += error;
            //cout << output << " " << hostY << endl;

            backward<CostFun><<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, y, 0.001);
        }
        avgError /= M;

        clock_t step_2 = clock();

        cout << "step=" << step << " time_cost=" << (step_2 - step_1)*1.0/CLOCKS_PER_SEC << " avgErr=" << avgError << " ";

        // validate
        int nTotal = 0;
        int nGood = 0;
        for(int i = 0;i < M;i++) {
            Vector<INPUT_SIZE> &x = trainDataOnGpu[i];

            forward<<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, *outputOnCuda);

            CK(cudaMemcpy(&output, outputOnCuda, sizeof(output), cudaMemcpyDeviceToHost));

            int maxIdx = getMaxIdx(output);
            if(maxIdx == trainLabels[i]) {
                nGood++;
            }
            nTotal++;
        }
        cout << "train_accuracy " << nGood << "/" << nTotal << "=" << nGood*1.0/nTotal << " ";
        bool doBreak = false;
        if(nGood * 1.0 / nTotal > 0.95) {
            doBreak = true;
        }

        // check
        nTotal = 0;
        nGood = 0;
        for(int i = 0;i < T;i++) {
            Vector<INPUT_SIZE> &x = testDataOnGpu[i];

            forward<<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, *outputOnCuda);

            CK(cudaMemcpy(&output, outputOnCuda, sizeof(output), cudaMemcpyDeviceToHost));

            int maxIdx = getMaxIdx(output);
            if(maxIdx == testLabels[i]) {
                nGood++;
            }
            nTotal++;
        }
        cout << "test_accuracy " << nGood << "/" << nTotal << "=" << nGood*1.0/nTotal << " ";

        clock_t step_3 = clock();
        cout << "total_time=" << (step_3-step_1)*1.0/CLOCKS_PER_SEC << endl;
        if(doBreak) {
            break;
        }
    }

    clock_t fullEndedAt = clock();
    cout << "Total cost " << (fullEndedAt - fullStartedAt)/CLOCKS_PER_SEC << " seconds" << endl;

    return 0;
}
```
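Assuming the source is saved as something like mnist_cuda.cu and the four MNIST files sit under mnist/ as in the listing, a plain `nvcc -O2 -o mnist_cuda mnist_cuda.cu` should build it (on newer CUDA toolkits the explicit `#include <math_functions.h>` may have to be dropped).

The timings in the log come from clock(). If one wanted to measure just the GPU-side work of a single sample, CUDA events are the standard tool; a minimal sketch of what that could look like inside the training loop, where start/stop/ms are illustrative names and everything else comes from the listing above:

```cpp
// Sketch only: timing one sample's forward + backward with CUDA events.
cudaEvent_t start, stop;
CK(cudaEventCreate(&start));
CK(cudaEventCreate(&stop));

CK(cudaEventRecord(start));
forward<<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, *outputOnCuda);
backward<CostFun><<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, y, 0.001f);
CK(cudaEventRecord(stop));
CK(cudaEventSynchronize(stop));              // wait until the recorded work has finished

float ms = 0;
CK(cudaEventElapsedTime(&ms, start, stop));  // elapsed device time in milliseconds
cout << "one sample: " << ms << " ms" << endl;

CK(cudaEventDestroy(start));
CK(cudaEventDestroy(stop));
```

Note that events only capture device-side time; the per-sample kernel launches and the cudaMemcpy of the output back to the host that the training loop performs are on top of that.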