A Simple, Direct CUDA Port


I gave the MNIST recognition program from the previous post a simple, direct CUDA port. The result is poor: one epoch went from roughly 5 seconds to about 50 seconds.

That is understandable: I implemented each operation (mul, add, and so on) as its own device helper, and since each helper knows nothing about its context, it has to call __syncthreads() over and over.
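
To make that point concrete, here is a minimal sketch, not part of the program below, of what fusing one layer's w * x + b and the ReLU into a single helper could look like. It reuses the Matrix/Vector types and the TIDX/BSIZE macros defined in the listing; the helper name fused_layer is hypothetical. The fused loop needs only one barrier at the end instead of one per operation:

// Hypothetical fused helper, for illustration only: computes z = w * x + b and
// a = relu(z) for one layer with a single __syncthreads() at the end.
template<size_t ROW, size_t COL>
__device__ inline void fused_layer(Matrix<ROW, COL> &w, Vector<COL> &x,
                                   Vector<ROW> &b, Vector<ROW> &z, Vector<ROW> &a)
{
    for(int i = TIDX;i < (int)ROW;i += BSIZE) {
        float acc = 0;
        for(int j = 0;j < (int)COL;j++) {
            acc += w[i][j] * x[j];          // mul
        }
        acc += b[i];                        // add
        z[i] = acc;
        a[i] = acc >= 0 ? acc : 0;          // ReLU forward
    }
    __syncthreads();                        // one barrier instead of three
}

Each thread only reads the input vector and writes its own rows of z and a, so nothing has to be synchronized until the fused result is consumed. Whether this alone closes the gap reported below is another question, but it removes most of the per-operation barriers in the forward pass.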

Still, there was some payoff: this was my first CUDA program, and in the end it basically produces correct results.

 

// Correction: the 5-second vs 50-second comparison was wrong, because the two networks did not have the same structure. After changing the earlier network to match the CUDA program's network, the times became 30 vs 50 seconds; the CUDA version is still slower.
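
A side note on how the time is measured: the listing below uses the host-side clock(), which happens to give sensible numbers here only because the cudaMemcpy after every forward launch blocks until the GPU has finished. For measuring the GPU work by itself, a sketch based on CUDA events could look like the following; the event API is standard CUDA runtime (and CK is the error-check macro from the listing), but where exactly the events are recorded is only illustrative:

// Illustrative only: time a span of GPU work with CUDA events instead of clock().
cudaEvent_t evStart, evStop;
CK(cudaEventCreate(&evStart));
CK(cudaEventCreate(&evStop));

CK(cudaEventRecord(evStart, 0));                 // record in the default stream
// ... launch forward / backward kernels here ...
CK(cudaEventRecord(evStop, 0));
CK(cudaEventSynchronize(evStop));                // wait until the GPU reaches evStop

float ms = 0;
CK(cudaEventElapsedTime(&ms, evStart, evStop));  // elapsed GPU time in milliseconds

CK(cudaEventDestroy(evStart));
CK(cudaEventDestroy(evStop));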

 

#include <iostream>
#include <cstdlib>
#include <cassert>
#include <string>
#include <cstring>
#include <fstream>
#include <vector>
#include <memory>
#include <cuda_runtime.h>
#include <math_functions.h>
#include <cmath>
#include <ctime>
using namespace std;

void CheckCudaReturnCode(cudaError_t code, const char *fileName, int lineNo)
{
    if(code == cudaSuccess) return;
    cerr << "Cuda call failed at " << fileName << ":" << lineNo
        << " " << cudaGetErrorString(code) << endl;
    exit(-1);
}

#define CK(x) CheckCudaReturnCode((x), __FILE__, __LINE__)

// For simplicity, only a single block is used
#define BSIZE (blockDim.x)
#define TIDX (threadIdx.x)

bool InitCUDA()
{
    int count;
    cudaGetDeviceCount(&count);
    if(count == 0) {
        cerr << "There is no cuda device" << endl;
        return false;
    }
    cout << "Total " << count << " cuda devices" << endl;

    int i;
    for(i = 0;i < count;i++) {
        cudaDeviceProp prop;
        if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if(prop.major >= 1) {
                break;
            }
        }
    }

    if(i == count) {
        cerr << "There is no device supporting CUDA 1.x" << endl;
        return false;
    }

    cudaSetDevice(i);
    return true;
}

// http://www.cnblogs.com/yeahgis/archive/2012/07/13/2590485.html
// Gaussian-distributed random number with mean 0 and variance 1
double gaussrand()
{
    static double V1, V2, S;
    static int phase = 0;
    double X;

    if ( phase == 0 ) {
        do {
            double U1 = (double)rand() / RAND_MAX;
            double U2 = (double)rand() / RAND_MAX;

            V1 = 2 * U1 - 1;
            V2 = 2 * U2 - 1;
            S = V1 * V1 + V2 * V2;
        } while(S >= 1 || S == 0);

        X = V1 * sqrt(-2 * log(S) / S);
    } else
        X = V2 * sqrt(-2 * log(S) / S);

    phase = 1 - phase;

    return X;
}

#define ALIGN_FLOAT(x) (((x) + 3) & (~3))

template<size_t ROW, size_t COL>
struct Matrix
{
    const static int row = ROW;
    const static int col = COL;
    float data[ROW][ALIGN_FLOAT(COL)];
    __device__ __host__ inline float* operator[](size_t x)
    {
        assert(x < ROW);
        return data[x];
    }
};

template<size_t SIZE>
struct Vector
{
    const static int size = SIZE;
    float data[SIZE];
    __device__ __host__ inline float &operator[](size_t x)
    {
        assert(x < SIZE);
        return data[x];
    }
};

template<size_t SIZE>
ostream& operator<<(ostream& out, Vector<SIZE>& v)
{
    out << "[(" << v.size << ") ";
    for(int i = 0;i < v.size;i++) {
        if(i > 0) out << ",";
        out << v[i];
    }
    out << "]";
    return out;
}

// Matrix-vector multiplication
template<size_t ROW, size_t COL>
__device__ inline void mul(Matrix<ROW,COL> &w, Vector<COL> &x, Vector<ROW> &out)
{
    /*
    for(int i = 0;i < w.row;i++) {
        out[i] = 0;
        for(int j = 0;j < w.col;j++) {
            out[i] += w[i][j] * x[j];
        }
    }
    */
    for(int i = TIDX;i < w.row;i += BSIZE) {
        out[i] = 0;
        for(int j = 0;j < w.col;j++) {
            out[i] += w[i][j] * x[j];
        }
    }
    __syncthreads();
}

// Element-wise product of two vectors
template<size_t SIZE>
__device__ inline void dot(Vector<SIZE> &x, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i] * y[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i] * y[i];
    }
    __syncthreads();
}

// Multiply the transpose of w by v
template<size_t ROW, size_t COL>
__device__ inline void t_and_mul(Matrix<ROW, COL> &w, Vector<ROW> &v, Vector<COL> &out)
{
    /*
    for(int i = 0;i < w.col;i++) {
        out[i] = 0;
        for(int j = 0;j < w.row;j++) {
            out[i] += w[j][i] * v[j];
        }
    }
    */
    for(int i = TIDX;i < w.col;i += BSIZE) {
        out[i] = 0;
        for(int j = 0;j < w.row;j++) {
            out[i] += w[j][i] * v[j];
        }
    }
    __syncthreads();
}

template<size_t SIZE>
__device__ inline void add(Vector<SIZE> &x, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i] + y[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i] + y[i];
    }
    __syncthreads();
}

template<size_t SIZE>
__device__ inline void sub(Vector<SIZE> &x, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i] - y[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i] - y[i];
    }
    __syncthreads();
}

template<size_t SIZE>
__device__ inline void mul(float x, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < y.size;i++) {
        out[i] = x * y[i];
    }
    */
    for(int i = TIDX;i < y.size;i += BSIZE) {
        out[i] = x * y[i];
    }
    __syncthreads();
}

template<size_t SIZE>
__device__ inline void mul(Vector<SIZE> &x, float y, Vector<SIZE> &out)
{
    mul(y, x, out);
}

template<size_t SIZE>
__device__ inline void copy(Vector<SIZE> &x, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i];
    }
    __syncthreads();
}

__device__ inline float sum_of_shared(int size)
{
    extern __shared__ float shared[];

    __syncthreads();

    if(TIDX == 0) {
        for(int i = 1;i < size;i++) {
            shared[0] += shared[i];
        }
    }
    __syncthreads();

    float ret = shared[0];

    __syncthreads();

    return ret;
}

__device__ inline float max_of_shared(int size)
{
    extern __shared__ float shared[];

    __syncthreads();

    if(TIDX == 0) {
        for(int i = 1;i < size;i++) {
            if(shared[0] < shared[i]) {
                shared[0] = shared[i];
            }
        }
    }
    __syncthreads();

    float ret = shared[0];

    __syncthreads();

    return ret;
}


template<size_t SIZE>
__device__ inline float max(Vector<SIZE>& x)
{
    assert(x.size > 0);

    extern __shared__ float shared[];

    if(TIDX < x.size) {
        shared[TIDX] = x[TIDX];
    }

    for(int i = TIDX + BSIZE;i < x.size;i += BSIZE) {
        if(shared[TIDX] < x[i]) {
            shared[TIDX] = x[i];
        }
    }

    return max_of_shared(min(BSIZE, x.size));
}

template<size_t SIZE>
__device__ inline float sum(Vector<SIZE>& x)
{
    assert(x.size > 0);

    extern __shared__ float shared[];

    if(TIDX < x.size) {
        shared[TIDX] = 0;
    }

    for(int i = TIDX;i < x.size;i += BSIZE) {
        shared[TIDX] += x[i];
    }

    return sum_of_shared(min(BSIZE, x.size));
}

template<size_t SIZE>
__device__ inline void add_with_xs(Vector<SIZE> &x, float xs, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i] + xs * y[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i] + xs * y[i];
    }

    __syncthreads();
}

template<size_t SIZE>
struct Softmax
{
    __device__ static inline float calc(Vector<SIZE>& x, Vector<SIZE>& y)
    {
        // - \sum y_j * log( exp(x_j) / \sum exp(x_k) )
        /*
          log( exp(x_j) / \sum exp(x_k) )
        = x_j - log \sum exp(x_k)
        = x_j - (max + log \sum exp(x_k - max))
        */

        float maxX = max(x);

        /*
        float xSum = 0;
        for(int i = 0;i < x.size;i++) {
            xSum += expf(x[i] - maxX);
        }
        */

        extern __shared__ float shared[];
        if(TIDX < x.size) {
            shared[TIDX] = 0;
        }

        for(int i = TIDX;i < x.size;i += BSIZE) {
            shared[TIDX] += expf(x[i] - maxX);
        }

        float xSum = sum_of_shared(min(BSIZE, x.size));

        /*
        float ret = 0;
        for(int i = 0;i < x.size;i++) {
            ret += y[i] * (x[i] - (maxX + logf(xSum)));
        }
        */
        if(TIDX < x.size) {
            shared[TIDX] = 0;
        }
        for(int i = TIDX;i < x.size;i += BSIZE) {
            shared[TIDX] += y[i] * (x[i] - (maxX + logf(xSum)));
        }

        float ret = sum_of_shared(min(BSIZE, x.size));

        return -ret;
    }


    static inline float calc_host(Vector<SIZE>& x, Vector<SIZE>& y)
    {
        // - \sum y_j * log( exp(x_j) / \sum exp(x_k) )
        /*
          log( exp(x_j) / \sum exp(x_k) )
        = x_j - log \sum exp(x_k)
        = x_j - (max + log \sum exp(x_k - max))
        */

        float maxX = x[0];
        for(int i = 1;i < x.size;i++) {
            if(x[i] > maxX) {
                maxX = x[i];
            }
        }

        float xSum = 0;
        for(int i = 0;i < x.size;i++) {
            xSum += expf(x[i] - maxX);
        }

        float ret = 0;
        for(int i = 0;i < x.size;i++) {
            ret += y[i] * (x[i] - (maxX + logf(xSum)));
        }

        return -ret;
    }

    __device__ static inline void propagate_delta(Vector<SIZE> &x, Vector<SIZE> &y, Vector<SIZE> &out)
    {
        /*
          - d \sum y_j * log( exp(x_j) / \sum exp(x_k) )
        = - d \sum y_j * x_j - d \sum y_j log (\sum exp(x_k) )
        = - y_i + \sum (y_j * exp(x_i) / \sum exp(x_k))
        = - y_i + exp(x_i) (\sum y_j) / (\sum exp(x_k))
        */

        float maxX = max(x);

        // -y + exp(x) sum_of_y / sum_of_exp(x)

        /*
        float sumOfY = 0;
        float sumOfX = 0;

        for(int i = 0;i < x.size;i++) {
            out[i] = expf(x[i] - maxX);
            sumOfY += y[i];
            sumOfX += out[i];
        }
        */

        for(int i = TIDX;i < x.size;i += BSIZE) {
            out[i] = expf(x[i] - maxX);
        }

        float sumOfY = sum(y);
        float sumOfX = sum(out);

        float t = sumOfY/sumOfX;

        mul(t, out, out);
        sub(out, y, out);
    }
};

template<size_t SIZE>
struct Relu
{
    __device__ static inline void forward(Vector<SIZE> &x, Vector<SIZE> &out)
    {
        /*
        for(int i = 0;i < x.size;i++) {
            out[i] = x[i] >= 0 ? x[i] : 0;
        }
        */
        for(int i = TIDX;i < x.size;i += BSIZE) {
            out[i] = x[i] >= 0 ? x[i] : 0;
        }
        __syncthreads();
    }

    __device__ static inline void derive(Vector<SIZE> &x, Vector<SIZE> &out)
    {
        /*
        for(int i = 0;i < x.size;i++) {
            out[i] = x[i] >= 0 ? 1 : 0;
        }
        */
        for(int i = TIDX;i < x.size;i += BSIZE) {
            out[i] = x[i] >= 0 ? 1 : 0;
        }
        __syncthreads();
    }

    __device__ static inline void derive_and_dot_into(Vector<SIZE> &x, Vector<SIZE> &out)
    {
        // out = dot(dx, out)
        /*
        for(int i = 0;i < x.size;i++) {
            out[i] = out[i] * (x[i] >= 0 ? 1 : 0);
        }
        */
        for(int i = TIDX;i < x.size;i += BSIZE) {
            out[i] = out[i] * (x[i] >= 0 ? 1 : 0);
        }
        __syncthreads();
    }
};

// One layer of the NN
// 1. The input is not counted as a layer
// 2. A layer's w matrix maps the previous layer to the current layer, which differs a bit from Ng's definition
// 3. A layer's b likewise maps the previous layer to the current layer, which differs a bit from Ng's definition
template <size_t IN_SIZE, size_t OUT_SIZE, typename Activator=Relu<OUT_SIZE> >
struct Layer
{
    typedef Activator ActivatorType;

    // Number of outputs of the previous layer, not counting the bias
    const static int inSize = IN_SIZE;
    // Number of outputs of the current layer
    const static int outSize = OUT_SIZE;

    Matrix<OUT_SIZE, IN_SIZE> w;
    Vector<OUT_SIZE> b;
    // Activations saved from the most recent forward pass
    Vector<OUT_SIZE> a;
    Vector<OUT_SIZE> z;
    // Delta values saved from the most recent backward pass
    Vector<OUT_SIZE> delta;

    void initWeights()
    {
        for(int i = 0;i < b.size;i++) {
            b[i] = float(gaussrand() * 0.01);
        }
        for(int i = 0;i < w.row;i++) {
            for(int j = 0;j < w.col;j++) {
                w[i][j] = float(gaussrand() * 0.01);
            }
        }
    }

    Layer()
    {
        initWeights();
    }

    __device__ inline void calc(Vector<IN_SIZE> &in)
    {
        // w * in + b
        mul(w, in, z);
        add(z, b, z);
        Activator::forward(z, a);
    }

    __device__ inline void propagate_delta(Vector<IN_SIZE> &out)
    {
        t_and_mul(w, delta, out);
    }

    // alpha is the learning rate
    // prevA is the previous layer's output
    __device__ inline void update_parameters(float alpha, Vector <IN_SIZE> &prevA)
    {
        // b = b - alpha * delta
        add_with_xs(b, -alpha, delta, b);

        /*
        for(int i = 0;i < w.row;i++) {
            for(int j = 0;j < w.col;j++) {
                w[i][j] = w[i][j] - alpha * prevA[j] * delta[i];
            }
        }
        */
        for(int i = TIDX;i < w.row;i += BSIZE) {
            for(int j = 0;j < w.col;j++) {
                w[i][j] = w[i][j] - alpha * prevA[j] * delta[i];
            }
        }
        __syncthreads();
    }
};

// model
#define INPUT_SIZE (28 * 28)
#define OUTPUT_SIZE 10

typedef Layer<INPUT_SIZE, 100, Relu<100> > L1_TYPE;

typedef Layer<100, OUTPUT_SIZE, Relu<OUTPUT_SIZE> > L2_TYPE;

#define LN_TYPE L2_TYPE
#define LN L2

__global__ void forward(
    L1_TYPE &L1, L2_TYPE &L2,
    Vector<INPUT_SIZE> &input, Vector<OUTPUT_SIZE> &output)
{
    L1.calc(input);

    L2.calc(L1.a);

    copy(L2.a, output);
}

template<typename CostFun>
__global__ void backward(
    L1_TYPE &L1, L2_TYPE &L2,
    Vector<INPUT_SIZE> &input, Vector<OUTPUT_SIZE> &y, float alpha)
{
    // Last layer
    CostFun::propagate_delta(LN.a, y, LN.delta);
    LN_TYPE::ActivatorType::derive_and_dot_into(LN.a, LN.delta);

    // Remaining layers
    L2.propagate_delta(L1.delta);
    L1_TYPE::ActivatorType::derive_and_dot_into(L1.a, L1.delta);

    // Update all w and b
    L1.update_parameters(alpha, input);
    L2.update_parameters(alpha, L1.a);
}

// Read a big-endian (MSB-first) integer
int MsbInt(char buf[], int len=4)
{
    int base = 1;
    int ret = 0;
    for(int i = len - 1;i >= 0;i--) {
        ret += (unsigned char)buf[i] * base;
        base *= 256;
    }
    return ret;
}

vector<int> ReadMnistLabels(string fileName)
{
    vector<int> ret;
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    if(magic != 0x00000801) {
        cerr << "incorrect label file magic number" << endl;
    }

    // num of images
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);

    while(nImages--) {
        ret.push_back(ifs.get());
    }

    return ret;
}

Vector<INPUT_SIZE> * ReadMnistData(string fileName)
{
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    if(magic != 0x00000803) {
        cerr << "incorrect data file magic number" << endl;
    }

    // num of images
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);
    Vector<INPUT_SIZE> * ret = new Vector<INPUT_SIZE>[nImages];

    int row, col;
    ifs.read(buf, 4);
    row = MsbInt(buf);
    ifs.read(buf, 4);
    col = MsbInt(buf);
    if(row * col != INPUT_SIZE) {
        cerr << "incorrect image size" << endl;
    }

    for(int k = 0;k < nImages;k++) {
        Vector<INPUT_SIZE> &image = ret[k];
        for(int i = 0;i < row * col;i++) {
            image[i] = ifs.get() / 256.0; // normalize to [0, 1)
        }
    }

    return ret;
}

Vector<OUTPUT_SIZE>* translateLabels(vector<int> &labels, int k=10)
{
    int n = labels.size();
    Vector<OUTPUT_SIZE> * ret = new Vector<OUTPUT_SIZE>[n];

    for(int i = 0;i < labels.size();i++) {
        Vector<OUTPUT_SIZE> &tmp = ret[i];
        memset(&tmp, 0, sizeof(tmp));
        assert(labels[i] >= 0 && labels[i] < k);
        tmp[labels[i]] = 1;
    }
    return ret;
}

int getMaxIdx(Vector<OUTPUT_SIZE>& x)
{
    int maxIdx = 0;
    float maxV = x[0];
    for(int i = 0;i < x.size;i++) {
        if(x[i] > maxV) {
            maxV = x[i];
            maxIdx = i;
        }
    }
    return maxIdx;
}

template <typename T>
void CUDA_ALLOC_AND_COPY(T *&to, T *from, size_t size)
{
    CK(cudaMalloc((void**)&to, size));
    CK(cudaMemcpy(to, from, size, cudaMemcpyHostToDevice));
}

int main()
{
    srand(1000);

    if(!InitCUDA()) {
        return -1;
    }

    L1_TYPE *tmpL1 = new L1_TYPE(), *L1;
    CUDA_ALLOC_AND_COPY(L1, tmpL1, sizeof(*L1));
    delete tmpL1;

    L2_TYPE *tmpL2 = new L2_TYPE(), *L2;
    CUDA_ALLOC_AND_COPY(L2, tmpL2, sizeof(*L2));
    delete tmpL2;

    cout << "Loading data" << endl;

    // Load the data sets
    vector<int> trainLabels = ReadMnistLabels("mnist/train-labels-idx1-ubyte");
    int nTrain = trainLabels.size();
    Vector<OUTPUT_SIZE>* trainLabels2 = translateLabels(trainLabels);
    Vector<OUTPUT_SIZE>* trainLabels2OnGpu;
    CUDA_ALLOC_AND_COPY(trainLabels2OnGpu, trainLabels2, sizeof(trainLabels2[0]) * nTrain);

    Vector<INPUT_SIZE>* trainData = ReadMnistData("mnist/train-images-idx3-ubyte");
    Vector<INPUT_SIZE>* trainDataOnGpu;
    CUDA_ALLOC_AND_COPY(trainDataOnGpu, trainData, sizeof(trainData[0]) * nTrain);

    vector<int> testLabels = ReadMnistLabels("mnist/t10k-labels-idx1-ubyte");
    int nTest = testLabels.size();

    Vector<INPUT_SIZE>* testData = ReadMnistData("mnist/t10k-images-idx3-ubyte");
    Vector<INPUT_SIZE>* testDataOnGpu;
    CUDA_ALLOC_AND_COPY(testDataOnGpu, testData, sizeof(testData[0]) * nTest);

    int M = nTrain;
    int T = nTest;

    typedef Softmax<OUTPUT_SIZE> CostFun;

    // Start training
    cout << "Start training" << endl;
    clock_t fullStartedAt = clock();

    Vector<OUTPUT_SIZE> *outputOnCuda;
    CK(cudaMalloc((void**)&outputOnCuda, sizeof(*outputOnCuda)));

    Vector<OUTPUT_SIZE> output;

    // For simplicity, only one block is supported
    #define N_BLOCK 1
    // Multiple threads are allowed
    #define N_THREAD 1024
    #define SHARED_SIZE (sizeof(float) * N_THREAD)

    for(int step = 0;step < 100000;step++) {
        clock_t step_1 = clock();

        float avgError = 0;

        for(int i = 0;i < M;i++) {
            Vector<INPUT_SIZE> &x = trainDataOnGpu[i];
            Vector<OUTPUT_SIZE> &y = trainLabels2OnGpu[i];

            forward<<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, *outputOnCuda);

            CK(cudaMemcpy(&output, outputOnCuda, sizeof(output), cudaMemcpyDeviceToHost));

            Vector<OUTPUT_SIZE> &hostY = trainLabels2[i];
            float error = CostFun::calc_host(output, hostY);
            avgError += error;
            //cout << output << " " << hostY << endl;

            backward<CostFun><<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, y, 0.001);
        }
        avgError /= M;

        clock_t step_2 = clock();

        cout << "step=" << step << " time_cost=" << (step_2 - step_1)*1.0/CLOCKS_PER_SEC << " avgErr=" << avgError << " ";

        // validate on the training set
        int nTotal = 0;
        int nGood = 0;
        for(int i = 0;i < M;i++) {
            Vector<INPUT_SIZE> &x = trainDataOnGpu[i];

            forward<<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, *outputOnCuda);

            CK(cudaMemcpy(&output, outputOnCuda, sizeof(output), cudaMemcpyDeviceToHost));

            int maxIdx = getMaxIdx(output);
            if(maxIdx == trainLabels[i]) {
                nGood++;
            }
            nTotal++;
        }
        cout << "train_accuracy " << nGood << "/" << nTotal << "=" << nGood*1.0/nTotal << " ";
        bool doBreak = false;
        if(nGood * 1.0 / nTotal > 0.95) {
            doBreak = true;
        }

        // check on the test set
        nTotal = 0;
        nGood = 0;
        for(int i = 0;i < T;i++) {
            Vector<INPUT_SIZE> &x = testDataOnGpu[i];

            forward<<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, *outputOnCuda);

            CK(cudaMemcpy(&output, outputOnCuda, sizeof(output), cudaMemcpyDeviceToHost));

            int maxIdx = getMaxIdx(output);
            if(maxIdx == testLabels[i]) {
                nGood++;
            }
            nTotal++;
        }
        cout << "test_accuracy " << nGood << "/" << nTotal << "=" << nGood*1.0/nTotal << " ";

        clock_t step_3 = clock();
        cout << "total_time=" << (step_3-step_1)*1.0/CLOCKS_PER_SEC << endl;
        if(doBreak) {
            break;
        }
    }

    clock_t fullEndedAt = clock();
    cout << "Total cost " << (fullEndedAt - fullStartedAt)/CLOCKS_PER_SEC << " seconds" << endl;

    return 0;
}

 
