linux利用CMakeLists编译cuda程序

Posted BlueOceans

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了linux利用CMakeLists编译cuda程序相关的知识,希望对你有一定的参考价值。

文件目录:

cudaTest

    |--utils.cu

    |--utils.h

    |--squaresum.cu

    |--squaresum.h

    |--test.cpp

    |--CMakeLists.txt

编译命令:

$cd /root/cudaTest

$mkdir build

$cd build

$cmake ..

$make

调用关系:

utils:提供常用工具,这里提供查询设备信息功能;

squaresum:计算平方和功能,为cuda运行的核心函数实现

test:调用平方和函数

CMakeLists.txt:组织所有文件编译生成可执行文件

注意:在 .cpp 文件中调用 .cu 文件里定义的函数时,需要在头文件中把该函数声明为 extern "C"。

文件内容:

CMakeLists.txt

# CMakeLists.txt: builds the "squaresum" executable from test.cpp,
# squaresum.cu and utils.cu.
cmake_minimum_required(VERSION 2.8)
# Locate the CUDA toolkit. REQUIRED stops configuration when it is missing;
# QUIET suppresses the informational search output.
find_package(CUDA QUIET REQUIRED)
 
# Specify the binary name and the source files to build it from.
# (The commented-out lines are an alternative that would build utils as a
# separate library and link it in.)
#add_library(utils utils.cpp)
cuda_add_executable(
    squaresum
    test.cpp squaresum.cu utils.cu)
#target_link_libraries(squaresum utils)

test.cpp

#include <iostream>
#include "squaresum.h"

//extern "C" int squaresum();

// Program entry point: runs the squaresum() demo once and exits with status 0.
int main()
{
  // The demo reports everything on stdout/stderr; its return value is unused.
  (void)squaresum();
  return 0;
}

squaresum.h

#include "utils.h"
#include <cuda_runtime.h>

// Declared with C linkage so the symbol defined in squaresum.cu keeps an
// unmangled name for callers such as test.cpp.
extern "C" {
  // Runs the sum-of-squares demo on the GPU and on the CPU and prints both
  // results together with their timings.
  int squaresum();
}

squaresum.cu

#include <stdio.h>
#include <stdlib.h>
//#include "utils.h"
#include <iostream>
#include "squaresum.h"
// ======== define area ========
// Number of int elements in the input array.
#define DATA_SIZE 1048576 // 1M (2^20) elements

// ======== global area ========
// Host-side input buffer: filled by generateData() and copied to the GPU
// inside squaresum().
int data[DATA_SIZE];

// Kernel: adds up the squares of the first DATA_SIZE ints in `data`.
//   data - device pointer to at least DATA_SIZE ints (read only)
//   sum  - device pointer receiving the total
//   time - device pointer receiving the clock() ticks spent in the loop
// The loop does not depend on threadIdx/blockIdx, so every launched thread
// walks the whole array serially and writes the same outputs.
__global__ static void squaresSum(int *data, int *sum, clock_t *time)
{
 int acc = 0;
 const clock_t begin = clock();
 int idx = 0;
 while (idx < DATA_SIZE) {
  const int value = data[idx];
  acc += value * value;
  ++idx;
 }
 *sum = acc;
 *time = clock() - begin;
}

// ======== random input generator ========
// Fills data[0 .. size-1] with pseudo-random digits in the range 0..9, drawing
// one rand() value per element in ascending index order. Does nothing when
// size <= 0.
void generateData(int *data, int size)
{
 int idx = 0;
 while (idx < size) {
  const int digit = rand() % 10;
  data[idx] = digit;
  ++idx;
 }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char *what)
{
 if (status == cudaSuccess) {
  return true;
 }
 fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
 return false;
}

// Runs the sum-of-squares demo: fills the global `data` buffer with random
// digits, sums their squares on the GPU (single-thread kernel) and again on
// the CPU, and prints both sums with their timings.
//
// Note: the GPU time is the kernel's own clock() tick count while the CPU time
// is the host's clock() tick count, so the two numbers are not in the same
// unit.
//
// Returns 0 on success. Also returns 0 when InitCUDA() fails (original
// behaviour, kept for existing callers). Returns -1 when a CUDA runtime call
// fails; in that case the error is written to stderr and nothing is printed
// for the GPU/CPU comparison.
int squaresum()
{
 // init CUDA device
 if (!InitCUDA()) {
  return 0;
 }
 printf("CUDA initialized.\n");

 // generate random input
 generateData(data, DATA_SIZE);

 // Device buffers start as NULL so the unconditional cudaFree() calls below
 // are harmless for anything that was never allocated.
 int *gpuData = NULL;
 int *sum = NULL;
 clock_t *time = NULL;

 // Host-side results, initialised so they are never read uninitialised.
 int result = 0;
 clock_t time_used = 0;

 // allocate GPU memory and upload the input; stop at the first failure
 bool ok = cudaCallOk(cudaMalloc((void**) &gpuData, sizeof(int) * DATA_SIZE), "cudaMalloc(gpuData)")
  && cudaCallOk(cudaMalloc((void**) &sum, sizeof(int)), "cudaMalloc(sum)")
  && cudaCallOk(cudaMalloc((void**) &time, sizeof(clock_t)), "cudaMalloc(time)")
  && cudaCallOk(cudaMemcpy(gpuData, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

 if (ok) {
  // calculate the sum of squares on the GPU
  squaresSum<<<1, 1, 0>>>(gpuData, sum, time);

  // A launch does not return an error itself; query it explicitly. Errors
  // raised while the kernel runs surface in the blocking copies that follow.
  ok = cudaCallOk(cudaGetLastError(), "squaresSum launch")
   && cudaCallOk(cudaMemcpy(&result, sum, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(sum)")
   && cudaCallOk(cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost), "cudaMemcpy(time)");
 }

 // free GPU memory on every path (success or failure)
 cudaFree(gpuData);
 cudaFree(sum);
 cudaFree(time);

 if (!ok) {
  return -1;
 }

 // print GPU result; clock_t is cast because its exact type is
 // implementation-defined while the format expects long
 printf("(GPU) sum:%d time:%ld\n", result, (long) time_used);

 // CPU reference calculation
 result = 0;
 clock_t start = clock();
 for (int i = 0; i < DATA_SIZE; ++i) {
  result += data[i] * data[i];
 }
 time_used = clock() - start;
 printf("(CPU) sum:%d time:%ld\n", result, (long) time_used);

 return 0;
}

utils.h

#include <stdio.h>
#include <cuda_runtime.h>

// Declared with C linkage so the symbol defined in utils.cu keeps an
// unmangled name.
extern "C" {
  // Looks for a usable CUDA device and selects it.
  // Returns true when a device was selected, false when none was found.
  bool InitCUDA();
}

utils.cu

#include "utils.h"
#include <cuda_runtime.h>
#include <iostream>

// Prints a selection of fields from a cudaDeviceProp structure to stdout,
// one per line.
//   prop - device properties as filled in by cudaGetDeviceProperties()
// The size_t members (totalGlobalMem, sharedMemPerBlock, memPitch,
// totalConstMem, textureAlignment) are printed with %zu; using %d for them is
// undefined behaviour and truncates values on 64-bit platforms.
void printDeviceProp(const cudaDeviceProp &prop)
{
 printf("Device Name : %s.\n", prop.name);
 printf("totalGlobalMem : %zu.\n", prop.totalGlobalMem);
 printf("sharedMemPerBlock : %zu.\n", prop.sharedMemPerBlock);
 printf("regsPerBlock : %d.\n", prop.regsPerBlock);
 printf("warpSize : %d.\n", prop.warpSize);
 printf("memPitch : %zu.\n", prop.memPitch);
 printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
 printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
 printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
 printf("totalConstMem : %zu.\n", prop.totalConstMem);
 printf("major.minor : %d.%d.\n", prop.major, prop.minor);
 printf("clockRate : %d.\n", prop.clockRate);
 printf("textureAlignment : %zu.\n", prop.textureAlignment);
 printf("deviceOverlap : %d.\n", prop.deviceOverlap);
 printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}

// Selects the first CUDA device whose compute-capability major version is at
// least 1 and makes it the current device.
//
// Side effects: prints the device count to stdout, prints the chosen device's
// properties via printDeviceProp(), and writes diagnostics to stderr.
//
// Returns true when a device was found and selected; false when the device
// count could not be obtained, no device qualifies, or cudaSetDevice() fails.
bool InitCUDA()
{
 // Number of CUDA devices; zero-initialised so a failed query below is
 // handled as "no devices" instead of reading an indeterminate value.
 int count = 0;

 // get the cuda device count
 cudaError_t status = cudaGetDeviceCount(&count);
 if (status != cudaSuccess) {
  fprintf(stderr, "cudaGetDeviceCount failed: %s.\n", cudaGetErrorString(status));
  count = 0;
 }
 std::cout << count << std::endl;
 if (count == 0) {
  fprintf(stderr, "There is no device.\n");
  return false;
 }

 // find the first device with compute capability >= 1.x; devices whose
 // properties cannot be queried are skipped
 int i;
 for (i = 0; i < count; ++i) {
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
   if (prop.major >= 1) {
    printDeviceProp(prop);
    break;
   }
  }
 }

 // the loop ran to the end without a match
 if (i == count) {
  fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
  return false;
 }

 // make the chosen device current and report a failure instead of ignoring it
 status = cudaSetDevice(i);
 if (status != cudaSuccess) {
  fprintf(stderr, "cudaSetDevice(%d) failed: %s.\n", i, cudaGetErrorString(status));
  return false;
 }

 return true;
}

//int main(){
//  InitCUDA();
//}

 

以上是关于linux利用CMakeLists编译cuda程序的主要内容,如果未能解决你的问题,请参考以下文章

CMake 3.x + CUDA - 编译失败

使用CMake编译C/C++程序

编译依赖ndt_gpu库的包,遇到Eigen报错

使用C ++ 11时CUDA nvcc编译器失败(Linux; clang 3.8)

CUDA 中的编译时信息

即使在构建可执行文件时,CMake 3.0 + Fortran + CUDA也需要-fPIC