pow
方法内部用于计算菲涅耳效果的trace
调用,错误就会消失。目前,我只是将pow(var, 3)
替换为var*var*var
。
[我将尝试创建MVCE以便最终向NVIDIA提交错误报告...
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了NVCC和NVRTC在编译为PTX时的差异相关的知识,希望对你有一定的参考价值。
我正在将基于Scratchapixel version的简单光线跟踪应用程序移植到一堆GPU库中。我已成功使用运行时API和驱动程序API将其移植到CUDA,但是当我尝试将运行时编译的PTX与NVRTC一起使用时,会抛出Segmentation fault (core dumped)
。如果我在内核文件的开头(请参见下文)取消注释#include <math.h>
指令,它仍然可以在NVCC中使用(生成的PTX完全相同),但在使用NVRTC编译时将失败。
我想知道如何使NVRTC像NVCC一样运转(甚至有可能?),或者至少要了解此问题背后的原因。
文件kernel.cu
(内核源::)>
//#include <math.h> #define MAX_RAY_DEPTH 5 template<typename T> class Vec3 { public: T x, y, z; __device__ Vec3() : x(T(0)), y(T(0)), z(T(0)) {} __device__ Vec3(T xx) : x(xx), y(xx), z(xx) {} __device__ Vec3(T xx, T yy, T zz) : x(xx), y(yy), z(zz) {} __device__ Vec3& normalize() { T nor2 = length2(); if (nor2 > 0) { T invNor = 1 / sqrt(nor2); x *= invNor, y *= invNor, z *= invNor; } return *this; } __device__ Vec3<T> operator * (const T &f) const { return Vec3<T>(x * f, y * f, z * f); } __device__ Vec3<T> operator * (const Vec3<T> &v) const { return Vec3<T>(x * v.x, y * v.y, z * v.z); } __device__ T dot(const Vec3<T> &v) const { return x * v.x + y * v.y + z * v.z; } __device__ Vec3<T> operator - (const Vec3<T> &v) const { return Vec3<T>(x - v.x, y - v.y, z - v.z); } __device__ Vec3<T> operator + (const Vec3<T> &v) const { return Vec3<T>(x + v.x, y + v.y, z + v.z); } __device__ Vec3<T>& operator += (const Vec3<T> &v) { x += v.x, y += v.y, z += v.z; return *this; } __device__ Vec3<T>& operator *= (const Vec3<T> &v) { x *= v.x, y *= v.y, z *= v.z; return *this; } __device__ Vec3<T> operator - () const { return Vec3<T>(-x, -y, -z); } __device__ T length2() const { return x * x + y * y + z * z; } __device__ T length() const { return sqrt(length2()); } }; typedef Vec3<float> Vec3f; typedef Vec3<bool> Vec3b; class Sphere { public: const char* id; Vec3f center; /// position of the sphere float radius, radius2; /// sphere radius and radius^2 Vec3f surfaceColor, emissionColor; /// surface color and emission (light) float transparency, reflection; /// surface transparency and reflectivity int animation_frame; Vec3b animation_position_rand; Vec3f animation_position; Sphere( const char* id, const Vec3f &c, const float &r, const Vec3f &sc, const float &refl = 0, const float &transp = 0, const Vec3f &ec = 0) : id(id), center(c), radius(r), radius2(r * r), surfaceColor(sc), emissionColor(ec), transparency(transp), reflection(refl) { animation_frame = 0; } //[comment] // Compute a ray-sphere intersection using the geometric solution //[/comment] __device__ bool intersect(const Vec3f &rayorig, const Vec3f &raydir, float &t0, float &t1) const { Vec3f l = center - rayorig; float tca = l.dot(raydir); if (tca < 0) return false; float d2 = l.dot(l) - tca * tca; if (d2 > radius2) return false; float thc = sqrt(radius2 - d2); t0 = tca - thc; t1 = tca + thc; return true; } }; __device__ float mix(const float &a, const float &b, const float &mixval) { return b * mixval + a * (1 - mixval); } __device__ Vec3f trace( const Vec3f &rayorig, const Vec3f &raydir, const Sphere *spheres, const unsigned int spheres_size, const int &depth) { float tnear = INFINITY; const Sphere* sphere = NULL; // find intersection of this ray with the sphere in the scene for (unsigned i = 0; i < spheres_size; ++i) { float t0 = INFINITY, t1 = INFINITY; if (spheres[i].intersect(rayorig, raydir, t0, t1)) { if (t0 < 0) t0 = t1; if (t0 < tnear) { tnear = t0; sphere = &spheres[i]; } } } // if there's no intersection return black or background color if (!sphere) return Vec3f(2); Vec3f surfaceColor = 0; // color of the ray/surfaceof the object intersected by the ray Vec3f phit = rayorig + raydir * tnear; // point of intersection Vec3f nhit = phit - sphere->center; // normal at the intersection point nhit.normalize(); // normalize normal direction // If the normal and the view direction are not opposite to each other // reverse the normal direction. That also means we are inside the sphere so set // the inside bool to true. Finally reverse the sign of IdotN which we want // positive. float bias = 1e-4; // add some bias to the point from which we will be tracing bool inside = false; if (raydir.dot(nhit) > 0) nhit = -nhit, inside = true; if ((sphere->transparency > 0 || sphere->reflection > 0) && depth < MAX_RAY_DEPTH) { float facingratio = -raydir.dot(nhit); // change the mix value to tweak the effect float fresneleffect = mix(pow(1 - facingratio, 3), 1, 0.1); // compute reflection direction (not need to normalize because all vectors // are already normalized) Vec3f refldir = raydir - nhit * 2 * raydir.dot(nhit); refldir.normalize(); Vec3f reflection = trace(phit + nhit * bias, refldir, spheres, spheres_size, depth + 1); Vec3f refraction = 0; // if the sphere is also transparent compute refraction ray (transmission) if (sphere->transparency) { float ior = 1.1, eta = (inside) ? ior : 1 / ior; // are we inside or outside the surface? float cosi = -nhit.dot(raydir); float k = 1 - eta * eta * (1 - cosi * cosi); Vec3f refrdir = raydir * eta + nhit * (eta * cosi - sqrt(k)); refrdir.normalize(); refraction = trace(phit - nhit * bias, refrdir, spheres, spheres_size, depth + 1); } // the result is a mix of reflection and refraction (if the sphere is transparent) surfaceColor = ( reflection * fresneleffect + refraction * (1 - fresneleffect) * sphere->transparency) * sphere->surfaceColor; } else { // it's a diffuse object, no need to raytrace any further for (unsigned i = 0; i < spheres_size; ++i) { if (spheres[i].emissionColor.x > 0) { // this is a light Vec3f transmission = 1; Vec3f lightDirection = spheres[i].center - phit; lightDirection.normalize(); for (unsigned j = 0; j < spheres_size; ++j) { if (i != j) { float t0, t1; if (spheres[j].intersect(phit + nhit * bias, lightDirection, t0, t1)) { transmission = 0; break; } } } surfaceColor += sphere->surfaceColor * transmission * max(float(0), nhit.dot(lightDirection)) * spheres[i].emissionColor; } } } return surfaceColor + sphere->emissionColor; } extern "C" __global__ void raytrace_kernel(unsigned int width, unsigned int height, Vec3f *image, Sphere *spheres, unsigned int spheres_size, float invWidth, float invHeight, float aspectratio, float angle) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (y < height && x < width) { float xx = (2 * ((x + 0.5) * invWidth) - 1) * angle * aspectratio; float yy = (1 - 2 * ((y + 0.5) * invHeight)) * angle; Vec3f raydir(xx, yy, -1); raydir.normalize(); image[y*width+x] = trace(Vec3f(0), raydir, spheres, spheres_size, 0); } }
我可以使用:
nvcc --ptx kernel.cu -o kernel.ptx
(full PTX here)成功编译它,并使用以下代码段在cuModuleLoadDataEx
中将驱动器API使用该PTX。它按预期工作。
即使我取消注释#include <math.h>
行,它也能正常工作(实际上,生成的PTX完全相同)。
CudaSafeCall( cuInit(0) ); CUdevice device; CudaSafeCall( cuDeviceGet(&device, 0) ); CUcontext context; CudaSafeCall( cuCtxCreate(&context, 0, device) ); unsigned int error_buffer_size = 1024; std::vector<CUjit_option> options; std::vector<void*> values; char* error_log = new char[error_buffer_size]; options.push_back(CU_JIT_ERROR_LOG_BUFFER); //Pointer to a buffer in which to print any log messages that reflect errors values.push_back(error_log); options.push_back(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES); //Log buffer size in bytes. Log messages will be capped at this size (including null terminator) values.push_back(&error_buffer_size); options.push_back(CU_JIT_TARGET_FROM_CUCONTEXT); //Determines the target based on the current attached context (default) values.push_back(0); //No option value required for CU_JIT_TARGET_FROM_CUCONTEXT CUmodule module; CUresult status = cuModuleLoadDataEx(&module, ptxSource, options.size(), options.data(), values.data()); if (error_log && error_log[0]) { //https://stackoverflow.com/a/7970669/3136474 std::cout << "Compiler error: " << error_log << std::endl; } CudaSafeCall( status );
但是,每当我尝试使用NVRTC(full PTX here)编译此确切的内核时,它都能成功编译,但是在调用
Segmentation fault (core dumped)
时给了我一个cuModuleLoadDataEx
(尝试使用生成的PTX时。)>如果我取消注释
#include <math.h>
行,则它在nvrtcCompileProgram
调用中失败,并显示以下输出:
nvrtcSafeBuild() failed at cuda_raytracer_nvrtc_api.cpp:221 : NVRTC_ERROR_COMPILATION Build log: /usr/include/bits/mathcalls.h(177): error: linkage specification is incompatible with previous "isinf" __nv_nvrtc_builtin_header.h(126689): here /usr/include/bits/mathcalls.h(211): error: linkage specification is incompatible with previous "isnan" __nv_nvrtc_builtin_header.h(126686): here 2 errors detected in the compilation of "kernel.cu".
我用于NVRTC编译的代码是:
nvrtcProgram prog; NvrtcSafeCall( nvrtcCreateProgram(&prog, kernelSource, "kernel.cu", 0, NULL, NULL) ); // https://docs.nvidia.com/cuda/nvrtc/index.html#group__options std::vector<const char*> compilationOpts; compilationOpts.push_back("--device-as-default-execution-space"); // NvrtcSafeBuild is a macro which automatically prints nvrtcGetProgramLog if the compilation fails NvrtcSafeBuild( nvrtcCompileProgram(prog, compilationOpts.size(), compilationOpts.data()), prog ); size_t ptxSize; NvrtcSafeCall( nvrtcGetPTXSize(prog, &ptxSize) ); char* ptxSource = new char[ptxSize]; NvrtcSafeCall( nvrtcGetPTX(prog, ptxSource) ); NvrtcSafeCall( nvrtcDestroyProgram(&prog) );
然后,我使用前面的代码片段简单地加载
ptxSource
(注意:该代码块与驱动程序API版本和NVRTC版本都使用相同。)>到目前为止我已经注意到/尝试过的其他事项
- PTX generated by the NVCC和the one generated by NVRTC完全不同,但是我无法理解它们以识别可能的问题。
--ftz=false --prec-sqrt=true --prec-div=true --fmad=false
中的选项nvrtcCompileProgram
)。 PTX文件变大了,但仍然Segfaulting。--std=c++11
或--std=c++14
添加到NVRTC编译器选项。使用NVRTC中的任何一个,NVRTC都会生成几乎为空的(4行)PTX,但在尝试使用它之前不会发出警告或错误。nvcc --version
:Cuda编译工具,版本10.1,V10.1.168。建立于Wed_Apr_24_19:10:27_PDT_2019gcc --version
:gcc(Ubuntu 7.5.0-3ubuntu1〜18.04)7.5.0我忘记添加我的环境。请参阅上一节。
您还可以使用ptxas编译nvrtc输出吗? – @talonmies的评论
nvcc
生成的PTX编译时显示警告:$ ptxas -o /tmp/temp_ptxas_output.o kernel.ptx ptxas warning : Stack size for entry function 'raytrace_kernel' cannot be statically determined
这归因于递归内核函数(more on that)。可以安全地忽略它。
nvrtc
生成的PTX会进行编译,并发出错误:
$ ptxas -o /tmp/temp_ptxas_output.o nvrtc_kernel.ptx
ptxas fatal : Unresolved extern function '_Z5powiffi'
基于this question,我向__device__
类构造函数添加了Sphere
,并删除了--device-as-default-execution-space
编译器选项。它现在生成的PTX稍有不同,但是仍然会出现相同的错误。
现在使用#include <math.h>
进行编译会产生很多“没有执行空间注释的函数被认为是宿主函数,并且在JIT模式下不允许使用宿主函数。”警告,除了先前的错误。
如果我尝试使用accepted solution of the question,则会抛出一系列语法错误,并且无法编译。 NVCC仍然可以正常工作。
总结,我正在将基于Scratchapixel版本的简单光线跟踪应用程序移植到一堆GPU库中。我使用运行时API和驱动程序API成功地将其移植到CUDA,但它...
pow
方法内部用于计算菲涅耳效果的trace
调用,错误就会消失。目前,我只是将pow(var, 3)
替换为var*var*var
。
[我将尝试创建MVCE以便最终向NVIDIA提交错误报告...
pow
方法内部用于计算菲涅耳效果的trace
调用,错误就会消失。目前,我只是将pow(var, 3)
替换为var*var*var
。
[我将尝试创建MVCE以便最终向NVIDIA提交错误报告...
以上是关于NVCC和NVRTC在编译为PTX时的差异的主要内容,如果未能解决你的问题,请参考以下文章
PTX 和 CUBIN w.r.t 有啥区别? NVCC 编译器?