Spaces:
Runtime error
Runtime error
| // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. | |
| // | |
| // NVIDIA CORPORATION and its licensors retain all intellectual property | |
| // and proprietary rights in and to this software, related documentation | |
| // and any modifications thereto. Any use, reproduction, disclosure or | |
| // distribution of this software and related documentation without an express | |
| // license agreement from NVIDIA CORPORATION is strictly prohibited. | |
| //------------------------------------------------------------------------ | |
| // Helpers. | |
// Round x up so that it is representable with at most y bits of mantissa
// (i.e., at most y significant leading bits). Values below 2^y pass through.
static int ROUND_UP_BITS(uint32_t x, uint32_t y)
{
    // Small values already fit in y bits.
    if (x < (1u << y))
        return x;

    // Find the smallest power of two strictly above the highest set bit of x;
    // (low - 1) is then an all-ones mask covering x.
    uint32_t low = 1u;
    while (x & ~(low - 1u))
        low <<= 1;

    // Keep the top y bit positions; the remaining mask marks bits to drop.
    low = (low - 1u) >> y;

    // If any dropped bit is set, round up by saturating the dropped bits and
    // carrying into the kept ones; otherwise x is already representable.
    return (x & low) ? (int)((x | low) + 1u) : (int)x;
}
| //------------------------------------------------------------------------ | |
| // Draw command struct used by rasterizer. | |
struct GLDrawCmd
{
    // Field order and widths mirror OpenGL's DrawElementsIndirectCommand
    // layout consumed by glMultiDrawElementsIndirect() — do not reorder.
    uint32_t count;         // Number of indices to draw.
    uint32_t instanceCount; // Number of instances (always 1 here).
    uint32_t firstIndex;    // Starting offset into the bound element array.
    uint32_t baseVertex;    // Constant added to each fetched index.
    uint32_t baseInstance;  // Carries the triangle ID offset into the vertex shader via gl_BaseInstanceARB.
};
| //------------------------------------------------------------------------ | |
| // GL helpers. | |
| static void compileGLShader(NVDR_CTX_ARGS, const RasterizeGLState& s, GLuint* pShader, GLenum shaderType, const char* src_buf) | |
| { | |
| std::string src(src_buf); | |
| // Set preprocessor directives. | |
| int n = src.find('\n') + 1; // After first line containing #version directive. | |
| if (s.enableZModify) | |
| src.insert(n, "#define IF_ZMODIFY(x) x\n"); | |
| else | |
| src.insert(n, "#define IF_ZMODIFY(x)\n"); | |
| const char *cstr = src.c_str(); | |
| *pShader = 0; | |
| NVDR_CHECK_GL_ERROR(*pShader = glCreateShader(shaderType)); | |
| NVDR_CHECK_GL_ERROR(glShaderSource(*pShader, 1, &cstr, 0)); | |
| NVDR_CHECK_GL_ERROR(glCompileShader(*pShader)); | |
| } | |
| static void constructGLProgram(NVDR_CTX_ARGS, GLuint* pProgram, GLuint glVertexShader, GLuint glGeometryShader, GLuint glFragmentShader) | |
| { | |
| *pProgram = 0; | |
| GLuint glProgram = 0; | |
| NVDR_CHECK_GL_ERROR(glProgram = glCreateProgram()); | |
| NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glVertexShader)); | |
| NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glGeometryShader)); | |
| NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glFragmentShader)); | |
| NVDR_CHECK_GL_ERROR(glLinkProgram(glProgram)); | |
| GLint linkStatus = 0; | |
| NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_LINK_STATUS, &linkStatus)); | |
| if (!linkStatus) | |
| { | |
| GLint infoLen = 0; | |
| NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_INFO_LOG_LENGTH, &infoLen)); | |
| if (infoLen) | |
| { | |
| const char* hdr = "glLinkProgram() failed:\n"; | |
| std::vector<char> info(strlen(hdr) + infoLen); | |
| strcpy(&info[0], hdr); | |
| NVDR_CHECK_GL_ERROR(glGetProgramInfoLog(glProgram, infoLen, &infoLen, &info[strlen(hdr)])); | |
| NVDR_CHECK(0, &info[0]); | |
| } | |
| NVDR_CHECK(0, "glLinkProgram() failed"); | |
| } | |
| *pProgram = glProgram; | |
| } | |
| //------------------------------------------------------------------------ | |
| // Shared C++ functions. | |
// Initialize the rasterizer's GL state: create a GL context on the given Cuda
// device, compile and link the shader programs, and create the framebuffer,
// vertex array, and buffer/texture objects that remain bound for the lifetime
// of the context. Storage for buffers and textures is allocated later in
// rasterizeResizeBuffers().
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx)
{
    // Create GL context and set it current.
    s.glctx = createGLContext(cudaDeviceIdx);
    setGLContext(s.glctx);

    // Version check.
    GLint vMajor = 0;
    GLint vMinor = 0;
    glGetIntegerv(GL_MAJOR_VERSION, &vMajor);
    glGetIntegerv(GL_MINOR_VERSION, &vMinor);
    glGetError(); // Clear possible GL_INVALID_ENUM error in version query.
    LOG(INFO) << "OpenGL version reported as " << vMajor << "." << vMinor;
    NVDR_CHECK((vMajor == 4 && vMinor >= 4) || vMajor > 4, "OpenGL 4.4 or later is required");

    // Enable depth modification workaround on A100 and later (compute capability 8+).
    // This activates the IF_ZMODIFY(...) sections in the shaders below.
    int capMajor = 0;
    NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&capMajor, cudaDevAttrComputeCapabilityMajor, cudaDeviceIdx));
    s.enableZModify = (capMajor >= 8);

    // Number of output buffers. A second output carries bary differentials when enabled.
    int num_outputs = s.enableDB ? 2 : 1;

    // Set up vertex shader. Passes position through unchanged, and forwards the
    // draw ID (output layer) and base instance (triangle ID offset) to the
    // geometry shader.
    compileGLShader(NVDR_CTX_PARAMS, s, &s.glVertexShader, GL_VERTEX_SHADER,
        "#version 330\n"
        "#extension GL_ARB_shader_draw_parameters : enable\n"
        STRINGIFY_SHADER_SOURCE(
            layout(location = 0) in vec4 in_pos;
            out int v_layer;
            out int v_offset;
            void main()
            {
                int layer = gl_DrawIDARB;
                gl_Position = in_pos;
                v_layer = layer;
                v_offset = gl_BaseInstanceARB; // Sneak in TriID offset here.
            }
        )
    );

    // Geometry and fragment shaders depend on if bary differential output is enabled or not.
    if (s.enableDB)
    {
        // Set up geometry shader. Calculation of per-pixel bary differentials is based on:
        // u = (u/w) / (1/w)
        // --> du/dX = d((u/w) / (1/w))/dX
        // --> du/dX = [d(u/w)/dX - u*d(1/w)/dX] * w
        // and we know both d(u/w)/dX and d(1/w)/dX are constant over triangle.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glGeometryShader, GL_GEOMETRY_SHADER,
            "#version 430\n"
            STRINGIFY_SHADER_SOURCE(
                layout(triangles) in;
                layout(triangle_strip, max_vertices=3) out;
                layout(location = 0) uniform vec2 vp_scale;
                in int v_layer[];
                in int v_offset[];
                out vec4 var_uvzw;
                out vec4 var_db;
                void main()
                {
                    // Plane equations for bary differentials.
                    float w0 = gl_in[0].gl_Position.w;
                    float w1 = gl_in[1].gl_Position.w;
                    float w2 = gl_in[2].gl_Position.w;
                    vec2 p0 = gl_in[0].gl_Position.xy;
                    vec2 p1 = gl_in[1].gl_Position.xy;
                    vec2 p2 = gl_in[2].gl_Position.xy;
                    vec2 e0 = p0*w2 - p2*w0;
                    vec2 e1 = p1*w2 - p2*w1;
                    float a = e0.x*e1.y - e0.y*e1.x;

                    // Clamp area to an epsilon to avoid arbitrarily high bary differentials.
                    float eps = 1e-6f; // ~1 pixel in 1k x 1k image.
                    float ca = (abs(a) >= eps) ? a : (a < 0.f) ? -eps : eps; // Clamp with sign.
                    float ia = 1.f / ca; // Inverse area.

                    vec2 ascl = ia * vp_scale;
                    float dudx = e1.y * ascl.x;
                    float dudy = -e1.x * ascl.y;
                    float dvdx = -e0.y * ascl.x;
                    float dvdy = e0.x * ascl.y;

                    float duwdx = w2 * dudx;
                    float dvwdx = w2 * dvdx;
                    float duvdx = w0 * dudx + w1 * dvdx;
                    float duwdy = w2 * dudy;
                    float dvwdy = w2 * dvdy;
                    float duvdy = w0 * dudy + w1 * dvdy;

                    vec4 db0 = vec4(duvdx - dvwdx, duvdy - dvwdy, dvwdx, dvwdy);
                    vec4 db1 = vec4(duwdx, duwdy, duvdx - duwdx, duvdy - duwdy);
                    vec4 db2 = vec4(duwdx, duwdy, dvwdx, dvwdy);

                    int layer_id = v_layer[0];
                    int prim_id = gl_PrimitiveIDIn + v_offset[0];

                    gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_db = db0; EmitVertex();
                    gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_db = db1; EmitVertex();
                    gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_db = db2; EmitVertex();
                }
            )
        );

        // Set up fragment shader. Outputs (u, v, z/w, triangle ID) plus the
        // bary differentials scaled back by w.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShader, GL_FRAGMENT_SHADER,
            "#version 430\n"
            STRINGIFY_SHADER_SOURCE(
                in vec4 var_uvzw;
                in vec4 var_db;
                layout(location = 0) out vec4 out_raster;
                layout(location = 1) out vec4 out_db;
                IF_ZMODIFY(
                    layout(location = 1) uniform float in_dummy;
                )
                void main()
                {
                    int id_int = gl_PrimitiveID + 1;
                    float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int);

                    out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, id_float);
                    out_db = var_db * var_uvzw.w;
                    IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;)
                }
            )
        );

        // Set up fragment shader for depth peeling. Same as above, but discards
        // fragments not strictly behind the previous pass's output.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShaderDP, GL_FRAGMENT_SHADER,
            "#version 430\n"
            STRINGIFY_SHADER_SOURCE(
                in vec4 var_uvzw;
                in vec4 var_db;
                layout(binding = 0) uniform sampler2DArray out_prev;
                layout(location = 0) out vec4 out_raster;
                layout(location = 1) out vec4 out_db;
                IF_ZMODIFY(
                    layout(location = 1) uniform float in_dummy;
                )
                void main()
                {
                    int id_int = gl_PrimitiveID + 1;
                    float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int);

                    vec4 prev = texelFetch(out_prev, ivec3(gl_FragCoord.x, gl_FragCoord.y, gl_Layer), 0);
                    float depth_new = var_uvzw.z / var_uvzw.w;
                    if (prev.w == 0 || depth_new <= prev.z)
                        discard;

                    out_raster = vec4(var_uvzw.x, var_uvzw.y, depth_new, id_float);
                    out_db = var_db * var_uvzw.w;
                    IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;)
                }
            )
        );
    }
    else
    {
        // Geometry shader without bary differential output.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glGeometryShader, GL_GEOMETRY_SHADER,
            "#version 330\n"
            STRINGIFY_SHADER_SOURCE(
                layout(triangles) in;
                layout(triangle_strip, max_vertices=3) out;
                in int v_layer[];
                in int v_offset[];
                out vec4 var_uvzw;
                void main()
                {
                    int layer_id = v_layer[0];
                    int prim_id = gl_PrimitiveIDIn + v_offset[0];

                    gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); EmitVertex();
                    gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); EmitVertex();
                    gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); EmitVertex();
                }
            )
        );

        // Fragment shader without bary differential output.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShader, GL_FRAGMENT_SHADER,
            "#version 430\n"
            STRINGIFY_SHADER_SOURCE(
                in vec4 var_uvzw;
                layout(location = 0) out vec4 out_raster;
                IF_ZMODIFY(
                    layout(location = 1) uniform float in_dummy;
                )
                void main()
                {
                    int id_int = gl_PrimitiveID + 1;
                    float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int);

                    out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, id_float);
                    IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;)
                }
            )
        );

        // Depth peeling variant of fragment shader.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShaderDP, GL_FRAGMENT_SHADER,
            "#version 430\n"
            STRINGIFY_SHADER_SOURCE(
                in vec4 var_uvzw;
                layout(binding = 0) uniform sampler2DArray out_prev;
                layout(location = 0) out vec4 out_raster;
                IF_ZMODIFY(
                    layout(location = 1) uniform float in_dummy;
                )
                void main()
                {
                    int id_int = gl_PrimitiveID + 1;
                    float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int);

                    vec4 prev = texelFetch(out_prev, ivec3(gl_FragCoord.x, gl_FragCoord.y, gl_Layer), 0);
                    float depth_new = var_uvzw.z / var_uvzw.w;
                    if (prev.w == 0 || depth_new <= prev.z)
                        discard;

                    out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, id_float);
                    IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;)
                }
            )
        );
    }

    // Finalize programs: normal and depth-peeling variants share the vertex and
    // geometry shaders but use different fragment shaders.
    constructGLProgram(NVDR_CTX_PARAMS, &s.glProgram, s.glVertexShader, s.glGeometryShader, s.glFragmentShader);
    constructGLProgram(NVDR_CTX_PARAMS, &s.glProgramDP, s.glVertexShader, s.glGeometryShader, s.glFragmentShaderDP);

    // Construct main fbo and bind permanently.
    NVDR_CHECK_GL_ERROR(glGenFramebuffers(1, &s.glFBO));
    NVDR_CHECK_GL_ERROR(glBindFramebuffer(GL_FRAMEBUFFER, s.glFBO));

    // Enable two color attachments.
    GLenum draw_buffers[2] = { GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1 };
    NVDR_CHECK_GL_ERROR(glDrawBuffers(num_outputs, draw_buffers));

    // Construct vertex array object.
    NVDR_CHECK_GL_ERROR(glGenVertexArrays(1, &s.glVAO));
    NVDR_CHECK_GL_ERROR(glBindVertexArray(s.glVAO));

    // Construct position buffer, bind permanently, enable, set ptr.
    NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glPosBuffer));
    NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ARRAY_BUFFER, s.glPosBuffer));
    NVDR_CHECK_GL_ERROR(glEnableVertexAttribArray(0));
    NVDR_CHECK_GL_ERROR(glVertexAttribPointer(0, 4, GL_FLOAT, GL_FALSE, 0, 0));

    // Construct index buffer and bind permanently.
    NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glTriBuffer));
    NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, s.glTriBuffer));

    // Set up depth test.
    NVDR_CHECK_GL_ERROR(glEnable(GL_DEPTH_TEST));
    NVDR_CHECK_GL_ERROR(glDepthFunc(GL_LESS));
    NVDR_CHECK_GL_ERROR(glClearDepth(1.0));

    // Create and bind output buffers. Storage is allocated later.
    NVDR_CHECK_GL_ERROR(glGenTextures(num_outputs, s.glColorBuffer));
    for (int i=0; i < num_outputs; i++)
    {
        NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i]));
        NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, s.glColorBuffer[i], 0));
    }

    // Create and bind depth/stencil buffer. Storage is allocated later.
    NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glDepthStencilBuffer));
    NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer));
    NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, s.glDepthStencilBuffer, 0));

    // Create texture name for previous output buffer (depth peeling). Storage
    // is allocated lazily in rasterizeRender() when peeling is first used.
    NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glPrevOutBuffer));
}
// Grow the GL-side vertex/triangle buffers and framebuffer textures so they can
// hold at least the requested counts/dimensions. Sizes only ever grow. Sets
// 'changes' to true if any reallocation happened so the caller can react
// (e.g., re-upload constant data). Relies on the buffer/texture bindings
// established permanently in rasterizeInitGLContext().
void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, bool& changes, int posCount, int triCount, int width, int height, int depth)
{
    changes = false;

    // Resize vertex buffer?
    if (posCount > s.posCount)
    {
        // Must unregister from Cuda before reallocating the GL storage.
        if (s.cudaPosBuffer)
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPosBuffer));

        // Round up to at most 2 mantissa bits of size (min 64) to amortize reallocations.
        s.posCount = (posCount > 64) ? ROUND_UP_BITS(posCount, 2) : 64;
        LOG(INFO) << "Increasing position buffer size to " << s.posCount << " float32";
        NVDR_CHECK_GL_ERROR(glBufferData(GL_ARRAY_BUFFER, s.posCount * sizeof(float), NULL, GL_DYNAMIC_DRAW));
        NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaPosBuffer, s.glPosBuffer, cudaGraphicsRegisterFlagsWriteDiscard));
        changes = true;
    }

    // Resize triangle buffer?
    if (triCount > s.triCount)
    {
        if (s.cudaTriBuffer)
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaTriBuffer));

        s.triCount = (triCount > 64) ? ROUND_UP_BITS(triCount, 2) : 64;
        LOG(INFO) << "Increasing triangle buffer size to " << s.triCount << " int32";
        NVDR_CHECK_GL_ERROR(glBufferData(GL_ELEMENT_ARRAY_BUFFER, s.triCount * sizeof(int32_t), NULL, GL_DYNAMIC_DRAW));
        NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaTriBuffer, s.glTriBuffer, cudaGraphicsRegisterFlagsWriteDiscard));
        changes = true;
    }

    // Resize framebuffer?
    if (width > s.width || height > s.height || depth > s.depth)
    {
        int num_outputs = s.enableDB ? 2 : 1;

        // Unregister all color buffers (and the peeling buffer, whose storage
        // will be recreated lazily at the new size) before reallocating.
        if (s.cudaColorBuffer[0])
            for (int i=0; i < num_outputs; i++)
                NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaColorBuffer[i]));

        if (s.cudaPrevOutBuffer)
        {
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPrevOutBuffer));
            s.cudaPrevOutBuffer = 0;
        }

        // New framebuffer size. Width/height rounded up to multiples of 32.
        s.width  = (width > s.width)   ? width  : s.width;
        s.height = (height > s.height) ? height : s.height;
        s.depth  = (depth > s.depth)   ? depth  : s.depth;
        s.width  = ROUND_UP(s.width, 32);
        s.height = ROUND_UP(s.height, 32);
        LOG(INFO) << "Increasing frame buffer size to (width, height, depth) = (" << s.width << ", " << s.height << ", " << s.depth << ")";

        // Allocate color buffers.
        for (int i=0; i < num_outputs; i++)
        {
            NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i]));
            NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_NEAREST));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_NEAREST));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE));
        }

        // Allocate depth/stencil buffer.
        NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer));
        NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_DEPTH24_STENCIL8, s.width, s.height, s.depth, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 0));

        // (Re-)register all GL buffers into Cuda.
        for (int i=0; i < num_outputs; i++)
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaColorBuffer[i], s.glColorBuffer[i], GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly));

        changes = true;
    }
}
// Render one batch of triangles into the layered framebuffer.
// - posPtr/triPtr: Cuda device pointers to positions (float4 per vertex) and
//   triangle indices. triPtr may be null if triangles were copied previously.
// - rangesPtr: optional host pointer to (start, count) triangle ranges per
//   output layer; null means instanced rendering of the same triangles.
// - peeling_idx: <1 for a normal render / first depth-peeling iteration;
//   >=1 for subsequent peeling passes that read the previous output.
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx)
{
    // Only copy inputs if we are on first iteration of depth peeling or not doing it at all.
    if (peeling_idx < 1)
    {
        if (triPtr)
        {
            // Copy both position and triangle buffers.
            // NOTE(review): mapping 2 resources via &s.cudaPosBuffer assumes
            // cudaPosBuffer and cudaTriBuffer are adjacent fields in
            // RasterizeGLState — confirm against the struct declaration.
            void* glPosPtr = NULL;
            void* glTriPtr = NULL;
            size_t posBytes = 0;
            size_t triBytes = 0;
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(2, &s.cudaPosBuffer, stream));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glTriPtr, &triBytes, s.cudaTriBuffer));
            NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch");
            NVDR_CHECK(triBytes >= triCount * sizeof(int32_t), "mapped GL triangle buffer size mismatch");
            NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream));
            NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glTriPtr, triPtr, triCount * sizeof(int32_t), cudaMemcpyDeviceToDevice, stream));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(2, &s.cudaPosBuffer, stream));
        }
        else
        {
            // Copy position buffer only. Triangles are already copied and known to be constant.
            void* glPosPtr = NULL;
            size_t posBytes = 0;
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(1, &s.cudaPosBuffer, stream));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer));
            NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch");
            NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(1, &s.cudaPosBuffer, stream));
        }
    }

    // Select program based on whether we have a depth peeling input or not.
    if (peeling_idx < 1)
    {
        // Normal case: No peeling, or peeling disabled.
        NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgram));
    }
    else
    {
        // If we don't have a third buffer yet, create one. Same format and
        // parameters as the color buffers in rasterizeResizeBuffers().
        if (!s.cudaPrevOutBuffer)
        {
            NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glPrevOutBuffer));
            NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_NEAREST));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_NEAREST));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaPrevOutBuffer, s.glPrevOutBuffer, GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly));
        }

        // Swap the GL buffers: previous pass's output becomes the input, its
        // old input becomes the new render target.
        GLuint glTempBuffer = s.glPrevOutBuffer;
        s.glPrevOutBuffer = s.glColorBuffer[0];
        s.glColorBuffer[0] = glTempBuffer;

        // Swap the Cuda buffers to stay in sync with the GL swap above.
        cudaGraphicsResource_t cudaTempBuffer = s.cudaPrevOutBuffer;
        s.cudaPrevOutBuffer = s.cudaColorBuffer[0];
        s.cudaColorBuffer[0] = cudaTempBuffer;

        // Bind the new output buffer.
        NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[0]));
        NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, s.glColorBuffer[0], 0));

        // Bind old buffer as the input texture (sampler binding 0 = out_prev).
        NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glPrevOutBuffer));

        // Activate the correct program.
        NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgramDP));
    }

    // Set viewport, clear color buffer(s) and depth/stencil buffer.
    NVDR_CHECK_GL_ERROR(glViewport(0, 0, width, height));
    NVDR_CHECK_GL_ERROR(glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT));

    // If outputting bary differentials, set resolution uniform (location 0 = vp_scale).
    if (s.enableDB)
        NVDR_CHECK_GL_ERROR(glUniform2f(0, 2.f / (float)width, 2.f / (float)height));

    // Set the dummy uniform (location 1 = in_dummy) if depth modification workaround is active.
    if (s.enableZModify)
        NVDR_CHECK_GL_ERROR(glUniform1f(1, 0.f));

    // Render the meshes.
    if (depth == 1 && !rangesPtr)
    {
        // Trivial case: single layer, all triangles, one direct draw call.
        NVDR_CHECK_GL_ERROR(glDrawElements(GL_TRIANGLES, triCount, GL_UNSIGNED_INT, 0));
    }
    else
    {
        // Populate a buffer for draw commands and execute it.
        std::vector<GLDrawCmd> drawCmdBuffer(depth);
        if (!rangesPtr)
        {
            // Fill in range array to instantiate the same triangles for each output layer.
            // Triangle IDs starts at zero (i.e., one) for each layer, so they correspond to
            // the first dimension in addressing the triangle array.
            for (int i=0; i < depth; i++)
            {
                GLDrawCmd& cmd = drawCmdBuffer[i];
                cmd.firstIndex    = 0;
                cmd.count         = triCount;
                cmd.baseVertex    = vtxPerInstance * i;
                cmd.baseInstance  = 0;
                cmd.instanceCount = 1;
            }
        }
        else
        {
            // Fill in the range array according to user-given ranges. Triangle IDs point
            // to the input triangle array, NOT index within range, so they correspond to
            // the first dimension in addressing the triangle array.
            for (int i=0, j=0; i < depth; i++)
            {
                GLDrawCmd& cmd = drawCmdBuffer[i];
                int first = rangesPtr[j++];
                int count = rangesPtr[j++];
                NVDR_CHECK(first >= 0 && count >= 0, "range contains negative values");
                NVDR_CHECK((first + count) * 3 <= triCount, "range extends beyond end of triangle buffer");
                cmd.firstIndex    = first * 3;
                cmd.count         = count * 3;
                cmd.baseVertex    = 0;
                cmd.baseInstance  = first; // Triangle ID offset, read via gl_BaseInstanceARB.
                cmd.instanceCount = 1;
            }
        }

        // Draw!
        NVDR_CHECK_GL_ERROR(glMultiDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT, &drawCmdBuffer[0], depth, sizeof(GLDrawCmd)));
    }
}
// Copy the rendered color buffer(s) into the caller-provided Cuda device
// pointers, one tightly packed (width x height x depth) float4 image array per
// enabled output. outputPtr must hold num_outputs valid device pointers.
void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth)
{
    // Copy color buffers to output tensors.
    cudaArray_t array = 0;
    cudaChannelFormatDesc arrayDesc = {};   // For error checking.
    cudaExtent arrayExt = {};               // For error checking.
    int num_outputs = s.enableDB ? 2 : 1;
    NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(num_outputs, s.cudaColorBuffer, stream));
    for (int i=0; i < num_outputs; i++)
    {
        NVDR_CHECK_CUDA_ERROR(cudaGraphicsSubResourceGetMappedArray(&array, s.cudaColorBuffer[i], 0, 0));
        // Sanity-check the mapped array: four 32-bit float channels and an
        // extent at least as large as the requested copy region.
        NVDR_CHECK_CUDA_ERROR(cudaArrayGetInfo(&arrayDesc, &arrayExt, NULL, array));
        NVDR_CHECK(arrayDesc.f == cudaChannelFormatKindFloat, "CUDA mapped array data kind mismatch");
        NVDR_CHECK(arrayDesc.x == 32 && arrayDesc.y == 32 && arrayDesc.z == 32 && arrayDesc.w == 32, "CUDA mapped array data width mismatch");
        NVDR_CHECK(arrayExt.width >= width && arrayExt.height >= height && arrayExt.depth >= depth, "CUDA mapped array extent mismatch");

        // Copy only the requested (width, height, depth) sub-region; the GL
        // buffers may be larger due to the rounding in rasterizeResizeBuffers().
        cudaMemcpy3DParms p = {0};
        p.srcArray = array;
        p.dstPtr.ptr = outputPtr[i];
        p.dstPtr.pitch = width * 4 * sizeof(float);
        p.dstPtr.xsize = width;
        p.dstPtr.ysize = height;
        p.extent.width = width;
        p.extent.height = height;
        p.extent.depth = depth;
        p.kind = cudaMemcpyDeviceToDevice;
        NVDR_CHECK_CUDA_ERROR(cudaMemcpy3DAsync(&p, stream));
    }
    NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(num_outputs, s.cudaColorBuffer, stream));
}
| void rasterizeReleaseBuffers(NVDR_CTX_ARGS, RasterizeGLState& s) | |
| { | |
| int num_outputs = s.enableDB ? 2 : 1; | |
| if (s.cudaPosBuffer) | |
| { | |
| NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPosBuffer)); | |
| s.cudaPosBuffer = 0; | |
| } | |
| if (s.cudaTriBuffer) | |
| { | |
| NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaTriBuffer)); | |
| s.cudaTriBuffer = 0; | |
| } | |
| for (int i=0; i < num_outputs; i++) | |
| { | |
| if (s.cudaColorBuffer[i]) | |
| { | |
| NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaColorBuffer[i])); | |
| s.cudaColorBuffer[i] = 0; | |
| } | |
| } | |
| if (s.cudaPrevOutBuffer) | |
| { | |
| NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPrevOutBuffer)); | |
| s.cudaPrevOutBuffer = 0; | |
| } | |
| } | |
| //------------------------------------------------------------------------ | |