Spaces:
Runtime error
Runtime error
| // Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. | |
| // | |
| // NVIDIA CORPORATION and its licensors retain all intellectual property | |
| // and proprietary rights in and to this software, related documentation | |
| // and any modifications thereto. Any use, reproduction, disclosure or | |
| // distribution of this software and related documentation without an express | |
| // license agreement from NVIDIA CORPORATION is strictly prohibited. | |
| //------------------------------------------------------------------------ | |
| // Helpers. | |
// Round x up so that it is representable with at most y bits of mantissa
// (i.e., at most y significant leading bits). Values below 2^y pass through.
static int ROUND_UP_BITS(uint32_t x, uint32_t y)
{
    // Small values already fit in y bits.
    if (x < (1u << y))
        return x;

    // Find the smallest power of two strictly above the highest set bit of x;
    // (low - 1) is then an all-ones mask covering x.
    uint32_t low = 1u;
    while (x & ~(low - 1u))
        low <<= 1;

    // Keep the top y bit positions; the remaining mask marks bits to drop.
    low = (low - 1u) >> y;

    // If any dropped bit is set, round up by saturating the dropped bits and
    // carrying into the kept ones; otherwise x is already representable.
    return (x & low) ? (int)((x | low) + 1u) : (int)x;
}
| //------------------------------------------------------------------------ | |
| // Draw command struct used by rasterizer. | |
struct GLDrawCmd
{
    // Field order and widths mirror OpenGL's DrawElementsIndirectCommand
    // layout consumed by glMultiDrawElementsIndirect() — do not reorder.
    uint32_t count;         // Number of indices to draw.
    uint32_t instanceCount; // Number of instances (always 1 here).
    uint32_t firstIndex;    // Starting offset into the bound element array.
    uint32_t baseVertex;    // Constant added to each fetched index.
    uint32_t baseInstance;  // Carries the triangle ID offset into the vertex shader via gl_BaseInstanceARB.
};
| //------------------------------------------------------------------------ | |
| // GL helpers. | |
| static void compileGLShader(NVDR_CTX_ARGS, const RasterizeGLState& s, GLuint* pShader, GLenum shaderType, const char* src_buf) | |
| { | |
| std::string src(src_buf); | |
| // Set preprocessor directives. | |
| int n = src.find('\n') + 1; // After first line containing #version directive. | |
| if (s.enableZModify) | |
| src.insert(n, "#define IF_ZMODIFY(x) x\n"); | |
| else | |
| src.insert(n, "#define IF_ZMODIFY(x)\n"); | |
| const char *cstr = src.c_str(); | |
| *pShader = 0; | |
| NVDR_CHECK_GL_ERROR(*pShader = glCreateShader(shaderType)); | |
| NVDR_CHECK_GL_ERROR(glShaderSource(*pShader, 1, &cstr, 0)); | |
| NVDR_CHECK_GL_ERROR(glCompileShader(*pShader)); | |
| } | |
| static void constructGLProgram(NVDR_CTX_ARGS, GLuint* pProgram, GLuint glVertexShader, GLuint glGeometryShader, GLuint glFragmentShader) | |
| { | |
| *pProgram = 0; | |
| GLuint glProgram = 0; | |
| NVDR_CHECK_GL_ERROR(glProgram = glCreateProgram()); | |
| NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glVertexShader)); | |
| NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glGeometryShader)); | |
| NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glFragmentShader)); | |
| NVDR_CHECK_GL_ERROR(glLinkProgram(glProgram)); | |
| GLint linkStatus = 0; | |
| NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_LINK_STATUS, &linkStatus)); | |
| if (!linkStatus) | |
| { | |
| GLint infoLen = 0; | |
| NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_INFO_LOG_LENGTH, &infoLen)); | |
| if (infoLen) | |
| { | |
| const char* hdr = "glLinkProgram() failed:\n"; | |
| std::vector<char> info(strlen(hdr) + infoLen); | |
| strcpy(&info[0], hdr); | |
| NVDR_CHECK_GL_ERROR(glGetProgramInfoLog(glProgram, infoLen, &infoLen, &info[strlen(hdr)])); | |
| NVDR_CHECK(0, &info[0]); | |
| } | |
| NVDR_CHECK(0, "glLinkProgram() failed"); | |
| } | |
| *pProgram = glProgram; | |
| } | |
| //------------------------------------------------------------------------ | |
| // Shared C++ functions. | |
// Initialize the rasterizer's GL state: create a GL context on the given Cuda
// device, compile and link the shader programs, and create the framebuffer,
// vertex array, and buffer/texture objects that remain bound for the lifetime
// of the context. Storage for buffers and textures is allocated later in
// rasterizeResizeBuffers().
void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx)
{
    // Create GL context and set it current.
    s.glctx = createGLContext(cudaDeviceIdx);
    setGLContext(s.glctx);

    // Version check.
    GLint vMajor = 0;
    GLint vMinor = 0;
    glGetIntegerv(GL_MAJOR_VERSION, &vMajor);
    glGetIntegerv(GL_MINOR_VERSION, &vMinor);
    glGetError(); // Clear possible GL_INVALID_ENUM error in version query.
    LOG(INFO) << "OpenGL version reported as " << vMajor << "." << vMinor;
    NVDR_CHECK((vMajor == 4 && vMinor >= 4) || vMajor > 4, "OpenGL 4.4 or later is required");

    // Enable depth modification workaround on A100 and later (compute capability 8+).
    // This activates the IF_ZMODIFY(...) sections in the shaders below.
    int capMajor = 0;
    NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&capMajor, cudaDevAttrComputeCapabilityMajor, cudaDeviceIdx));
    s.enableZModify = (capMajor >= 8);

    // Number of output buffers. A second output carries bary differentials when enabled.
    int num_outputs = s.enableDB ? 2 : 1;

    // Set up vertex shader. Passes position through unchanged, and forwards the
    // draw ID (output layer) and base instance (triangle ID offset) to the
    // geometry shader.
    compileGLShader(NVDR_CTX_PARAMS, s, &s.glVertexShader, GL_VERTEX_SHADER,
        "#version 330\n"
        "#extension GL_ARB_shader_draw_parameters : enable\n"
        STRINGIFY_SHADER_SOURCE(
            layout(location = 0) in vec4 in_pos;
            out int v_layer;
            out int v_offset;
            void main()
            {
                int layer = gl_DrawIDARB;
                gl_Position = in_pos;
                v_layer = layer;
                v_offset = gl_BaseInstanceARB; // Sneak in TriID offset here.
            }
        )
    );

    // Geometry and fragment shaders depend on if bary differential output is enabled or not.
    if (s.enableDB)
    {
        // Set up geometry shader. Calculation of per-pixel bary differentials is based on:
        // u = (u/w) / (1/w)
        // --> du/dX = d((u/w) / (1/w))/dX
        // --> du/dX = [d(u/w)/dX - u*d(1/w)/dX] * w
        // and we know both d(u/w)/dX and d(1/w)/dX are constant over triangle.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glGeometryShader, GL_GEOMETRY_SHADER,
            "#version 430\n"
            STRINGIFY_SHADER_SOURCE(
                layout(triangles) in;
                layout(triangle_strip, max_vertices=3) out;
                layout(location = 0) uniform vec2 vp_scale;
                in int v_layer[];
                in int v_offset[];
                out vec4 var_uvzw;
                out vec4 var_db;
                void main()
                {
                    // Plane equations for bary differentials.
                    float w0 = gl_in[0].gl_Position.w;
                    float w1 = gl_in[1].gl_Position.w;
                    float w2 = gl_in[2].gl_Position.w;
                    vec2 p0 = gl_in[0].gl_Position.xy;
                    vec2 p1 = gl_in[1].gl_Position.xy;
                    vec2 p2 = gl_in[2].gl_Position.xy;
                    vec2 e0 = p0*w2 - p2*w0;
                    vec2 e1 = p1*w2 - p2*w1;
                    float a = e0.x*e1.y - e0.y*e1.x;

                    // Clamp area to an epsilon to avoid arbitrarily high bary differentials.
                    float eps = 1e-6f; // ~1 pixel in 1k x 1k image.
                    float ca = (abs(a) >= eps) ? a : (a < 0.f) ? -eps : eps; // Clamp with sign.
                    float ia = 1.f / ca; // Inverse area.

                    vec2 ascl = ia * vp_scale;
                    float dudx = e1.y * ascl.x;
                    float dudy = -e1.x * ascl.y;
                    float dvdx = -e0.y * ascl.x;
                    float dvdy = e0.x * ascl.y;

                    float duwdx = w2 * dudx;
                    float dvwdx = w2 * dvdx;
                    float duvdx = w0 * dudx + w1 * dvdx;
                    float duwdy = w2 * dudy;
                    float dvwdy = w2 * dvdy;
                    float duvdy = w0 * dudy + w1 * dvdy;

                    vec4 db0 = vec4(duvdx - dvwdx, duvdy - dvwdy, dvwdx, dvwdy);
                    vec4 db1 = vec4(duwdx, duwdy, duvdx - duwdx, duvdy - duwdy);
                    vec4 db2 = vec4(duwdx, duwdy, dvwdx, dvwdy);

                    int layer_id = v_layer[0];
                    int prim_id = gl_PrimitiveIDIn + v_offset[0];

                    gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_db = db0; EmitVertex();
                    gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_db = db1; EmitVertex();
                    gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_db = db2; EmitVertex();
                }
            )
        );

        // Set up fragment shader. Outputs (u, v, z/w, triangle ID) plus the
        // bary differentials scaled back by w.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShader, GL_FRAGMENT_SHADER,
            "#version 430\n"
            STRINGIFY_SHADER_SOURCE(
                in vec4 var_uvzw;
                in vec4 var_db;
                layout(location = 0) out vec4 out_raster;
                layout(location = 1) out vec4 out_db;
                IF_ZMODIFY(
                    layout(location = 1) uniform float in_dummy;
                )
                void main()
                {
                    int id_int = gl_PrimitiveID + 1;
                    float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int);

                    out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, id_float);
                    out_db = var_db * var_uvzw.w;
                    IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;)
                }
            )
        );

        // Set up fragment shader for depth peeling. Same as above, but discards
        // fragments not strictly behind the previous pass's output.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShaderDP, GL_FRAGMENT_SHADER,
            "#version 430\n"
            STRINGIFY_SHADER_SOURCE(
                in vec4 var_uvzw;
                in vec4 var_db;
                layout(binding = 0) uniform sampler2DArray out_prev;
                layout(location = 0) out vec4 out_raster;
                layout(location = 1) out vec4 out_db;
                IF_ZMODIFY(
                    layout(location = 1) uniform float in_dummy;
                )
                void main()
                {
                    int id_int = gl_PrimitiveID + 1;
                    float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int);

                    vec4 prev = texelFetch(out_prev, ivec3(gl_FragCoord.x, gl_FragCoord.y, gl_Layer), 0);
                    float depth_new = var_uvzw.z / var_uvzw.w;
                    if (prev.w == 0 || depth_new <= prev.z)
                        discard;

                    out_raster = vec4(var_uvzw.x, var_uvzw.y, depth_new, id_float);
                    out_db = var_db * var_uvzw.w;
                    IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;)
                }
            )
        );
    }
    else
    {
        // Geometry shader without bary differential output.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glGeometryShader, GL_GEOMETRY_SHADER,
            "#version 330\n"
            STRINGIFY_SHADER_SOURCE(
                layout(triangles) in;
                layout(triangle_strip, max_vertices=3) out;
                in int v_layer[];
                in int v_offset[];
                out vec4 var_uvzw;
                void main()
                {
                    int layer_id = v_layer[0];
                    int prim_id = gl_PrimitiveIDIn + v_offset[0];

                    gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); EmitVertex();
                    gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); EmitVertex();
                    gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); EmitVertex();
                }
            )
        );

        // Fragment shader without bary differential output.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShader, GL_FRAGMENT_SHADER,
            "#version 430\n"
            STRINGIFY_SHADER_SOURCE(
                in vec4 var_uvzw;
                layout(location = 0) out vec4 out_raster;
                IF_ZMODIFY(
                    layout(location = 1) uniform float in_dummy;
                )
                void main()
                {
                    int id_int = gl_PrimitiveID + 1;
                    float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int);

                    out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, id_float);
                    IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;)
                }
            )
        );

        // Depth peeling variant of fragment shader.
        compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShaderDP, GL_FRAGMENT_SHADER,
            "#version 430\n"
            STRINGIFY_SHADER_SOURCE(
                in vec4 var_uvzw;
                layout(binding = 0) uniform sampler2DArray out_prev;
                layout(location = 0) out vec4 out_raster;
                IF_ZMODIFY(
                    layout(location = 1) uniform float in_dummy;
                )
                void main()
                {
                    int id_int = gl_PrimitiveID + 1;
                    float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int);

                    vec4 prev = texelFetch(out_prev, ivec3(gl_FragCoord.x, gl_FragCoord.y, gl_Layer), 0);
                    float depth_new = var_uvzw.z / var_uvzw.w;
                    if (prev.w == 0 || depth_new <= prev.z)
                        discard;

                    out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, id_float);
                    IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;)
                }
            )
        );
    }

    // Finalize programs: normal and depth-peeling variants share the vertex and
    // geometry shaders but use different fragment shaders.
    constructGLProgram(NVDR_CTX_PARAMS, &s.glProgram, s.glVertexShader, s.glGeometryShader, s.glFragmentShader);
    constructGLProgram(NVDR_CTX_PARAMS, &s.glProgramDP, s.glVertexShader, s.glGeometryShader, s.glFragmentShaderDP);

    // Construct main fbo and bind permanently.
    NVDR_CHECK_GL_ERROR(glGenFramebuffers(1, &s.glFBO));
    NVDR_CHECK_GL_ERROR(glBindFramebuffer(GL_FRAMEBUFFER, s.glFBO));

    // Enable two color attachments.
    GLenum draw_buffers[2] = { GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1 };
    NVDR_CHECK_GL_ERROR(glDrawBuffers(num_outputs, draw_buffers));

    // Construct vertex array object.
    NVDR_CHECK_GL_ERROR(glGenVertexArrays(1, &s.glVAO));
    NVDR_CHECK_GL_ERROR(glBindVertexArray(s.glVAO));

    // Construct position buffer, bind permanently, enable, set ptr.
    NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glPosBuffer));
    NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ARRAY_BUFFER, s.glPosBuffer));
    NVDR_CHECK_GL_ERROR(glEnableVertexAttribArray(0));
    NVDR_CHECK_GL_ERROR(glVertexAttribPointer(0, 4, GL_FLOAT, GL_FALSE, 0, 0));

    // Construct index buffer and bind permanently.
    NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glTriBuffer));
    NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, s.glTriBuffer));

    // Set up depth test.
    NVDR_CHECK_GL_ERROR(glEnable(GL_DEPTH_TEST));
    NVDR_CHECK_GL_ERROR(glDepthFunc(GL_LESS));
    NVDR_CHECK_GL_ERROR(glClearDepth(1.0));

    // Create and bind output buffers. Storage is allocated later.
    NVDR_CHECK_GL_ERROR(glGenTextures(num_outputs, s.glColorBuffer));
    for (int i=0; i < num_outputs; i++)
    {
        NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i]));
        NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, s.glColorBuffer[i], 0));
    }

    // Create and bind depth/stencil buffer. Storage is allocated later.
    NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glDepthStencilBuffer));
    NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer));
    NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, s.glDepthStencilBuffer, 0));

    // Create texture name for previous output buffer (depth peeling). Storage
    // is allocated lazily in rasterizeRender() when peeling is first used.
    NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glPrevOutBuffer));
}
// Grow the GL-side vertex/triangle buffers and framebuffer textures so they can
// hold at least the requested counts/dimensions. Sizes only ever grow. Sets
// 'changes' to true if any reallocation happened so the caller can react
// (e.g., re-upload constant data). Relies on the buffer/texture bindings
// established permanently in rasterizeInitGLContext().
void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, bool& changes, int posCount, int triCount, int width, int height, int depth)
{
    changes = false;

    // Resize vertex buffer?
    if (posCount > s.posCount)
    {
        // Must unregister from Cuda before reallocating the GL storage.
        if (s.cudaPosBuffer)
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPosBuffer));

        // Round up to at most 2 mantissa bits of size (min 64) to amortize reallocations.
        s.posCount = (posCount > 64) ? ROUND_UP_BITS(posCount, 2) : 64;
        LOG(INFO) << "Increasing position buffer size to " << s.posCount << " float32";
        NVDR_CHECK_GL_ERROR(glBufferData(GL_ARRAY_BUFFER, s.posCount * sizeof(float), NULL, GL_DYNAMIC_DRAW));
        NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaPosBuffer, s.glPosBuffer, cudaGraphicsRegisterFlagsWriteDiscard));
        changes = true;
    }

    // Resize triangle buffer?
    if (triCount > s.triCount)
    {
        if (s.cudaTriBuffer)
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaTriBuffer));

        s.triCount = (triCount > 64) ? ROUND_UP_BITS(triCount, 2) : 64;
        LOG(INFO) << "Increasing triangle buffer size to " << s.triCount << " int32";
        NVDR_CHECK_GL_ERROR(glBufferData(GL_ELEMENT_ARRAY_BUFFER, s.triCount * sizeof(int32_t), NULL, GL_DYNAMIC_DRAW));
        NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaTriBuffer, s.glTriBuffer, cudaGraphicsRegisterFlagsWriteDiscard));
        changes = true;
    }

    // Resize framebuffer?
    if (width > s.width || height > s.height || depth > s.depth)
    {
        int num_outputs = s.enableDB ? 2 : 1;

        // Unregister all color buffers (and the peeling buffer, whose storage
        // will be recreated lazily at the new size) before reallocating.
        if (s.cudaColorBuffer[0])
            for (int i=0; i < num_outputs; i++)
                NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaColorBuffer[i]));

        if (s.cudaPrevOutBuffer)
        {
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPrevOutBuffer));
            s.cudaPrevOutBuffer = 0;
        }

        // New framebuffer size. Width/height rounded up to multiples of 32.
        s.width  = (width > s.width)   ? width  : s.width;
        s.height = (height > s.height) ? height : s.height;
        s.depth  = (depth > s.depth)   ? depth  : s.depth;
        s.width  = ROUND_UP(s.width, 32);
        s.height = ROUND_UP(s.height, 32);
        LOG(INFO) << "Increasing frame buffer size to (width, height, depth) = (" << s.width << ", " << s.height << ", " << s.depth << ")";

        // Allocate color buffers.
        for (int i=0; i < num_outputs; i++)
        {
            NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i]));
            NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_NEAREST));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_NEAREST));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE));
        }

        // Allocate depth/stencil buffer.
        NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer));
        NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_DEPTH24_STENCIL8, s.width, s.height, s.depth, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 0));

        // (Re-)register all GL buffers into Cuda.
        for (int i=0; i < num_outputs; i++)
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaColorBuffer[i], s.glColorBuffer[i], GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly));

        changes = true;
    }
}
// Render one batch of triangles into the layered framebuffer.
// - posPtr/triPtr: Cuda device pointers to positions (float4 per vertex) and
//   triangle indices. triPtr may be null if triangles were copied previously.
// - rangesPtr: optional host pointer to (start, count) triangle ranges per
//   output layer; null means instanced rendering of the same triangles.
// - peeling_idx: <1 for a normal render / first depth-peeling iteration;
//   >=1 for subsequent peeling passes that read the previous output.
void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx)
{
    // Only copy inputs if we are on first iteration of depth peeling or not doing it at all.
    if (peeling_idx < 1)
    {
        if (triPtr)
        {
            // Copy both position and triangle buffers.
            // NOTE(review): mapping 2 resources via &s.cudaPosBuffer assumes
            // cudaPosBuffer and cudaTriBuffer are adjacent fields in
            // RasterizeGLState — confirm against the struct declaration.
            void* glPosPtr = NULL;
            void* glTriPtr = NULL;
            size_t posBytes = 0;
            size_t triBytes = 0;
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(2, &s.cudaPosBuffer, stream));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glTriPtr, &triBytes, s.cudaTriBuffer));
            NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch");
            NVDR_CHECK(triBytes >= triCount * sizeof(int32_t), "mapped GL triangle buffer size mismatch");
            NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream));
            NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glTriPtr, triPtr, triCount * sizeof(int32_t), cudaMemcpyDeviceToDevice, stream));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(2, &s.cudaPosBuffer, stream));
        }
        else
        {
            // Copy position buffer only. Triangles are already copied and known to be constant.
            void* glPosPtr = NULL;
            size_t posBytes = 0;
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(1, &s.cudaPosBuffer, stream));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer));
            NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch");
            NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(1, &s.cudaPosBuffer, stream));
        }
    }

    // Select program based on whether we have a depth peeling input or not.
    if (peeling_idx < 1)
    {
        // Normal case: No peeling, or peeling disabled.
        NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgram));
    }
    else
    {
        // If we don't have a third buffer yet, create one. Same format and
        // parameters as the color buffers in rasterizeResizeBuffers().
        if (!s.cudaPrevOutBuffer)
        {
            NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glPrevOutBuffer));
            NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_NEAREST));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_NEAREST));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE));
            NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE));
            NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaPrevOutBuffer, s.glPrevOutBuffer, GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly));
        }

        // Swap the GL buffers: previous pass's output becomes the input, its
        // old input becomes the new render target.
        GLuint glTempBuffer = s.glPrevOutBuffer;
        s.glPrevOutBuffer = s.glColorBuffer[0];
        s.glColorBuffer[0] = glTempBuffer;

        // Swap the Cuda buffers to stay in sync with the GL swap above.
        cudaGraphicsResource_t cudaTempBuffer = s.cudaPrevOutBuffer;
        s.cudaPrevOutBuffer = s.cudaColorBuffer[0];
        s.cudaColorBuffer[0] = cudaTempBuffer;

        // Bind the new output buffer.
        NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[0]));
        NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, s.glColorBuffer[0], 0));

        // Bind old buffer as the input texture (sampler binding 0 = out_prev).
        NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glPrevOutBuffer));

        // Activate the correct program.
        NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgramDP));
    }

    // Set viewport, clear color buffer(s) and depth/stencil buffer.
    NVDR_CHECK_GL_ERROR(glViewport(0, 0, width, height));
    NVDR_CHECK_GL_ERROR(glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT));

    // If outputting bary differentials, set resolution uniform (location 0 = vp_scale).
    if (s.enableDB)
        NVDR_CHECK_GL_ERROR(glUniform2f(0, 2.f / (float)width, 2.f / (float)height));

    // Set the dummy uniform (location 1 = in_dummy) if depth modification workaround is active.
    if (s.enableZModify)
        NVDR_CHECK_GL_ERROR(glUniform1f(1, 0.f));

    // Render the meshes.
    if (depth == 1 && !rangesPtr)
    {
        // Trivial case: single layer, all triangles, one direct draw call.
        NVDR_CHECK_GL_ERROR(glDrawElements(GL_TRIANGLES, triCount, GL_UNSIGNED_INT, 0));
    }
    else
    {
        // Populate a buffer for draw commands and execute it.
        std::vector<GLDrawCmd> drawCmdBuffer(depth);
        if (!rangesPtr)
        {
            // Fill in range array to instantiate the same triangles for each output layer.
            // Triangle IDs starts at zero (i.e., one) for each layer, so they correspond to
            // the first dimension in addressing the triangle array.
            for (int i=0; i < depth; i++)
            {
                GLDrawCmd& cmd = drawCmdBuffer[i];
                cmd.firstIndex    = 0;
                cmd.count         = triCount;
                cmd.baseVertex    = vtxPerInstance * i;
                cmd.baseInstance  = 0;
                cmd.instanceCount = 1;
            }
        }
        else
        {
            // Fill in the range array according to user-given ranges. Triangle IDs point
            // to the input triangle array, NOT index within range, so they correspond to
            // the first dimension in addressing the triangle array.
            for (int i=0, j=0; i < depth; i++)
            {
                GLDrawCmd& cmd = drawCmdBuffer[i];
                int first = rangesPtr[j++];
                int count = rangesPtr[j++];
                NVDR_CHECK(first >= 0 && count >= 0, "range contains negative values");
                NVDR_CHECK((first + count) * 3 <= triCount, "range extends beyond end of triangle buffer");
                cmd.firstIndex    = first * 3;
                cmd.count         = count * 3;
                cmd.baseVertex    = 0;
                cmd.baseInstance  = first; // Triangle ID offset, read via gl_BaseInstanceARB.
                cmd.instanceCount = 1;
            }
        }

        // Draw!
        NVDR_CHECK_GL_ERROR(glMultiDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT, &drawCmdBuffer[0], depth, sizeof(GLDrawCmd)));
    }
}
// Copy the rendered color buffer(s) into the caller-provided Cuda device
// pointers, one tightly packed (width x height x depth) float4 image array per
// enabled output. outputPtr must hold num_outputs valid device pointers.
void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth)
{
    // Copy color buffers to output tensors.
    cudaArray_t array = 0;
    cudaChannelFormatDesc arrayDesc = {};   // For error checking.
    cudaExtent arrayExt = {};               // For error checking.
    int num_outputs = s.enableDB ? 2 : 1;
    NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(num_outputs, s.cudaColorBuffer, stream));
    for (int i=0; i < num_outputs; i++)
    {
        NVDR_CHECK_CUDA_ERROR(cudaGraphicsSubResourceGetMappedArray(&array, s.cudaColorBuffer[i], 0, 0));
        // Sanity-check the mapped array: four 32-bit float channels and an
        // extent at least as large as the requested copy region.
        NVDR_CHECK_CUDA_ERROR(cudaArrayGetInfo(&arrayDesc, &arrayExt, NULL, array));
        NVDR_CHECK(arrayDesc.f == cudaChannelFormatKindFloat, "CUDA mapped array data kind mismatch");
        NVDR_CHECK(arrayDesc.x == 32 && arrayDesc.y == 32 && arrayDesc.z == 32 && arrayDesc.w == 32, "CUDA mapped array data width mismatch");
        NVDR_CHECK(arrayExt.width >= width && arrayExt.height >= height && arrayExt.depth >= depth, "CUDA mapped array extent mismatch");

        // Copy only the requested (width, height, depth) sub-region; the GL
        // buffers may be larger due to the rounding in rasterizeResizeBuffers().
        cudaMemcpy3DParms p = {0};
        p.srcArray = array;
        p.dstPtr.ptr = outputPtr[i];
        p.dstPtr.pitch = width * 4 * sizeof(float);
        p.dstPtr.xsize = width;
        p.dstPtr.ysize = height;
        p.extent.width = width;
        p.extent.height = height;
        p.extent.depth = depth;
        p.kind = cudaMemcpyDeviceToDevice;
        NVDR_CHECK_CUDA_ERROR(cudaMemcpy3DAsync(&p, stream));
    }
    NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(num_outputs, s.cudaColorBuffer, stream));
}
| void rasterizeReleaseBuffers(NVDR_CTX_ARGS, RasterizeGLState& s) | |
| { | |
| int num_outputs = s.enableDB ? 2 : 1; | |
| if (s.cudaPosBuffer) | |
| { | |
| NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPosBuffer)); | |
| s.cudaPosBuffer = 0; | |
| } | |
| if (s.cudaTriBuffer) | |
| { | |
| NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaTriBuffer)); | |
| s.cudaTriBuffer = 0; | |
| } | |
| for (int i=0; i < num_outputs; i++) | |
| { | |
| if (s.cudaColorBuffer[i]) | |
| { | |
| NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaColorBuffer[i])); | |
| s.cudaColorBuffer[i] = 0; | |
| } | |
| } | |
| if (s.cudaPrevOutBuffer) | |
| { | |
| NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPrevOutBuffer)); | |
| s.cudaPrevOutBuffer = 0; | |
| } | |
| } | |
| //------------------------------------------------------------------------ | |