geyik1 committed
Commit 111ba01 · verified
1 Parent(s): 207d5db

Upload 174 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +5 -0
  2. 3d.mp4 +3 -0
  3. README.md +11 -0
  4. app.py +2 -0
  5. assets/example_image/assets_example_image_image - 2024-12-08T120910.945.webp +0 -0
  6. assets/example_image/assets_example_image_image - 2024-12-08T133209.680.webp +0 -0
  7. assets/example_image/assets_example_image_image - 2024-12-08T133232.481.webp +0 -0
  8. assets/example_image/assets_example_image_image - 2024-12-08T133327.828.webp +0 -0
  9. assets/example_image/assets_example_image_image - 2024-12-08T133551.674.webp +0 -0
  10. assets/example_image/assets_example_image_image - 2024-12-08T133554.085.webp +0 -0
  11. assets/example_image/assets_example_image_image - 2024-12-08T133942.986.webp +0 -0
  12. assets/example_image/assets_example_image_image - 2024-12-08T133945.143.webp +0 -0
  13. assets/example_image/assets_example_image_image - 2024-12-08T134251.217.webp +0 -0
  14. assets/example_image/assets_example_image_image - 2024-12-08T134253.975.webp +0 -0
  15. assets/example_image/assets_example_image_image - 2024-12-08T134602.793.webp +0 -0
  16. assets/example_image/assets_example_image_image - 2024-12-08T134606.919.webp +0 -0
  17. assets/example_image/assets_example_image_image - 2024-12-09T050638.566.webp +0 -0
  18. assets/example_image/assets_example_image_image - 2024-12-09T102148.803.webp +0 -0
  19. assets/example_image/assets_example_image_image - 2024-12-09T124050.873.webp +0 -0
  20. assets/example_image/assets_example_image_image - 2024-12-09T125348.492.webp +0 -0
  21. assets/example_image/assets_example_image_image - 2024-12-09T125709.810.webp +0 -0
  22. assets/example_image/assets_example_image_image - 2024-12-09T125745.419.webp +0 -0
  23. assets/example_image/assets_example_image_image - 2024-12-09T131128.626.webp +0 -0
  24. assets/example_image/assets_example_image_image - 2024-12-09T174905.915.webp +0 -0
  25. assets/example_image/assets_example_image_image - 2024-12-09T184202.582.webp +0 -0
  26. assets/example_image/assets_example_image_image - 2024-12-09T184251.254.webp +3 -0
  27. assets/example_image/assets_example_image_image - 2024-12-09T184336.200.webp +0 -0
  28. assets/example_image/assets_example_image_image - 2024-12-09T184407.431.webp +0 -0
  29. assets/example_image/assets_example_image_image - 2024-12-09T184511.907.webp +3 -0
  30. assets/example_image/assets_example_image_image - 2024-12-09T184535.205.webp +0 -0
  31. assets/example_image/assets_example_image_image - 2024-12-09T184804.224.webp +0 -0
  32. assets/example_image/assets_example_image_image - 2024-12-10T033838.708.webp +0 -0
  33. assets/example_image/assets_example_image_image - 2024-12-10T034054.527.webp +0 -0
  34. assets/example_image/assets_example_image_image - 2024-12-10T034505.337.webp +0 -0
  35. extensions/extensions_nvdiffrast_LICENSE.txt +97 -0
  36. extensions/extensions_nvdiffrast_README.md +42 -0
  37. extensions/extensions_nvdiffrast_run_sample.sh +52 -0
  38. extensions/extensions_nvdiffrast_setup copy.py +51 -0
  39. extensions/extensions_nvdiffrast_setup.py +82 -0
  40. extensions/nvdiffrast/common/cudaraster/extensions_nvdiffrast_nvdiffrast_common_cudaraster_CudaRaster.hpp +63 -0
  41. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_BinRaster.inl +423 -0
  42. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.cpp +94 -0
  43. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.hpp +55 -0
  44. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CoarseRaster.inl +730 -0
  45. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Constants.hpp +73 -0
  46. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CudaRaster.cpp +79 -0
  47. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Defs.hpp +90 -0
  48. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_FineRaster.inl +385 -0
  49. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_PrivateDefs.hpp +153 -0
  50. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_RasterImpl.cpp +370 -0
.gitattributes ADDED
@@ -0,0 +1,5 @@
+ 3d.mp4 filter=lfs diff=lfs merge=lfs -text
+ assets/example_image/assets_example_image_image[[:space:]]-[[:space:]]2024-12-09T184251.254.webp filter=lfs diff=lfs merge=lfs -text
+ assets/example_image/assets_example_image_image[[:space:]]-[[:space:]]2024-12-09T184511.907.webp filter=lfs diff=lfs merge=lfs -text
+ wheels/nvdiffrast-0.3.3-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
+ wheels/wheels_diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
3d.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c3282465210bac76f44b605956139679ed774c8bad9be686707d1b770961371
+ size 21309978
README.md ADDED
@@ -0,0 +1,11 @@
+ ---
+ title: SORA 3D
+ emoji: 🏢🏆
+ colorFrom: indigo
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 4.44.1
+ app_file: app.py
+ pinned: false
+ short_description: Create top-quality 3D(.GLB) models from text or images
+ ---
app.py ADDED
@@ -0,0 +1,2 @@
+ import os
+ exec(os.environ.get('APP'))
assets/example_image/assets_example_image_image - 2024-12-08T120910.945.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T133209.680.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T133232.481.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T133327.828.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T133551.674.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T133554.085.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T133942.986.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T133945.143.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T134251.217.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T134253.975.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T134602.793.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T134606.919.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T050638.566.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T102148.803.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T124050.873.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T125348.492.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T125709.810.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T125745.419.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T131128.626.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T174905.915.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T184202.582.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T184251.254.webp ADDED

Git LFS Details

  • SHA256: 04a741b7588b46f6f885987fa3330d51f671d7f372eedf3cc007e69fd1a2e3e9
  • Pointer size: 131 Bytes
  • Size of remote file: 113 kB
assets/example_image/assets_example_image_image - 2024-12-09T184336.200.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T184407.431.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T184511.907.webp ADDED

Git LFS Details

  • SHA256: f5cbfa61ca24164cafbd695aa6f12b617196a64f913e5c9964fad60a74dedda6
  • Pointer size: 131 Bytes
  • Size of remote file: 101 kB
assets/example_image/assets_example_image_image - 2024-12-09T184535.205.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T184804.224.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-10T033838.708.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-10T034054.527.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-10T034505.337.webp ADDED
extensions/extensions_nvdiffrast_LICENSE.txt ADDED
@@ -0,0 +1,97 @@
1
+ Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
2
+
3
+
4
+ Nvidia Source Code License (1-Way Commercial)
5
+
6
+ =======================================================================
7
+
8
+ 1. Definitions
9
+
10
+ "Licensor" means any person or entity that distributes its Work.
11
+
12
+ "Software" means the original work of authorship made available under
13
+ this License.
14
+
15
+ "Work" means the Software and any additions to or derivative works of
16
+ the Software that are made available under this License.
17
+
18
+ The terms "reproduce," "reproduction," "derivative works," and
19
+ "distribution" have the meaning as provided under U.S. copyright law;
20
+ provided, however, that for the purposes of this License, derivative
21
+ works shall not include works that remain separable from, or merely
22
+ link (or bind by name) to the interfaces of, the Work.
23
+
24
+ Works, including the Software, are "made available" under this License
25
+ by including in or with the Work either (a) a copyright notice
26
+ referencing the applicability of this License to the Work, or (b) a
27
+ copy of this License.
28
+
29
+ 2. License Grants
30
+
31
+ 2.1 Copyright Grant. Subject to the terms and conditions of this
32
+ License, each Licensor grants to you a perpetual, worldwide,
33
+ non-exclusive, royalty-free, copyright license to reproduce,
34
+ prepare derivative works of, publicly display, publicly perform,
35
+ sublicense and distribute its Work and any resulting derivative
36
+ works in any form.
37
+
38
+ 3. Limitations
39
+
40
+ 3.1 Redistribution. You may reproduce or distribute the Work only
41
+ if (a) you do so under this License, (b) you include a complete
42
+ copy of this License with your distribution, and (c) you retain
43
+ without modification any copyright, patent, trademark, or
44
+ attribution notices that are present in the Work.
45
+
46
+ 3.2 Derivative Works. You may specify that additional or different
47
+ terms apply to the use, reproduction, and distribution of your
48
+ derivative works of the Work ("Your Terms") only if (a) Your Terms
49
+ provide that the use limitation in Section 3.3 applies to your
50
+ derivative works, and (b) you identify the specific derivative
51
+ works that are subject to Your Terms. Notwithstanding Your Terms,
52
+ this License (including the redistribution requirements in Section
53
+ 3.1) will continue to apply to the Work itself.
54
+
55
+ 3.3 Use Limitation. The Work and any derivative works thereof only
56
+ may be used or intended for use non-commercially. The Work or
57
+ derivative works thereof may be used or intended for use by Nvidia
58
+ or its affiliates commercially or non-commercially. As used herein,
59
+ "non-commercially" means for research or evaluation purposes only
60
+ and not for any direct or indirect monetary gain.
61
+
62
+ 3.4 Patent Claims. If you bring or threaten to bring a patent claim
63
+ against any Licensor (including any claim, cross-claim or
64
+ counterclaim in a lawsuit) to enforce any patents that you allege
65
+ are infringed by any Work, then your rights under this License from
66
+ such Licensor (including the grant in Section 2.1) will terminate
67
+ immediately.
68
+
69
+ 3.5 Trademarks. This License does not grant any rights to use any
70
+ Licensor's or its affiliates' names, logos, or trademarks, except
71
+ as necessary to reproduce the notices described in this License.
72
+
73
+ 3.6 Termination. If you violate any term of this License, then your
74
+ rights under this License (including the grant in Section 2.1) will
75
+ terminate immediately.
76
+
77
+ 4. Disclaimer of Warranty.
78
+
79
+ THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
80
+ KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
81
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
82
+ NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
83
+ THIS LICENSE.
84
+
85
+ 5. Limitation of Liability.
86
+
87
+ EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
88
+ THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
89
+ SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
90
+ INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
91
+ OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
92
+ (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
93
+ LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
94
+ COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
95
+ THE POSSIBILITY OF SUCH DAMAGES.
96
+
97
+ =======================================================================
extensions/extensions_nvdiffrast_README.md ADDED
@@ -0,0 +1,42 @@
+ ## Nvdiffrast – Modular Primitives for High-Performance Differentiable Rendering
+
+ ![Teaser image](./docs/img/teaser.png)
+
+ **Modular Primitives for High-Performance Differentiable Rendering**<br>
+ Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila<br>
+ [http://arxiv.org/abs/2011.03277](http://arxiv.org/abs/2011.03277)
+
+ Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering.
+ Please refer to &#x261E;&#x261E; [nvdiffrast documentation](https://nvlabs.github.io/nvdiffrast) &#x261C;&#x261C; for more information.
+
+ ## Licenses
+
+ Copyright &copy; 2020&ndash;2024, NVIDIA Corporation. All rights reserved.
+
+ This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt).
+
+ For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/)
+
+ We do not currently accept outside code contributions in the form of pull requests.
+
+ Environment map stored as part of `samples/data/envphong.npz` is derived from a Wave Engine
+ [sample material](https://github.com/WaveEngine/Samples-2.5/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap)
+ originally shared under
+ [MIT License](https://github.com/WaveEngine/Samples-2.5/blob/master/LICENSE.md).
+ Mesh and texture stored as part of `samples/data/earth.npz` are derived from
+ [3D Earth Photorealistic 2K](https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125)
+ model originally made available under
+ [TurboSquid 3D Model License](https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license).
+
+ ## Citation
+
+ ```
+ @article{Laine2020diffrast,
+   title   = {Modular Primitives for High-Performance Differentiable Rendering},
+   author  = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila},
+   journal = {ACM Transactions on Graphics},
+   year    = {2020},
+   volume  = {39},
+   number  = {6}
+ }
+ ```
extensions/extensions_nvdiffrast_run_sample.sh ADDED
@@ -0,0 +1,52 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
6
+ # and proprietary rights in and to this software, related documentation
7
+ # and any modifications thereto. Any use, reproduction, disclosure or
8
+ # distribution of this software and related documentation without an express
9
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
10
+
11
+ function print_help {
12
+ echo "Usage: `basename $0` [--build-container] <python_file>"
13
+ echo ""
14
+ echo "Option --build-container will build the Docker container based on"
15
+ echo "docker/Dockerfile and tag the image with gltorch:latest."
16
+ echo ""
17
+ echo "Example: `basename $0` samples/torch/envphong.py"
18
+ }
19
+
20
+ build_container=0
21
+ sample=""
22
+ while [[ "$#" -gt 0 ]]; do
23
+ case $1 in
24
+ --build-container) build_container=1;;
25
+ -h|--help) print_help; exit 0 ;;
26
+ --*) echo "Unknown parameter passed: $1"; exit 1 ;;
27
+ *) sample="$1"; shift; break;
28
+ esac
29
+ shift
30
+ done
31
+
32
+ rest=$@
33
+
34
+ # Build the docker container
35
+ if [ "$build_container" = "1" ]; then
36
+ docker build --tag gltorch:latest -f docker/Dockerfile .
37
+ fi
38
+
39
+ if [ ! -f "$sample" ]; then
40
+ echo
41
+ echo "No python sample given or file '$sample' not found. Exiting."
42
+ exit 1
43
+ fi
44
+
45
+ image="gltorch:latest"
46
+
47
+ echo "Using container image: $image"
48
+ echo "Running command: $sample $rest"
49
+
50
+ # Run a sample with docker
51
+ docker run --rm -it --gpus all --user $(id -u):$(id -g) \
52
+ -v `pwd`:/app --workdir /app -e TORCH_EXTENSIONS_DIR=/app/tmp $image python3 $sample $rest
extensions/extensions_nvdiffrast_setup copy.py ADDED
@@ -0,0 +1,51 @@
1
+ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ import nvdiffrast
10
+ import setuptools
11
+ import os
12
+
13
+ with open("README.md", "r") as fh:
14
+ long_description = fh.read()
15
+
16
+ setuptools.setup(
17
+ name="nvdiffrast",
18
+ version=nvdiffrast.__version__,
19
+ author="Samuli Laine",
20
+ author_email="slaine@nvidia.com",
21
+ description="nvdiffrast - modular primitives for high-performance differentiable rendering",
22
+ long_description=long_description,
23
+ long_description_content_type="text/markdown",
24
+ url="https://github.com/NVlabs/nvdiffrast",
25
+ packages=setuptools.find_packages(),
26
+ package_data={
27
+ 'nvdiffrast': [
28
+ 'common/*.h',
29
+ 'common/*.inl',
30
+ 'common/*.cu',
31
+ 'common/*.cpp',
32
+ 'common/cudaraster/*.hpp',
33
+ 'common/cudaraster/impl/*.cpp',
34
+ 'common/cudaraster/impl/*.hpp',
35
+ 'common/cudaraster/impl/*.inl',
36
+ 'common/cudaraster/impl/*.cu',
37
+ 'lib/*.h',
38
+ 'torch/*.h',
39
+ 'torch/*.inl',
40
+ 'torch/*.cpp',
41
+ 'tensorflow/*.cu',
42
+ ] + (['lib/*.lib'] if os.name == 'nt' else [])
43
+ },
44
+ include_package_data=True,
45
+ install_requires=['numpy'], # note: can't require torch here as it will install torch even for a TensorFlow container
46
+ classifiers=[
47
+ "Programming Language :: Python :: 3",
48
+ "Operating System :: OS Independent",
49
+ ],
50
+ python_requires='>=3.6',
51
+ )
extensions/extensions_nvdiffrast_setup.py ADDED
@@ -0,0 +1,82 @@
1
+ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ import nvdiffrast
10
+ import setuptools
11
+ import os
12
+ from torch.utils.cpp_extension import CUDAExtension, BuildExtension
13
+
14
+
15
+ with open("README.md", "r") as fh:
16
+ long_description = fh.read()
17
+
18
+ setuptools.setup(
19
+ name="nvdiffrast",
20
+ version=nvdiffrast.__version__,
21
+ author="Samuli Laine",
22
+ author_email="slaine@nvidia.com",
23
+ description="nvdiffrast - modular primitives for high-performance differentiable rendering",
24
+ long_description=long_description,
25
+ long_description_content_type="text/markdown",
26
+ url="https://github.com/NVlabs/nvdiffrast",
27
+ packages=setuptools.find_packages(),
28
+ # package_data={
29
+ # 'nvdiffrast': [
30
+ # 'common/*.h',
31
+ # 'common/*.inl',
32
+ # 'common/*.cu',
33
+ # 'common/*.cpp',
34
+ # 'common/cudaraster/*.hpp',
35
+ # 'common/cudaraster/impl/*.cpp',
36
+ # 'common/cudaraster/impl/*.hpp',
37
+ # 'common/cudaraster/impl/*.inl',
38
+ # 'common/cudaraster/impl/*.cu',
39
+ # 'lib/*.h',
40
+ # 'torch/*.h',
41
+ # 'torch/*.inl',
42
+ # 'torch/*.cpp',
43
+ # 'tensorflow/*.cu',
44
+ # ] + (['lib/*.lib'] if os.name == 'nt' else [])
45
+ # },
46
+ # include_package_data=True,
47
+ ext_modules=[
48
+ CUDAExtension(
49
+ name="nvdiffrast.torch._C",
50
+ sources=[
51
+ 'nvdiffrast/common/cudaraster/impl/Buffer.cpp',
52
+ 'nvdiffrast/common/cudaraster/impl/CudaRaster.cpp',
53
+ 'nvdiffrast/common/cudaraster/impl/RasterImpl_.cu',
54
+ 'nvdiffrast/common/cudaraster/impl/RasterImpl.cpp',
55
+ 'nvdiffrast/common/common.cpp',
56
+ 'nvdiffrast/common/rasterize.cu',
57
+ 'nvdiffrast/common/interpolate.cu',
58
+ 'nvdiffrast/common/texture_.cu',
59
+ 'nvdiffrast/common/texture.cpp',
60
+ 'nvdiffrast/common/antialias.cu',
61
+ 'nvdiffrast/torch/torch_bindings.cpp',
62
+ 'nvdiffrast/torch/torch_rasterize.cpp',
63
+ 'nvdiffrast/torch/torch_interpolate.cpp',
64
+ 'nvdiffrast/torch/torch_texture.cpp',
65
+ 'nvdiffrast/torch/torch_antialias.cpp',
66
+ ],
67
+ extra_compile_args={
68
+ 'cxx': ['-DNVDR_TORCH'],
69
+ 'nvcc': ['-DNVDR_TORCH', '-lineinfo'],
70
+ },
71
+ )
72
+ ],
73
+ cmdclass={
74
+ 'build_ext': BuildExtension
75
+ },
76
+ install_requires=['numpy'], # note: can't require torch here as it will install torch even for a TensorFlow container
77
+ classifiers=[
78
+ "Programming Language :: Python :: 3",
79
+ "Operating System :: OS Independent",
80
+ ],
81
+ python_requires='>=3.6',
82
+ )
extensions/nvdiffrast/common/cudaraster/extensions_nvdiffrast_nvdiffrast_common_cudaraster_CudaRaster.hpp ADDED
@@ -0,0 +1,63 @@
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+ //
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
+ // and proprietary rights in and to this software, related documentation
+ // and any modifications thereto. Any use, reproduction, disclosure or
+ // distribution of this software and related documentation without an express
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ #pragma once
+
+ //------------------------------------------------------------------------
+ // This is a slimmed-down and modernized version of the original
+ // CudaRaster codebase that accompanied the HPG 2011 paper
+ // "High-Performance Software Rasterization on GPUs" by Laine and Karras.
+ // Modifications have been made to accommodate post-Volta execution model
+ // with warp divergence. Support for shading, blending, quad rendering,
+ // and supersampling have been removed as unnecessary for nvdiffrast.
+ //------------------------------------------------------------------------
+
+ namespace CR
+ {
+
+ class RasterImpl;
+
+ //------------------------------------------------------------------------
+ // Interface class to isolate user from implementation details.
+ //------------------------------------------------------------------------
+
+ class CudaRaster
+ {
+ public:
+     enum
+     {
+         RenderModeFlag_EnableBackfaceCulling = 1 << 0,  // Enable backface culling.
+         RenderModeFlag_EnableDepthPeeling    = 1 << 1,  // Enable depth peeling. Must have a peel buffer set.
+     };
+
+ public:
+     CudaRaster              (void);
+     ~CudaRaster             (void);
+
+     void  setBufferSize     (int width, int height, int numImages);            // Width and height are internally rounded up to multiples of tile size (8x8) for buffer sizes.
+     void  setViewport       (int width, int height, int offsetX, int offsetY); // Tiled rendering viewport setup.
+     void  setRenderModeFlags(unsigned int renderModeFlags);                    // Affects all subsequent calls to drawTriangles(). Defaults to zero.
+     void  deferredClear     (unsigned int clearColor);                         // Clears color and depth buffers during next call to drawTriangles().
+     void  setVertexBuffer   (void* vertices, int numVertices);                 // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w).
+     void  setIndexBuffer    (void* indices, int numTriangles);                 // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
+     bool  drawTriangles     (const int* ranges, bool peel, cudaStream_t stream); // Ranges (offsets and counts) as #triangles entries, not as bytes. If NULL, draw all triangles. Returns false in case of internal overflow.
+     void* getColorBuffer    (void);                                            // GPU pointer managed by CudaRaster.
+     void* getDepthBuffer    (void);                                            // GPU pointer managed by CudaRaster.
+     void  swapDepthAndPeel  (void);                                            // Swap depth and peeling buffers.
+
+ private:
+     CudaRaster           (const CudaRaster&); // forbidden
+     CudaRaster& operator=(const CudaRaster&); // forbidden
+
+ private:
+     RasterImpl* m_impl;  // Opaque pointer to implementation.
+ };
+
+ //------------------------------------------------------------------------
+ } // namespace CR
+
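The inline comments in `CudaRaster.hpp` above spell out the intended call order: size the buffers, set the viewport and render-mode flags, hand over caller-managed vertex/index buffers, then call `drawTriangles()` and read back the color/depth pointers. Below is a minimal host-side sketch of that sequence, assuming the CUDA runtime headers and a single-image batch; it is illustrative only and not part of this commit.

```cpp
// Minimal usage sketch (assumption, not code from this commit): drive CR::CudaRaster
// following the call order documented in CudaRaster.hpp.
#include <cuda_runtime.h>
#include "CudaRaster.hpp"

void rasterizeOnce(const float4* d_clipVerts, int numVerts,
                   const uint4* d_tris, int numTris,
                   int width, int height, cudaStream_t stream)
{
    CR::CudaRaster cr;

    // Buffers are rounded up internally to 8x8 tiles; one image in the batch.
    cr.setBufferSize(width, height, /*numImages=*/1);
    cr.setViewport(width, height, /*offsetX=*/0, /*offsetY=*/0);
    cr.setRenderModeFlags(CR::CudaRaster::RenderModeFlag_EnableBackfaceCulling);
    cr.deferredClear(0u);                        // clear color/depth on the next draw

    // Caller-managed GPU pointers: clip-space float4 positions, uint4 triangles.
    cr.setVertexBuffer((void*)d_clipVerts, numVerts);
    cr.setIndexBuffer((void*)d_tris, numTris);

    // ranges == NULL draws all triangles; false return signals internal overflow.
    if (!cr.drawTriangles(nullptr, /*peel=*/false, stream))
    {
        // A real caller would grow internal buffers and retry here.
    }

    void* d_color = cr.getColorBuffer();         // owned by CudaRaster
    void* d_depth = cr.getDepthBuffer();         // owned by CudaRaster
    (void)d_color; (void)d_depth;
}
```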
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_BinRaster.inl ADDED
@@ -0,0 +1,423 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ //------------------------------------------------------------------------
10
+
11
+ __device__ __inline__ void binRasterImpl(const CRParams p)
12
+ {
13
+ __shared__ volatile U32 s_broadcast [CR_BIN_WARPS + 16];
14
+ __shared__ volatile S32 s_outOfs [CR_MAXBINS_SQR];
15
+ __shared__ volatile S32 s_outTotal [CR_MAXBINS_SQR];
16
+ __shared__ volatile S32 s_overIndex [CR_MAXBINS_SQR];
17
+ __shared__ volatile S32 s_outMask [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
18
+ __shared__ volatile S32 s_outCount [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
19
+ __shared__ volatile S32 s_triBuf [CR_BIN_WARPS*32*4]; // triangle ring buffer
20
+ __shared__ volatile U32 s_batchPos;
21
+ __shared__ volatile U32 s_bufCount;
22
+ __shared__ volatile U32 s_overTotal;
23
+ __shared__ volatile U32 s_allocBase;
24
+
25
+ const CRImageParams& ip = getImageParams(p, blockIdx.z);
26
+ CRAtomics& atomics = p.atomics[blockIdx.z];
27
+ const U8* triSubtris = (const U8*)p.triSubtris + p.maxSubtris * blockIdx.z;
28
+ const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
29
+
30
+ S32* binFirstSeg = (S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
31
+ S32* binTotal = (S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
32
+ S32* binSegData = (S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
33
+ S32* binSegNext = (S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
34
+ S32* binSegCount = (S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
35
+
36
+ if (atomics.numSubtris > p.maxSubtris)
37
+ return;
38
+
39
+ // per-thread state
40
+ int thrInBlock = threadIdx.x + threadIdx.y * 32;
41
+ int batchPos = 0;
42
+
43
+ // first 16 elements of s_broadcast are always zero
44
+ if (thrInBlock < 16)
45
+ s_broadcast[thrInBlock] = 0;
46
+
47
+ // initialize output linked lists and offsets
48
+ if (thrInBlock < p.numBins)
49
+ {
50
+ binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = -1;
51
+ s_outOfs[thrInBlock] = -CR_BIN_SEG_SIZE;
52
+ s_outTotal[thrInBlock] = 0;
53
+ }
54
+
55
+ // repeat until done
56
+ for(;;)
57
+ {
58
+ // get batch
59
+ if (thrInBlock == 0)
60
+ s_batchPos = atomicAdd(&atomics.binCounter, ip.binBatchSize);
61
+ __syncthreads();
62
+ batchPos = s_batchPos;
63
+
64
+ // all batches done?
65
+ if (batchPos >= ip.triCount)
66
+ break;
67
+
68
+ // per-thread state
69
+ int bufIndex = 0;
70
+ int bufCount = 0;
71
+ int batchEnd = min(batchPos + ip.binBatchSize, ip.triCount);
72
+
73
+ // loop over batch as long as we have triangles in it
74
+ do
75
+ {
76
+ // read more triangles
77
+ while (bufCount < CR_BIN_WARPS*32 && batchPos < batchEnd)
78
+ {
79
+ // get subtriangle count
80
+
81
+ int triIdx = batchPos + thrInBlock;
82
+ int num = 0;
83
+ if (triIdx < batchEnd)
84
+ num = triSubtris[triIdx];
85
+
86
+ // cumulative sum of subtriangles within each warp
87
+ U32 myIdx = __popc(__ballot_sync(~0u, num & 1) & getLaneMaskLt());
88
+ if (__any_sync(~0u, num > 1))
89
+ {
90
+ myIdx += __popc(__ballot_sync(~0u, num & 2) & getLaneMaskLt()) * 2;
91
+ myIdx += __popc(__ballot_sync(~0u, num & 4) & getLaneMaskLt()) * 4;
92
+ }
93
+ if (threadIdx.x == 31) // Do not assume that last thread in warp wins the write.
94
+ s_broadcast[threadIdx.y + 16] = myIdx + num;
95
+ __syncthreads();
96
+
97
+ // cumulative sum of per-warp subtriangle counts
98
+ // Note: cannot have more than 32 warps or this needs to sync between each step.
99
+ bool act = (thrInBlock < CR_BIN_WARPS);
100
+ U32 actMask = __ballot_sync(~0u, act);
101
+ if (threadIdx.y == 0 && act)
102
+ {
103
+ volatile U32* ptr = &s_broadcast[thrInBlock + 16];
104
+ U32 val = *ptr;
105
+ #if (CR_BIN_WARPS > 1)
106
+ val += ptr[-1]; __syncwarp(actMask);
107
+ *ptr = val; __syncwarp(actMask);
108
+ #endif
109
+ #if (CR_BIN_WARPS > 2)
110
+ val += ptr[-2]; __syncwarp(actMask);
111
+ *ptr = val; __syncwarp(actMask);
112
+ #endif
113
+ #if (CR_BIN_WARPS > 4)
114
+ val += ptr[-4]; __syncwarp(actMask);
115
+ *ptr = val; __syncwarp(actMask);
116
+ #endif
117
+ #if (CR_BIN_WARPS > 8)
118
+ val += ptr[-8]; __syncwarp(actMask);
119
+ *ptr = val; __syncwarp(actMask);
120
+ #endif
121
+ #if (CR_BIN_WARPS > 16)
122
+ val += ptr[-16]; __syncwarp(actMask);
123
+ *ptr = val; __syncwarp(actMask);
124
+ #endif
125
+
126
+ // initially assume that we consume everything
127
+ // only last active thread does the writes
128
+ if (threadIdx.x == CR_BIN_WARPS - 1)
129
+ {
130
+ s_batchPos = batchPos + CR_BIN_WARPS * 32;
131
+ s_bufCount = bufCount + val;
132
+ }
133
+ }
134
+ __syncthreads();
135
+
136
+ // skip if no subtriangles
137
+ if (num)
138
+ {
139
+ // calculate write position for first subtriangle
140
+ U32 pos = bufCount + myIdx + s_broadcast[threadIdx.y + 16 - 1];
141
+
142
+ // only write if entire triangle fits
143
+ if (pos + num <= CR_ARRAY_SIZE(s_triBuf))
144
+ {
145
+ pos += bufIndex; // adjust for current start position
146
+ pos &= CR_ARRAY_SIZE(s_triBuf)-1;
147
+ if (num == 1)
148
+ s_triBuf[pos] = triIdx * 8 + 7; // single triangle
149
+ else
150
+ {
151
+ for (int i=0; i < num; i++)
152
+ {
153
+ s_triBuf[pos] = triIdx * 8 + i;
154
+ pos++;
155
+ pos &= CR_ARRAY_SIZE(s_triBuf)-1;
156
+ }
157
+ }
158
+ } else if (pos <= CR_ARRAY_SIZE(s_triBuf))
159
+ {
160
+ // this triangle is the first that failed, overwrite total count and triangle count
161
+ s_batchPos = batchPos + thrInBlock;
162
+ s_bufCount = pos;
163
+ }
164
+ }
165
+
166
+ // update triangle counts
167
+ __syncthreads();
168
+ batchPos = s_batchPos;
169
+ bufCount = s_bufCount;
170
+ }
171
+
172
+ // make every warp clear its output buffers
173
+ for (int i=threadIdx.x; i < p.numBins; i += 32)
174
+ s_outMask[threadIdx.y][i] = 0;
175
+ __syncwarp();
176
+
177
+ // choose our triangle
178
+ uint4 triData = make_uint4(0, 0, 0, 0);
179
+ if (thrInBlock < bufCount)
180
+ {
181
+ U32 triPos = bufIndex + thrInBlock;
182
+ triPos &= CR_ARRAY_SIZE(s_triBuf)-1;
183
+
184
+ // find triangle
185
+ int triIdx = s_triBuf[triPos];
186
+ int dataIdx = triIdx >> 3;
187
+ int subtriIdx = triIdx & 7;
188
+ if (subtriIdx != 7)
189
+ dataIdx = triHeader[dataIdx].misc + subtriIdx;
190
+
191
+ // read triangle
192
+
193
+ triData = *(((const uint4*)triHeader) + dataIdx);
194
+ }
195
+
196
+ // setup bounding box and edge functions, and rasterize
197
+ S32 lox, loy, hix, hiy;
198
+ bool hasTri = (thrInBlock < bufCount);
199
+ U32 hasTriMask = __ballot_sync(~0u, hasTri);
200
+ if (hasTri)
201
+ {
202
+ S32 v0x = add_s16lo_s16lo(triData.x, p.widthPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
203
+ S32 v0y = add_s16hi_s16lo(triData.x, p.heightPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
204
+ S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
205
+ S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
206
+ S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
207
+ S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
208
+ int binLog = CR_BIN_LOG2 + CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
209
+ lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
210
+ loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
211
+ hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
212
+ hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
213
+
214
+ U32 bit = 1 << threadIdx.x;
215
+ #if __CUDA_ARCH__ >= 700
216
+ bool multi = (hix != lox || hiy != loy);
217
+ if (!__any_sync(hasTriMask, multi))
218
+ {
219
+ int binIdx = lox + p.widthBins * loy;
220
+ U32 mask = __match_any_sync(hasTriMask, binIdx);
221
+ s_outMask[threadIdx.y][binIdx] = mask;
222
+ __syncwarp(hasTriMask);
223
+ } else
224
+ #endif
225
+ {
226
+ bool complex = (hix > lox+1 || hiy > loy+1);
227
+ if (!__any_sync(hasTriMask, complex))
228
+ {
229
+ int binIdx = lox + p.widthBins * loy;
230
+ atomicOr((U32*)&s_outMask[threadIdx.y][binIdx], bit);
231
+ if (hix > lox) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + 1], bit);
232
+ if (hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins], bit);
233
+ if (hix > lox && hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins + 1], bit);
234
+ } else
235
+ {
236
+ S32 d12x = d02x - d01x, d12y = d02y - d01y;
237
+ v0x -= lox << binLog, v0y -= loy << binLog;
238
+
239
+ S32 t01 = v0x * d01y - v0y * d01x;
240
+ S32 t02 = v0y * d02x - v0x * d02y;
241
+ S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
242
+ S32 b01 = add_sub(t01 >> binLog, max(d01x, 0), min(d01y, 0));
243
+ S32 b02 = add_sub(t02 >> binLog, max(d02y, 0), min(d02x, 0));
244
+ S32 b12 = add_sub(t12 >> binLog, max(d12x, 0), min(d12y, 0));
245
+
246
+ int width = hix - lox + 1;
247
+ d01x += width * d01y;
248
+ d02x += width * d02y;
249
+ d12x += width * d12y;
250
+
251
+ U8* currPtr = (U8*)&s_outMask[threadIdx.y][lox + loy * p.widthBins];
252
+ U8* skipPtr = (U8*)&s_outMask[threadIdx.y][(hix + 1) + loy * p.widthBins];
253
+ U8* endPtr = (U8*)&s_outMask[threadIdx.y][lox + (hiy + 1) * p.widthBins];
254
+ int stride = p.widthBins * 4;
255
+ int ptrYInc = stride - width * 4;
256
+
257
+ do
258
+ {
259
+ if (b01 >= 0 && b02 >= 0 && b12 >= 0)
260
+ atomicOr((U32*)currPtr, bit);
261
+ currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
262
+ if (currPtr == skipPtr)
263
+ currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += stride;
264
+ }
265
+ while (currPtr != endPtr);
266
+ }
267
+ }
268
+ }
269
+
270
+ // count per-bin contributions
271
+ if (thrInBlock == 0)
272
+ s_overTotal = 0; // overflow counter
273
+
274
+ // ensure that out masks are done
275
+ __syncthreads();
276
+
277
+ int overIndex = -1;
278
+ bool act = (thrInBlock < p.numBins);
279
+ U32 actMask = __ballot_sync(~0u, act);
280
+ if (act)
281
+ {
282
+ U8* srcPtr = (U8*)&s_outMask[0][thrInBlock];
283
+ U8* dstPtr = (U8*)&s_outCount[0][thrInBlock];
284
+ int total = 0;
285
+ for (int i = 0; i < CR_BIN_WARPS; i++)
286
+ {
287
+ total += __popc(*(U32*)srcPtr);
288
+ *(U32*)dstPtr = total;
289
+ srcPtr += (CR_MAXBINS_SQR + 1) * 4;
290
+ dstPtr += (CR_MAXBINS_SQR + 1) * 4;
291
+ }
292
+
293
+ // overflow => request a new segment
294
+ int ofs = s_outOfs[thrInBlock];
295
+ bool ovr = (((ofs - 1) >> CR_BIN_SEG_LOG2) != (((ofs - 1) + total) >> CR_BIN_SEG_LOG2));
296
+ U32 ovrMask = __ballot_sync(actMask, ovr);
297
+ if (ovr)
298
+ {
299
+ overIndex = __popc(ovrMask & getLaneMaskLt());
300
+ if (overIndex == 0)
301
+ s_broadcast[threadIdx.y + 16] = atomicAdd((U32*)&s_overTotal, __popc(ovrMask));
302
+ __syncwarp(ovrMask);
303
+ overIndex += s_broadcast[threadIdx.y + 16];
304
+ s_overIndex[thrInBlock] = overIndex;
305
+ }
306
+ }
307
+
308
+ // sync after overTotal is ready
309
+ __syncthreads();
310
+
311
+ // at least one segment overflowed => allocate segments
312
+ U32 overTotal = s_overTotal;
313
+ U32 allocBase = 0;
314
+ if (overTotal > 0)
315
+ {
316
+ // allocate memory
317
+ if (thrInBlock == 0)
318
+ {
319
+ U32 allocBase = atomicAdd(&atomics.numBinSegs, overTotal);
320
+ s_allocBase = (allocBase + overTotal <= p.maxBinSegs) ? allocBase : 0;
321
+ }
322
+ __syncthreads();
323
+ allocBase = s_allocBase;
324
+
325
+ // did my bin overflow?
326
+ if (overIndex != -1)
327
+ {
328
+ // calculate new segment index
329
+ int segIdx = allocBase + overIndex;
330
+
331
+ // add to linked list
332
+ if (s_outOfs[thrInBlock] < 0)
333
+ binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = segIdx;
334
+ else
335
+ binSegNext[(s_outOfs[thrInBlock] - 1) >> CR_BIN_SEG_LOG2] = segIdx;
336
+
337
+ // defaults
338
+ binSegNext [segIdx] = -1;
339
+ binSegCount[segIdx] = CR_BIN_SEG_SIZE;
340
+ }
341
+ }
342
+
343
+ // concurrent emission -- each warp handles its own triangle
344
+ if (thrInBlock < bufCount)
345
+ {
346
+ int triPos = (bufIndex + thrInBlock) & (CR_ARRAY_SIZE(s_triBuf) - 1);
347
+ int currBin = lox + loy * p.widthBins;
348
+ int skipBin = (hix + 1) + loy * p.widthBins;
349
+ int endBin = lox + (hiy + 1) * p.widthBins;
350
+ int binYInc = p.widthBins - (hix - lox + 1);
351
+
352
+ // loop over triangle's bins
353
+ do
354
+ {
355
+ U32 outMask = s_outMask[threadIdx.y][currBin];
356
+ if (outMask & (1<<threadIdx.x))
357
+ {
358
+ int idx = __popc(outMask & getLaneMaskLt());
359
+ if (threadIdx.y > 0)
360
+ idx += s_outCount[threadIdx.y-1][currBin];
361
+
362
+ int base = s_outOfs[currBin];
363
+ int free = (-base) & (CR_BIN_SEG_SIZE - 1);
364
+ if (idx >= free)
365
+ idx += ((allocBase + s_overIndex[currBin]) << CR_BIN_SEG_LOG2) - free;
366
+ else
367
+ idx += base;
368
+
369
+ binSegData[idx] = s_triBuf[triPos];
370
+ }
371
+
372
+ currBin++;
373
+ if (currBin == skipBin)
374
+ currBin += binYInc, skipBin += p.widthBins;
375
+ }
376
+ while (currBin != endBin);
377
+ }
378
+
379
+ // wait all triangles to finish, then replace overflown segment offsets
380
+ __syncthreads();
381
+ if (thrInBlock < p.numBins)
382
+ {
383
+ U32 total = s_outCount[CR_BIN_WARPS - 1][thrInBlock];
384
+ U32 oldOfs = s_outOfs[thrInBlock];
385
+ if (overIndex == -1)
386
+ s_outOfs[thrInBlock] = oldOfs + total;
387
+ else
388
+ {
389
+ int addr = oldOfs + total;
390
+ addr = ((addr - 1) & (CR_BIN_SEG_SIZE - 1)) + 1;
391
+ addr += (allocBase + overIndex) << CR_BIN_SEG_LOG2;
392
+ s_outOfs[thrInBlock] = addr;
393
+ }
394
+ s_outTotal[thrInBlock] += total;
395
+ }
396
+
397
+ // these triangles are now done
398
+ int count = ::min(bufCount, CR_BIN_WARPS * 32);
399
+ bufCount -= count;
400
+ bufIndex += count;
401
+ bufIndex &= CR_ARRAY_SIZE(s_triBuf)-1;
402
+ }
403
+ while (bufCount > 0 || batchPos < batchEnd);
404
+
405
+ // flush all bins
406
+ if (thrInBlock < p.numBins)
407
+ {
408
+ int ofs = s_outOfs[thrInBlock];
409
+ if (ofs & (CR_BIN_SEG_SIZE-1))
410
+ {
411
+ int seg = ofs >> CR_BIN_SEG_LOG2;
412
+ binSegCount[seg] = ofs & (CR_BIN_SEG_SIZE-1);
413
+ s_outOfs[thrInBlock] = (ofs + CR_BIN_SEG_SIZE - 1) & -CR_BIN_SEG_SIZE;
414
+ }
415
+ }
416
+ }
417
+
418
+ // output totals
419
+ if (thrInBlock < p.numBins)
420
+ binTotal[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = s_outTotal[thrInBlock];
421
+ }
422
+
423
+ //------------------------------------------------------------------------
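For readers skimming the kernel above: each triangle's fixed-point vertex positions are first reduced to a conservative range of bins (`lox`/`loy`/`hix`/`hiy`, clamped to the bin grid), small footprints are recorded directly with atomics, and only larger footprints fall through to the edge-function walk before emission. The following is a simplified CPU-side sketch of just that bounding-box step, with illustrative constants standing in for the values defined in `Constants.hpp`; it is not code from this commit.

```cpp
// Simplified CPU-side sketch (assumption) of the bin bounding-box step in
// binRasterImpl(): fixed-point subpixel coordinates are shifted down to bin
// coordinates and clamped to the bin grid.
#include <algorithm>
#include <cstdio>

// Illustrative constants; the real values come from Constants.hpp.
constexpr int SUBPIXEL_LOG2 = 4;   // subpixels per pixel = 16
constexpr int TILE_LOG2     = 3;   // 8x8 pixel tiles
constexpr int BIN_LOG2      = 4;   // 16x16 tiles per bin

struct BinRect { int lox, loy, hix, hiy; };

// v0/v1/v2 are 2D vertex positions in subpixel units (origin at the viewport corner,
// a simplification of the kernel's centered coordinate setup).
BinRect triangleBins(int v0x, int v0y, int v1x, int v1y, int v2x, int v2y,
                     int widthBins, int heightBins)
{
    const int binLog = BIN_LOG2 + TILE_LOG2 + SUBPIXEL_LOG2;
    auto clampBin = [](int v, int hi) { return std::min(std::max(v, 0), hi); };

    BinRect r;
    r.lox = clampBin(std::min({v0x, v1x, v2x}) >> binLog, widthBins  - 1);
    r.loy = clampBin(std::min({v0y, v1y, v2y}) >> binLog, heightBins - 1);
    r.hix = clampBin(std::max({v0x, v1x, v2x}) >> binLog, widthBins  - 1);
    r.hiy = clampBin(std::max({v0y, v1y, v2y}) >> binLog, heightBins - 1);
    return r;   // the kernel then marks bins (lox..hix) x (loy..hiy), refining
                // larger footprints with edge functions before emitting.
}

int main()
{
    BinRect r = triangleBins(100, 200, 5000, 300, 2500, 6000, 8, 8);
    std::printf("bins x:[%d,%d] y:[%d,%d]\n", r.lox, r.hix, r.loy, r.hiy);
}
```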
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.cpp ADDED
@@ -0,0 +1,94 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #include "../../framework.h"
10
+ #include "Buffer.hpp"
11
+
12
+ using namespace CR;
13
+
14
+ //------------------------------------------------------------------------
15
+ // GPU buffer.
16
+ //------------------------------------------------------------------------
17
+
18
+ Buffer::Buffer(void)
19
+ : m_gpuPtr(NULL),
20
+ m_bytes (0)
21
+ {
22
+ // empty
23
+ }
24
+
25
+ Buffer::~Buffer(void)
26
+ {
27
+ if (m_gpuPtr)
28
+ cudaFree(m_gpuPtr); // Don't throw an exception.
29
+ }
30
+
31
+ void Buffer::reset(size_t bytes)
32
+ {
33
+ if (bytes == m_bytes)
34
+ return;
35
+
36
+ if (m_gpuPtr)
37
+ {
38
+ NVDR_CHECK_CUDA_ERROR(cudaFree(m_gpuPtr));
39
+ m_gpuPtr = NULL;
40
+ }
41
+
42
+ if (bytes > 0)
43
+ NVDR_CHECK_CUDA_ERROR(cudaMalloc(&m_gpuPtr, bytes));
44
+
45
+ m_bytes = bytes;
46
+ }
47
+
48
+ void Buffer::grow(size_t bytes)
49
+ {
50
+ if (bytes > m_bytes)
51
+ reset(bytes);
52
+ }
53
+
54
+ //------------------------------------------------------------------------
55
+ // Host buffer with page-locked memory.
56
+ //------------------------------------------------------------------------
57
+
58
+ HostBuffer::HostBuffer(void)
59
+ : m_hostPtr(NULL),
60
+ m_bytes (0)
61
+ {
62
+ // empty
63
+ }
64
+
65
+ HostBuffer::~HostBuffer(void)
66
+ {
67
+ if (m_hostPtr)
68
+ cudaFreeHost(m_hostPtr); // Don't throw an exception.
69
+ }
70
+
71
+ void HostBuffer::reset(size_t bytes)
72
+ {
73
+ if (bytes == m_bytes)
74
+ return;
75
+
76
+ if (m_hostPtr)
77
+ {
78
+ NVDR_CHECK_CUDA_ERROR(cudaFreeHost(m_hostPtr));
79
+ m_hostPtr = NULL;
80
+ }
81
+
82
+ if (bytes > 0)
83
+ NVDR_CHECK_CUDA_ERROR(cudaMallocHost(&m_hostPtr, bytes));
84
+
85
+ m_bytes = bytes;
86
+ }
87
+
88
+ void HostBuffer::grow(size_t bytes)
89
+ {
90
+ if (bytes > m_bytes)
91
+ reset(bytes);
92
+ }
93
+
94
+ //------------------------------------------------------------------------
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.hpp ADDED
@@ -0,0 +1,55 @@
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+ //
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
+ // and proprietary rights in and to this software, related documentation
+ // and any modifications thereto. Any use, reproduction, disclosure or
+ // distribution of this software and related documentation without an express
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ #pragma once
+ #include "Defs.hpp"
+
+ namespace CR
+ {
+ //------------------------------------------------------------------------
+
+ class Buffer
+ {
+ public:
+     Buffer          (void);
+     ~Buffer         (void);
+
+     void    reset   (size_t bytes);
+     void    grow    (size_t bytes);
+     void*   getPtr  (size_t offset = 0) { return (void*)(((uintptr_t)m_gpuPtr) + offset); }
+     size_t  getSize (void) const        { return m_bytes; }
+
+     void    setPtr  (void* ptr)         { m_gpuPtr = ptr; }
+
+ private:
+     void*   m_gpuPtr;
+     size_t  m_bytes;
+ };
+
+ //------------------------------------------------------------------------
+
+ class HostBuffer
+ {
+ public:
+     HostBuffer      (void);
+     ~HostBuffer     (void);
+
+     void    reset   (size_t bytes);
+     void    grow    (size_t bytes);
+     void*   getPtr  (void)              { return m_hostPtr; }
+     size_t  getSize (void) const        { return m_bytes; }
+
+     void    setPtr  (void* ptr)         { m_hostPtr = ptr; }
+
+ private:
+     void*   m_hostPtr;
+     size_t  m_bytes;
+ };
+
+ //------------------------------------------------------------------------
+ }
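`Buffer` and `HostBuffer` above wrap `cudaMalloc`/`cudaMallocHost` with a grow-only policy: `reset()` frees and reallocates, while `grow()` touches the allocation only when the requested size exceeds the current one (see `Buffer.cpp` earlier in this diff), so per-frame reuse avoids repeated allocation. Below is a standalone sketch of the same pattern against the plain CUDA runtime; the class is illustrative, error checking is elided, and nothing here is code from the commit.

```cpp
// Standalone sketch (assumption) of the grow-only allocation pattern used by
// CR::Buffer: reset() frees and reallocates, grow() reallocates only when the
// request exceeds the current capacity.
#include <cuda_runtime.h>
#include <cstdio>

class GpuScratch
{
public:
    ~GpuScratch() { if (m_ptr) cudaFree(m_ptr); }

    void reset(size_t bytes)
    {
        if (bytes == m_bytes) return;
        if (m_ptr) { cudaFree(m_ptr); m_ptr = nullptr; }
        if (bytes > 0) cudaMalloc(&m_ptr, bytes);   // real code checks the CUDA error
        m_bytes = bytes;
    }

    void   grow(size_t bytes) { if (bytes > m_bytes) reset(bytes); }
    void*  ptr() const        { return m_ptr; }
    size_t size() const       { return m_bytes; }

private:
    void*  m_ptr   = nullptr;
    size_t m_bytes = 0;
};

int main()
{
    GpuScratch buf;
    buf.grow(1 << 20);   // allocates 1 MiB
    buf.grow(1 << 16);   // no-op: capacity already sufficient
    std::printf("capacity: %zu bytes\n", buf.size());
}
```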
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CoarseRaster.inl ADDED
@@ -0,0 +1,730 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ //------------------------------------------------------------------------
10
+
11
+ __device__ __inline__ int globalTileIdx(int tileInBin, int widthTiles)
12
+ {
13
+ int tileX = tileInBin & (CR_BIN_SIZE - 1);
14
+ int tileY = tileInBin >> CR_BIN_LOG2;
15
+ return tileX + tileY * widthTiles;
16
+ }
17
+
18
+ //------------------------------------------------------------------------
19
+
20
+ __device__ __inline__ void coarseRasterImpl(const CRParams p)
21
+ {
22
+ // Common.
23
+
24
+ __shared__ volatile U32 s_workCounter;
25
+ __shared__ volatile U32 s_scanTemp [CR_COARSE_WARPS][48]; // 3KB
26
+
27
+ // Input.
28
+
29
+ __shared__ volatile U32 s_binOrder [CR_MAXBINS_SQR]; // 1KB
30
+ __shared__ volatile S32 s_binStreamCurrSeg [CR_BIN_STREAMS_SIZE]; // 0KB
31
+ __shared__ volatile S32 s_binStreamFirstTri [CR_BIN_STREAMS_SIZE]; // 0KB
32
+ __shared__ volatile S32 s_triQueue [CR_COARSE_QUEUE_SIZE]; // 4KB
33
+ __shared__ volatile S32 s_triQueueWritePos;
34
+ __shared__ volatile U32 s_binStreamSelectedOfs;
35
+ __shared__ volatile U32 s_binStreamSelectedSize;
36
+
37
+ // Output.
38
+
39
+ __shared__ volatile U32 s_warpEmitMask [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions
40
+ __shared__ volatile U32 s_warpEmitPrefixSum [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions
41
+ __shared__ volatile U32 s_tileEmitPrefixSum [CR_BIN_SQR + 1]; // 1KB, zero at the beginning
42
+ __shared__ volatile U32 s_tileAllocPrefixSum[CR_BIN_SQR + 1]; // 1KB, zero at the beginning
43
+ __shared__ volatile S32 s_tileStreamCurrOfs [CR_BIN_SQR]; // 1KB
44
+ __shared__ volatile U32 s_firstAllocSeg;
45
+ __shared__ volatile U32 s_firstActiveIdx;
46
+
47
+ // Pointers and constants.
48
+
49
+ CRAtomics& atomics = p.atomics[blockIdx.z];
50
+ const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
51
+ const S32* binFirstSeg = (const S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
52
+ const S32* binTotal = (const S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
53
+ const S32* binSegData = (const S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
54
+ const S32* binSegNext = (const S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
55
+ const S32* binSegCount = (const S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
56
+ S32* activeTiles = (S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z;
57
+ S32* tileFirstSeg = (S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
58
+ S32* tileSegData = (S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
59
+ S32* tileSegNext = (S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z;
60
+ S32* tileSegCount = (S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
61
+
62
+ int tileLog = CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
63
+ int thrInBlock = threadIdx.x + threadIdx.y * 32;
64
+ int emitShift = CR_BIN_LOG2 * 2 + 5; // We scan ((numEmits << emitShift) | numAllocs) over tiles.
65
+
66
+ if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs)
67
+ return;
68
+
69
+ // Initialize sharedmem arrays.
70
+
71
+ if (thrInBlock == 0)
72
+ {
73
+ s_tileEmitPrefixSum[0] = 0;
74
+ s_tileAllocPrefixSum[0] = 0;
75
+ }
76
+ s_scanTemp[threadIdx.y][threadIdx.x] = 0;
77
+
78
+ // Sort bins in descending order of triangle count.
79
+
80
+ for (int binIdx = thrInBlock; binIdx < p.numBins; binIdx += CR_COARSE_WARPS * 32)
81
+ {
82
+ int count = 0;
83
+ for (int i = 0; i < CR_BIN_STREAMS_SIZE; i++)
84
+ count += binTotal[(binIdx << CR_BIN_STREAMS_LOG2) + i];
85
+ s_binOrder[binIdx] = (~count << (CR_MAXBINS_LOG2 * 2)) | binIdx;
86
+ }
87
+
88
+ __syncthreads();
89
+ sortShared(s_binOrder, p.numBins);
90
+
91
+ // Process each bin by one block.
92
+
93
+ for (;;)
94
+ {
95
+ // Pick a bin for the block.
96
+
97
+ if (thrInBlock == 0)
98
+ s_workCounter = atomicAdd(&atomics.coarseCounter, 1);
99
+ __syncthreads();
100
+
101
+ int workCounter = s_workCounter;
102
+ if (workCounter >= p.numBins)
103
+ break;
104
+
105
+ U32 binOrder = s_binOrder[workCounter];
106
+ bool binEmpty = ((~binOrder >> (CR_MAXBINS_LOG2 * 2)) == 0);
107
+ if (binEmpty && !p.deferredClear)
108
+ break;
109
+
110
+ int binIdx = binOrder & (CR_MAXBINS_SQR - 1);
111
+
112
+ // Initialize input/output streams.
113
+
114
+ int triQueueWritePos = 0;
115
+ int triQueueReadPos = 0;
116
+
117
+ if (thrInBlock < CR_BIN_STREAMS_SIZE)
118
+ {
119
+ int segIdx = binFirstSeg[(binIdx << CR_BIN_STREAMS_LOG2) + thrInBlock];
120
+ s_binStreamCurrSeg[thrInBlock] = segIdx;
121
+ s_binStreamFirstTri[thrInBlock] = (segIdx == -1) ? ~0u : binSegData[segIdx << CR_BIN_SEG_LOG2];
122
+ }
123
+
124
+ for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
125
+ s_tileStreamCurrOfs[tileInBin] = -CR_TILE_SEG_SIZE;
126
+
127
+ // Initialize per-bin state.
128
+
129
+ int binY = idiv_fast(binIdx, p.widthBins);
130
+ int binX = binIdx - binY * p.widthBins;
131
+ int originX = (binX << (CR_BIN_LOG2 + tileLog)) - (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
132
+ int originY = (binY << (CR_BIN_LOG2 + tileLog)) - (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
133
+ int maxTileXInBin = ::min(p.widthTiles - (binX << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
134
+ int maxTileYInBin = ::min(p.heightTiles - (binY << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
135
+ int binTileIdx = (binX + binY * p.widthTiles) << CR_BIN_LOG2;
136
+
137
+ // Entire block: Merge input streams and process triangles.
138
+
139
+ if (!binEmpty)
140
+ do
141
+ {
142
+ //------------------------------------------------------------------------
143
+ // Merge.
144
+ //------------------------------------------------------------------------
145
+
146
+ // Entire block: Not enough triangles => merge and queue segments.
147
+ // NOTE: The bin exit criterion assumes that we queue more triangles than we actually need.
148
+
149
+ while (triQueueWritePos - triQueueReadPos <= CR_COARSE_WARPS * 32)
150
+ {
151
+ // First warp: Choose the segment with the lowest initial triangle index.
152
+
153
+ bool hasStream = (thrInBlock < CR_BIN_STREAMS_SIZE);
154
+ U32 hasStreamMask = __ballot_sync(~0u, hasStream);
155
+ if (hasStream)
156
+ {
157
+ // Find the stream with the lowest triangle index.
158
+
159
+ U32 firstTri = s_binStreamFirstTri[thrInBlock];
160
+ U32 t = firstTri;
161
+ volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
162
+
163
+ #if (CR_BIN_STREAMS_SIZE > 1)
164
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-1]); __syncwarp(hasStreamMask);
165
+ #endif
166
+ #if (CR_BIN_STREAMS_SIZE > 2)
167
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-2]); __syncwarp(hasStreamMask);
168
+ #endif
169
+ #if (CR_BIN_STREAMS_SIZE > 4)
170
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-4]); __syncwarp(hasStreamMask);
171
+ #endif
172
+ #if (CR_BIN_STREAMS_SIZE > 8)
173
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-8]); __syncwarp(hasStreamMask);
174
+ #endif
175
+ #if (CR_BIN_STREAMS_SIZE > 16)
176
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-16]); __syncwarp(hasStreamMask);
177
+ #endif
178
+ v[0] = t; __syncwarp(hasStreamMask);
179
+
180
+ // Consume and broadcast.
181
+
182
+ bool first = (s_scanTemp[0][CR_BIN_STREAMS_SIZE - 1 + 16] == firstTri);
183
+ U32 firstMask = __ballot_sync(hasStreamMask, first);
184
+ if (first && (firstMask >> threadIdx.x) == 1u)
185
+ {
186
+ int segIdx = s_binStreamCurrSeg[thrInBlock];
187
+ s_binStreamSelectedOfs = segIdx << CR_BIN_SEG_LOG2;
188
+ if (segIdx != -1)
189
+ {
190
+ int segSize = binSegCount[segIdx];
191
+ int segNext = binSegNext[segIdx];
192
+ s_binStreamSelectedSize = segSize;
193
+ s_triQueueWritePos = triQueueWritePos + segSize;
194
+ s_binStreamCurrSeg[thrInBlock] = segNext;
195
+ s_binStreamFirstTri[thrInBlock] = (segNext == -1) ? ~0u : binSegData[segNext << CR_BIN_SEG_LOG2];
196
+ }
197
+ }
198
+ }
199
+
200
+ // No more segments => break.
201
+
202
+ __syncthreads();
203
+ triQueueWritePos = s_triQueueWritePos;
204
+ int segOfs = s_binStreamSelectedOfs;
205
+ if (segOfs < 0)
206
+ break;
207
+
208
+ int segSize = s_binStreamSelectedSize;
209
+ __syncthreads();
210
+
211
+ // Fetch triangles into the queue.
212
+
213
+ for (int idxInSeg = CR_COARSE_WARPS * 32 - 1 - thrInBlock; idxInSeg < segSize; idxInSeg += CR_COARSE_WARPS * 32)
214
+ {
215
+ S32 triIdx = binSegData[segOfs + idxInSeg];
216
+ s_triQueue[(triQueueWritePos - segSize + idxInSeg) & (CR_COARSE_QUEUE_SIZE - 1)] = triIdx;
217
+ }
218
+ }
219
+
220
+ // All threads: Clear emit masks.
221
+
222
+ for (int maskIdx = thrInBlock; maskIdx < CR_COARSE_WARPS * CR_BIN_SQR; maskIdx += CR_COARSE_WARPS * 32)
223
+ s_warpEmitMask[maskIdx >> (CR_BIN_LOG2 * 2)][maskIdx & (CR_BIN_SQR - 1)] = 0;
224
+
225
+ __syncthreads();
226
+
227
+ //------------------------------------------------------------------------
228
+ // Raster.
229
+ //------------------------------------------------------------------------
230
+
231
+ // Triangle per thread: Read from the queue.
232
+
233
+ int triIdx = -1;
234
+ if (triQueueReadPos + thrInBlock < triQueueWritePos)
235
+ triIdx = s_triQueue[(triQueueReadPos + thrInBlock) & (CR_COARSE_QUEUE_SIZE - 1)];
236
+
237
+ uint4 triData = make_uint4(0, 0, 0, 0);
238
+ if (triIdx != -1)
239
+ {
240
+ int dataIdx = triIdx >> 3;
241
+ int subtriIdx = triIdx & 7;
242
+ if (subtriIdx != 7)
243
+ dataIdx = triHeader[dataIdx].misc + subtriIdx;
244
+ triData = *((uint4*)triHeader + dataIdx);
245
+ }
246
+
247
+ // 32 triangles per warp: Record emits (= tile intersections).
248
+
249
+ if (__any_sync(~0u, triIdx != -1))
250
+ {
251
+ S32 v0x = sub_s16lo_s16lo(triData.x, originX);
252
+ S32 v0y = sub_s16hi_s16lo(triData.x, originY);
253
+ S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
254
+ S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
255
+ S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
256
+ S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
257
+
258
+ // Compute tile-based AABB.
259
+
260
+ int lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
261
+ int loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
262
+ int hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
263
+ int hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
264
+ int sizex = add_sub(hix, 1, lox);
265
+ int sizey = add_sub(hiy, 1, loy);
266
+ int area = sizex * sizey;
267
+
268
+ // Miscellaneous init.
269
+
270
+ U8* currPtr = (U8*)&s_warpEmitMask[threadIdx.y][lox + (loy << CR_BIN_LOG2)];
271
+ int ptrYInc = CR_BIN_SIZE * 4 - (sizex << 2);
272
+ U32 maskBit = 1 << threadIdx.x;
273
+
274
+ // Case A: All AABBs are small => record the full AABB using atomics.
275
+
276
+ if (__all_sync(~0u, sizex <= 2 && sizey <= 2))
277
+ {
278
+ if (triIdx != -1)
279
+ {
280
+ atomicOr((U32*)currPtr, maskBit);
281
+ if (sizex == 2) atomicOr((U32*)(currPtr + 4), maskBit);
282
+ if (sizey == 2) atomicOr((U32*)(currPtr + CR_BIN_SIZE * 4), maskBit);
283
+ if (sizex == 2 && sizey == 2) atomicOr((U32*)(currPtr + 4 + CR_BIN_SIZE * 4), maskBit);
284
+ }
285
+ }
286
+ else
287
+ {
288
+ // Compute warp-AABB (scan-32).
289
+
290
+ U32 aabbMask = add_sub(2 << hix, 0x20000 << hiy, 1 << lox) - (0x10000 << loy);
291
+ if (triIdx == -1)
292
+ aabbMask = 0;
293
+
294
+ volatile U32* v = &s_scanTemp[threadIdx.y][threadIdx.x + 16];
295
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-1]; __syncwarp();
296
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-2]; __syncwarp();
297
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-4]; __syncwarp();
298
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-8]; __syncwarp();
299
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-16]; __syncwarp();
300
+ v[0] = aabbMask; __syncwarp(); aabbMask = s_scanTemp[threadIdx.y][47];
301
+
302
+ U32 maskX = aabbMask & 0xFFFF;
303
+ U32 maskY = aabbMask >> 16;
304
+ int wlox = findLeadingOne(maskX ^ (maskX - 1));
305
+ int wloy = findLeadingOne(maskY ^ (maskY - 1));
306
+ int whix = findLeadingOne(maskX);
307
+ int whiy = findLeadingOne(maskY);
308
+ int warea = (add_sub(whix, 1, wlox)) * (add_sub(whiy, 1, wloy));
309
+
310
+ // Initialize edge functions.
311
+
312
+ S32 d12x = d02x - d01x;
313
+ S32 d12y = d02y - d01y;
314
+ v0x -= lox << tileLog;
315
+ v0y -= loy << tileLog;
316
+
317
+ S32 t01 = v0x * d01y - v0y * d01x;
318
+ S32 t02 = v0y * d02x - v0x * d02y;
319
+ S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
320
+ S32 b01 = add_sub(t01 >> tileLog, ::max(d01x, 0), ::min(d01y, 0));
321
+ S32 b02 = add_sub(t02 >> tileLog, ::max(d02y, 0), ::min(d02x, 0));
322
+ S32 b12 = add_sub(t12 >> tileLog, ::max(d12x, 0), ::min(d12y, 0));
323
+
324
+ d01x += sizex * d01y;
325
+ d02x += sizex * d02y;
326
+ d12x += sizex * d12y;
327
+
328
+ // Case B: Warp-AABB is not much larger than largest AABB => Check tiles in warp-AABB, record using ballots.
329
+ if (__any_sync(~0u, warea * 4 <= area * 8))
330
+ {
331
+ // Not sure if this is any faster than Case C after all the post-Volta ballot mask tracking.
332
+ bool act = (triIdx != -1);
333
+ U32 actMask = __ballot_sync(~0u, act);
334
+ if (act)
335
+ {
336
+ for (int y = wloy; y <= whiy; y++)
337
+ {
338
+ bool yIn = (y >= loy && y <= hiy);
339
+ U32 yMask = __ballot_sync(actMask, yIn);
340
+ if (yIn)
341
+ {
342
+ for (int x = wlox; x <= whix; x++)
343
+ {
344
+ bool xyIn = (x >= lox && x <= hix);
345
+ U32 xyMask = __ballot_sync(yMask, xyIn);
346
+ if (xyIn)
347
+ {
348
+ U32 res = __ballot_sync(xyMask, b01 >= 0 && b02 >= 0 && b12 >= 0);
349
+ if (threadIdx.x == 31 - __clz(xyMask))
350
+ *(U32*)currPtr = res;
351
+ currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
352
+ }
353
+ }
354
+ currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x;
355
+ }
356
+ }
357
+ }
358
+ }
359
+
360
+ // Case C: General case => Check tiles in AABB, record using atomics.
361
+
362
+ else
363
+ {
364
+ if (triIdx != -1)
365
+ {
366
+ U8* skipPtr = currPtr + (sizex << 2);
367
+ U8* endPtr = currPtr + (sizey << (CR_BIN_LOG2 + 2));
368
+ do
369
+ {
370
+ if (b01 >= 0 && b02 >= 0 && b12 >= 0)
371
+ atomicOr((U32*)currPtr, maskBit);
372
+ currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
373
+ if (currPtr == skipPtr)
374
+ currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += CR_BIN_SIZE * 4;
375
+ }
376
+ while (currPtr != endPtr);
377
+ }
378
+ }
379
+ }
380
+ }
381
+
382
+ __syncthreads();
383
+
384
+ //------------------------------------------------------------------------
385
+ // Count.
386
+ //------------------------------------------------------------------------
387
+
388
+ // Tile per thread: Initialize prefix sums.
389
+
390
+ for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
391
+ {
392
+ int tileInBin = tileInBin_base + thrInBlock;
393
+ bool act = (tileInBin < CR_BIN_SQR);
394
+ U32 actMask = __ballot_sync(~0u, act);
395
+ if (act)
396
+ {
397
+ // Compute prefix sum of emits over warps.
398
+
399
+ U8* srcPtr = (U8*)&s_warpEmitMask[0][tileInBin];
400
+ U8* dstPtr = (U8*)&s_warpEmitPrefixSum[0][tileInBin];
401
+ int tileEmits = 0;
402
+ for (int i = 0; i < CR_COARSE_WARPS; i++)
403
+ {
404
+ tileEmits += __popc(*(U32*)srcPtr);
405
+ *(U32*)dstPtr = tileEmits;
406
+ srcPtr += (CR_BIN_SQR + 1) * 4;
407
+ dstPtr += (CR_BIN_SQR + 1) * 4;
408
+ }
409
+
410
+ // Determine the number of segments to allocate.
411
+
412
+ int spaceLeft = -s_tileStreamCurrOfs[tileInBin] & (CR_TILE_SEG_SIZE - 1);
413
+ int tileAllocs = (tileEmits - spaceLeft + CR_TILE_SEG_SIZE - 1) >> CR_TILE_SEG_LOG2;
414
+ volatile U32* v = &s_tileEmitPrefixSum[tileInBin + 1];
415
+
416
+ // All counters within the warp are small => compute prefix sum using ballot.
417
+
418
+ if (!__any_sync(actMask, tileEmits >= 2))
419
+ {
420
+ U32 m = getLaneMaskLe();
421
+ *v = (__popc(__ballot_sync(actMask, tileEmits & 1) & m) << emitShift) | __popc(__ballot_sync(actMask, tileAllocs & 1) & m);
422
+ }
423
+
424
+ // Otherwise => scan-32 within the warp.
425
+
426
+ else
427
+ {
428
+ U32 sum = (tileEmits << emitShift) | tileAllocs;
429
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 1) sum += v[-1]; __syncwarp(actMask);
430
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 2) sum += v[-2]; __syncwarp(actMask);
431
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 4) sum += v[-4]; __syncwarp(actMask);
432
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 8) sum += v[-8]; __syncwarp(actMask);
433
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 16) sum += v[-16]; __syncwarp(actMask);
434
+ *v = sum; __syncwarp(actMask);
435
+ }
436
+ }
437
+ }
438
+
439
+ // First warp: Scan-8.
440
+
441
+ __syncthreads();
442
+
443
+ bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
444
+ U32 scan8Mask = __ballot_sync(~0u, scan8);
445
+ if (scan8)
446
+ {
447
+ int sum = s_tileEmitPrefixSum[(thrInBlock << 5) + 32];
448
+ volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
449
+ v[0] = sum; __syncwarp(scan8Mask);
450
+ #if (CR_BIN_SQR > 1 * 32)
451
+ sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
452
+ #endif
453
+ #if (CR_BIN_SQR > 2 * 32)
454
+ sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
455
+ #endif
456
+ #if (CR_BIN_SQR > 4 * 32)
457
+ sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
458
+ #endif
459
+ }
460
+
461
+ __syncthreads();
462
+
463
+ // Tile per thread: Finalize prefix sums.
464
+ // Single thread: Allocate segments.
465
+
466
+ for (int tileInBin = thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
467
+ {
468
+ int sum = s_tileEmitPrefixSum[tileInBin + 1] + s_scanTemp[0][(tileInBin >> 5) + 15];
469
+ int numEmits = sum >> emitShift;
470
+ int numAllocs = sum & ((1 << emitShift) - 1);
471
+ s_tileEmitPrefixSum[tileInBin + 1] = numEmits;
472
+ s_tileAllocPrefixSum[tileInBin + 1] = numAllocs;
473
+
474
+ if (tileInBin == CR_BIN_SQR - 1 && numAllocs != 0)
475
+ {
476
+ int t = atomicAdd(&atomics.numTileSegs, numAllocs);
477
+ s_firstAllocSeg = (t + numAllocs <= p.maxTileSegs) ? t : 0;
478
+ }
479
+ }
480
+
481
+ __syncthreads();
482
+ int firstAllocSeg = s_firstAllocSeg;
483
+ int totalEmits = s_tileEmitPrefixSum[CR_BIN_SQR];
484
+ int totalAllocs = s_tileAllocPrefixSum[CR_BIN_SQR];
485
+
486
+ //------------------------------------------------------------------------
487
+ // Emit.
488
+ //------------------------------------------------------------------------
489
+
490
+ // Emit per thread: Write triangle index to globalmem.
491
+
492
+ for (int emitInBin = thrInBlock; emitInBin < totalEmits; emitInBin += CR_COARSE_WARPS * 32)
493
+ {
494
+ // Find tile in bin.
495
+
496
+ U8* tileBase = (U8*)&s_tileEmitPrefixSum[0];
497
+ U8* tilePtr = tileBase;
498
+ U8* ptr;
499
+
500
+ #if (CR_BIN_SQR > 128)
501
+ ptr = tilePtr + 0x80 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
502
+ #endif
503
+ #if (CR_BIN_SQR > 64)
504
+ ptr = tilePtr + 0x40 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
505
+ #endif
506
+ #if (CR_BIN_SQR > 32)
507
+ ptr = tilePtr + 0x20 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
508
+ #endif
509
+ #if (CR_BIN_SQR > 16)
510
+ ptr = tilePtr + 0x10 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
511
+ #endif
512
+ #if (CR_BIN_SQR > 8)
513
+ ptr = tilePtr + 0x08 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
514
+ #endif
515
+ #if (CR_BIN_SQR > 4)
516
+ ptr = tilePtr + 0x04 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
517
+ #endif
518
+ #if (CR_BIN_SQR > 2)
519
+ ptr = tilePtr + 0x02 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
520
+ #endif
521
+ #if (CR_BIN_SQR > 1)
522
+ ptr = tilePtr + 0x01 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
523
+ #endif
524
+
525
+ int tileInBin = (tilePtr - tileBase) >> 2;
526
+ int emitInTile = emitInBin - *(U32*)tilePtr;
527
+
528
+ // Find warp in tile.
529
+
530
+ int warpStep = (CR_BIN_SQR + 1) * 4;
531
+ U8* warpBase = (U8*)&s_warpEmitPrefixSum[0][tileInBin] - warpStep;
532
+ U8* warpPtr = warpBase;
533
+
534
+ #if (CR_COARSE_WARPS > 8)
535
+ ptr = warpPtr + 0x08 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
536
+ #endif
537
+ #if (CR_COARSE_WARPS > 4)
538
+ ptr = warpPtr + 0x04 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
539
+ #endif
540
+ #if (CR_COARSE_WARPS > 2)
541
+ ptr = warpPtr + 0x02 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
542
+ #endif
543
+ #if (CR_COARSE_WARPS > 1)
544
+ ptr = warpPtr + 0x01 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
545
+ #endif
546
+
547
+ int warpInTile = (warpPtr - warpBase) >> (CR_BIN_LOG2 * 2 + 2);
548
+ U32 emitMask = *(U32*)(warpPtr + warpStep + ((U8*)s_warpEmitMask - (U8*)s_warpEmitPrefixSum));
549
+ int emitInWarp = emitInTile - *(U32*)(warpPtr + warpStep) + __popc(emitMask);
550
+
551
+ // Find thread in warp.
552
+
553
+ int threadInWarp = 0;
554
+ int pop = __popc(emitMask & 0xFFFF);
555
+ bool pred = (emitInWarp >= pop);
556
+ if (pred) emitInWarp -= pop;
557
+ if (pred) emitMask >>= 0x10;
558
+ if (pred) threadInWarp += 0x10;
559
+
560
+ pop = __popc(emitMask & 0xFF);
561
+ pred = (emitInWarp >= pop);
562
+ if (pred) emitInWarp -= pop;
563
+ if (pred) emitMask >>= 0x08;
564
+ if (pred) threadInWarp += 0x08;
565
+
566
+ pop = __popc(emitMask & 0xF);
567
+ pred = (emitInWarp >= pop);
568
+ if (pred) emitInWarp -= pop;
569
+ if (pred) emitMask >>= 0x04;
570
+ if (pred) threadInWarp += 0x04;
571
+
572
+ pop = __popc(emitMask & 0x3);
573
+ pred = (emitInWarp >= pop);
574
+ if (pred) emitInWarp -= pop;
575
+ if (pred) emitMask >>= 0x02;
576
+ if (pred) threadInWarp += 0x02;
577
+
578
+ if (emitInWarp >= (emitMask & 1))
579
+ threadInWarp++;
580
+
581
+ // Figure out where to write.
582
+
583
+ int currOfs = s_tileStreamCurrOfs[tileInBin];
584
+ int spaceLeft = -currOfs & (CR_TILE_SEG_SIZE - 1);
585
+ int outOfs = emitInTile;
586
+
587
+ if (outOfs < spaceLeft)
588
+ outOfs += currOfs;
589
+ else
590
+ {
591
+ int allocLo = firstAllocSeg + s_tileAllocPrefixSum[tileInBin];
592
+ outOfs += (allocLo << CR_TILE_SEG_LOG2) - spaceLeft;
593
+ }
594
+
595
+ // Write.
596
+
597
+ int queueIdx = warpInTile * 32 + threadInWarp;
598
+ int triIdx = s_triQueue[(triQueueReadPos + queueIdx) & (CR_COARSE_QUEUE_SIZE - 1)];
599
+
600
+ tileSegData[outOfs] = triIdx;
601
+ }
602
+
603
+ //------------------------------------------------------------------------
604
+ // Patch.
605
+ //------------------------------------------------------------------------
606
+
607
+ // Allocated segment per thread: Initialize next-pointer and count.
608
+
609
+ for (int i = CR_COARSE_WARPS * 32 - 1 - thrInBlock; i < totalAllocs; i += CR_COARSE_WARPS * 32)
610
+ {
611
+ int segIdx = firstAllocSeg + i;
612
+ tileSegNext[segIdx] = segIdx + 1;
613
+ tileSegCount[segIdx] = CR_TILE_SEG_SIZE;
614
+ }
615
+
616
+ // Tile per thread: Fix previous segment's next-pointer and update s_tileStreamCurrOfs.
617
+
618
+ __syncthreads();
619
+ for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
620
+ {
621
+ int oldOfs = s_tileStreamCurrOfs[tileInBin];
622
+ int newOfs = oldOfs + s_warpEmitPrefixSum[CR_COARSE_WARPS - 1][tileInBin];
623
+ int allocLo = s_tileAllocPrefixSum[tileInBin];
624
+ int allocHi = s_tileAllocPrefixSum[tileInBin + 1];
625
+
626
+ if (allocLo != allocHi)
627
+ {
628
+ S32* nextPtr = &tileSegNext[(oldOfs - 1) >> CR_TILE_SEG_LOG2];
629
+ if (oldOfs < 0)
630
+ nextPtr = &tileFirstSeg[binTileIdx + globalTileIdx(tileInBin, p.widthTiles)];
631
+ *nextPtr = firstAllocSeg + allocLo;
632
+
633
+ newOfs--;
634
+ newOfs &= CR_TILE_SEG_SIZE - 1;
635
+ newOfs |= (firstAllocSeg + allocHi - 1) << CR_TILE_SEG_LOG2;
636
+ newOfs++;
637
+ }
638
+ s_tileStreamCurrOfs[tileInBin] = newOfs;
639
+ }
640
+
641
+ // Advance queue read pointer.
642
+ // Queue became empty => bin done.
643
+
644
+ triQueueReadPos += CR_COARSE_WARPS * 32;
645
+ }
646
+ while (triQueueReadPos < triQueueWritePos);
647
+
648
+ // Tile per thread: Fix next-pointer and count of the last segment.
649
+ // 32 tiles per warp: Count active tiles.
650
+
651
+ __syncthreads();
652
+
653
+ for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
654
+ {
655
+ int tileInBin = tileInBin_base + thrInBlock;
656
+ bool act = (tileInBin < CR_BIN_SQR);
657
+ U32 actMask = __ballot_sync(~0u, act);
658
+ if (act)
659
+ {
660
+ int tileX = tileInBin & (CR_BIN_SIZE - 1);
661
+ int tileY = tileInBin >> CR_BIN_LOG2;
662
+ bool force = (p.deferredClear & tileX <= maxTileXInBin & tileY <= maxTileYInBin);
663
+
664
+ int ofs = s_tileStreamCurrOfs[tileInBin];
665
+ int segIdx = (ofs - 1) >> CR_TILE_SEG_LOG2;
666
+ int segCount = ofs & (CR_TILE_SEG_SIZE - 1);
667
+
668
+ if (ofs >= 0)
669
+ tileSegNext[segIdx] = -1;
670
+ else if (force)
671
+ {
672
+ s_tileStreamCurrOfs[tileInBin] = 0;
673
+ tileFirstSeg[binTileIdx + tileX + tileY * p.widthTiles] = -1;
674
+ }
675
+
676
+ if (segCount != 0)
677
+ tileSegCount[segIdx] = segCount;
678
+
679
+ U32 res = __ballot_sync(actMask, ofs >= 0 | force);
680
+ if (threadIdx.x == 0)
681
+ s_scanTemp[0][(tileInBin >> 5) + 16] = __popc(res);
682
+ }
683
+ }
684
+
685
+ // First warp: Scan-8.
686
+ // One thread: Allocate space for active tiles.
687
+
688
+ __syncthreads();
689
+
690
+ bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
691
+ U32 scan8Mask = __ballot_sync(~0u, scan8);
692
+ if (scan8)
693
+ {
694
+ volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
695
+ U32 sum = v[0];
696
+ #if (CR_BIN_SQR > 1 * 32)
697
+ sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
698
+ #endif
699
+ #if (CR_BIN_SQR > 2 * 32)
700
+ sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
701
+ #endif
702
+ #if (CR_BIN_SQR > 4 * 32)
703
+ sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
704
+ #endif
705
+
706
+ if (thrInBlock == CR_BIN_SQR / 32 - 1)
707
+ s_firstActiveIdx = atomicAdd(&atomics.numActiveTiles, sum);
708
+ }
709
+
710
+ // Tile per thread: Output active tiles.
711
+
712
+ __syncthreads();
713
+
714
+ for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
715
+ {
716
+ int tileInBin = tileInBin_base + thrInBlock;
717
+ bool act = (tileInBin < CR_BIN_SQR) && (s_tileStreamCurrOfs[tileInBin] >= 0);
718
+ U32 actMask = __ballot_sync(~0u, act);
719
+ if (act)
720
+ {
721
+ int activeIdx = s_firstActiveIdx;
722
+ activeIdx += s_scanTemp[0][(tileInBin >> 5) + 15];
723
+ activeIdx += __popc(actMask & getLaneMaskLt());
724
+ activeTiles[activeIdx] = binTileIdx + globalTileIdx(tileInBin, p.widthTiles);
725
+ }
726
+ }
727
+ }
728
+ }
729
+
730
+ //------------------------------------------------------------------------
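
The Count phase above takes a shortcut when every per-tile counter in a warp is 0 or 1: instead of a log2(32)-step shared-memory scan, it builds the inclusive prefix sum from a single ballot and a popcount of the lanes at or below the current one. A minimal standalone CUDA sketch of that idiom, assuming full-warp participation (the laneMaskLe helper below is an illustrative stand-in for the getLaneMaskLe utility the kernel calls):

__device__ __inline__ unsigned int laneMaskLe()
{
    // Mask with bits 0..laneId set, i.e. all lanes at or below the current lane.
    return 0xFFFFFFFFu >> (31 - (threadIdx.x & 31));
}

__device__ __inline__ int inclusivePrefixSumOfBit(int bit)        // bit is 0 or 1
{
    unsigned int ballot = __ballot_sync(0xFFFFFFFFu, bit != 0);
    return __popc(ballot & laneMaskLe());                         // inclusive prefix sum over the warp
}
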
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Constants.hpp ADDED
@@ -0,0 +1,73 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #pragma once
10
+
11
+ //------------------------------------------------------------------------
12
+
13
+ #define CR_MAXVIEWPORT_LOG2 11 // ViewportSize / PixelSize.
14
+ #define CR_SUBPIXEL_LOG2 4 // PixelSize / SubpixelSize.
15
+
16
+ #define CR_MAXBINS_LOG2 4 // ViewportSize / BinSize.
17
+ #define CR_BIN_LOG2 4 // BinSize / TileSize.
18
+ #define CR_TILE_LOG2 3 // TileSize / PixelSize.
19
+
20
+ #define CR_COVER8X8_LUT_SIZE 768 // 64-bit entries.
21
+ #define CR_FLIPBIT_FLIP_Y 2
22
+ #define CR_FLIPBIT_FLIP_X 3
23
+ #define CR_FLIPBIT_SWAP_XY 4
24
+ #define CR_FLIPBIT_COMPL 5
25
+
26
+ #define CR_BIN_STREAMS_LOG2 4
27
+ #define CR_BIN_SEG_LOG2 9 // 32-bit entries.
28
+ #define CR_TILE_SEG_LOG2 5 // 32-bit entries.
29
+
30
+ #define CR_MAXSUBTRIS_LOG2 24 // Triangle structs. Dictated by CoarseRaster.
31
+ #define CR_COARSE_QUEUE_LOG2 10 // Triangles.
32
+
33
+ #define CR_SETUP_WARPS 2
34
+ #define CR_SETUP_OPT_BLOCKS 8
35
+ #define CR_BIN_WARPS 16
36
+ #define CR_COARSE_WARPS 16 // Must be a power of two.
37
+ #define CR_FINE_MAX_WARPS 20
38
+
39
+ #define CR_EMBED_IMAGE_PARAMS 32 // Number of per-image parameter structs embedded in kernel launch parameter block.
40
+
41
+ //------------------------------------------------------------------------
42
+
43
+ #define CR_MAXVIEWPORT_SIZE (1 << CR_MAXVIEWPORT_LOG2)
44
+ #define CR_SUBPIXEL_SIZE (1 << CR_SUBPIXEL_LOG2)
45
+ #define CR_SUBPIXEL_SQR (1 << (CR_SUBPIXEL_LOG2 * 2))
46
+
47
+ #define CR_MAXBINS_SIZE (1 << CR_MAXBINS_LOG2)
48
+ #define CR_MAXBINS_SQR (1 << (CR_MAXBINS_LOG2 * 2))
49
+ #define CR_BIN_SIZE (1 << CR_BIN_LOG2)
50
+ #define CR_BIN_SQR (1 << (CR_BIN_LOG2 * 2))
51
+
52
+ #define CR_MAXTILES_LOG2 (CR_MAXBINS_LOG2 + CR_BIN_LOG2)
53
+ #define CR_MAXTILES_SIZE (1 << CR_MAXTILES_LOG2)
54
+ #define CR_MAXTILES_SQR (1 << (CR_MAXTILES_LOG2 * 2))
55
+ #define CR_TILE_SIZE (1 << CR_TILE_LOG2)
56
+ #define CR_TILE_SQR (1 << (CR_TILE_LOG2 * 2))
57
+
58
+ #define CR_BIN_STREAMS_SIZE (1 << CR_BIN_STREAMS_LOG2)
59
+ #define CR_BIN_SEG_SIZE (1 << CR_BIN_SEG_LOG2)
60
+ #define CR_TILE_SEG_SIZE (1 << CR_TILE_SEG_LOG2)
61
+
62
+ #define CR_MAXSUBTRIS_SIZE (1 << CR_MAXSUBTRIS_LOG2)
63
+ #define CR_COARSE_QUEUE_SIZE (1 << CR_COARSE_QUEUE_LOG2)
64
+
65
+ //------------------------------------------------------------------------
66
+ // When evaluating interpolated Z pixel centers, we may introduce an error
67
+ // of (+-CR_LERP_ERROR) ULPs.
68
+
69
+ #define CR_LERP_ERROR(SAMPLES_LOG2) (2200u << (SAMPLES_LOG2))
70
+ #define CR_DEPTH_MIN CR_LERP_ERROR(3)
71
+ #define CR_DEPTH_MAX (CR_U32_MAX - CR_LERP_ERROR(3))
72
+
73
+ //------------------------------------------------------------------------
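
Taken together, the log2 constants above fix the spatial hierarchy of the rasterizer: a tile is 8x8 pixels, a bin is 16x16 tiles (128x128 pixels), and the largest supported viewport is 2048x2048 pixels. A brief sketch that spells out these derived sizes as hypothetical compile-time checks (not part of the header itself):

static_assert((1 << CR_TILE_LOG2) == 8,                    "a tile is 8x8 pixels");
static_assert((1 << CR_BIN_LOG2) == 16,                    "a bin is 16x16 tiles");
static_assert((1 << (CR_BIN_LOG2 + CR_TILE_LOG2)) == 128,  "a bin covers 128x128 pixels");
static_assert((1 << CR_MAXVIEWPORT_LOG2) == 2048,          "maximum viewport is 2048x2048 pixels");
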
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CudaRaster.cpp ADDED
@@ -0,0 +1,79 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #include "Defs.hpp"
10
+ #include "../CudaRaster.hpp"
11
+ #include "RasterImpl.hpp"
12
+
13
+ using namespace CR;
14
+
15
+ //------------------------------------------------------------------------
16
+ // Stub interface implementation.
17
+ //------------------------------------------------------------------------
18
+
19
+ CudaRaster::CudaRaster()
20
+ {
21
+ m_impl = new RasterImpl();
22
+ }
23
+
24
+ CudaRaster::~CudaRaster()
25
+ {
26
+ delete m_impl;
27
+ }
28
+
29
+ void CudaRaster::setBufferSize(int width, int height, int numImages)
30
+ {
31
+ m_impl->setBufferSize(Vec3i(width, height, numImages));
32
+ }
33
+
34
+ void CudaRaster::setViewport(int width, int height, int offsetX, int offsetY)
35
+ {
36
+ m_impl->setViewport(Vec2i(width, height), Vec2i(offsetX, offsetY));
37
+ }
38
+
39
+ void CudaRaster::setRenderModeFlags(U32 flags)
40
+ {
41
+ m_impl->setRenderModeFlags(flags);
42
+ }
43
+
44
+ void CudaRaster::deferredClear(U32 clearColor)
45
+ {
46
+ m_impl->deferredClear(clearColor);
47
+ }
48
+
49
+ void CudaRaster::setVertexBuffer(void* vertices, int numVertices)
50
+ {
51
+ m_impl->setVertexBuffer(vertices, numVertices);
52
+ }
53
+
54
+ void CudaRaster::setIndexBuffer(void* indices, int numTriangles)
55
+ {
56
+ m_impl->setIndexBuffer(indices, numTriangles);
57
+ }
58
+
59
+ bool CudaRaster::drawTriangles(const int* ranges, bool peel, cudaStream_t stream)
60
+ {
61
+ return m_impl->drawTriangles((const Vec2i*)ranges, peel, stream);
62
+ }
63
+
64
+ void* CudaRaster::getColorBuffer(void)
65
+ {
66
+ return m_impl->getColorBuffer();
67
+ }
68
+
69
+ void* CudaRaster::getDepthBuffer(void)
70
+ {
71
+ return m_impl->getDepthBuffer();
72
+ }
73
+
74
+ void CudaRaster::swapDepthAndPeel(void)
75
+ {
76
+ m_impl->swapDepthAndPeel();
77
+ }
78
+
79
+ //------------------------------------------------------------------------
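
The stub above simply forwards to RasterImpl, so the public workflow is: size the buffers, set the viewport, point the rasterizer at clip-space vertices and triangle indices, then draw. A minimal host-side usage sketch under the assumption that d_vertices and d_indices are caller-prepared device pointers (float4 clip-space positions and int3 index triplets, as documented in PrivateDefs.hpp):

CR::CudaRaster cr;
cr.setBufferSize(1024, 1024, 1);                 // internal framebuffer: 1024x1024, one image
cr.setViewport(1024, 1024, 0, 0);                // viewport covers the whole buffer
cr.deferredClear(0u);                            // clear lazily while rasterizing
cr.setVertexBuffer(d_vertices, numVertices);     // float4(x, y, z, w) clip-space positions
cr.setIndexBuffer(d_indices, numTriangles);      // int3(vi0, vi1, vi2) per triangle
bool ok = cr.drawTriangles(nullptr, false, 0);   // nullptr ranges => instance mode, no depth peeling
void* d_color = cr.getColorBuffer();             // U32 per pixel
void* d_depth = cr.getDepthBuffer();             // U32 per pixel

drawTriangles blocks until rasterization completes and, per RasterImpl.cpp below, returns false only when the internal buffers cannot be grown any further (the CR_MAXSUBTRIS_SIZE cap).
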
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Defs.hpp ADDED
@@ -0,0 +1,90 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #pragma once
10
+ #include <cuda_runtime.h>
11
+ #include <cstdint>
12
+
13
+ namespace CR
14
+ {
15
+ //------------------------------------------------------------------------
16
+
17
+ #ifndef NULL
18
+ # define NULL 0
19
+ #endif
20
+
21
+ #ifdef __CUDACC__
22
+ # define CR_CUDA 1
23
+ #else
24
+ # define CR_CUDA 0
25
+ #endif
26
+
27
+ #if CR_CUDA
28
+ # define CR_CUDA_FUNC __device__ __inline__
29
+ # define CR_CUDA_CONST __constant__
30
+ #else
31
+ # define CR_CUDA_FUNC inline
32
+ # define CR_CUDA_CONST static const
33
+ #endif
34
+
35
+ #define CR_UNREF(X) ((void)(X))
36
+ #define CR_ARRAY_SIZE(X) ((int)(sizeof(X) / sizeof((X)[0])))
37
+
38
+ //------------------------------------------------------------------------
39
+
40
+ typedef uint8_t U8;
41
+ typedef uint16_t U16;
42
+ typedef uint32_t U32;
43
+ typedef uint64_t U64;
44
+ typedef int8_t S8;
45
+ typedef int16_t S16;
46
+ typedef int32_t S32;
47
+ typedef int64_t S64;
48
+ typedef float F32;
49
+ typedef double F64;
50
+ typedef void (*FuncPtr)(void);
51
+
52
+ //------------------------------------------------------------------------
53
+
54
+ #define CR_U32_MAX (0xFFFFFFFFu)
55
+ #define CR_S32_MIN (~0x7FFFFFFF)
56
+ #define CR_S32_MAX (0x7FFFFFFF)
57
+ #define CR_U64_MAX ((U64)(S64)-1)
58
+ #define CR_S64_MIN ((S64)-1 << 63)
59
+ #define CR_S64_MAX (~((S64)-1 << 63))
60
+ #define CR_F32_MIN (1.175494351e-38f)
61
+ #define CR_F32_MAX (3.402823466e+38f)
62
+ #define CR_F64_MIN (2.2250738585072014e-308)
63
+ #define CR_F64_MAX (1.7976931348623158e+308)
64
+
65
+ //------------------------------------------------------------------------
66
+ // Misc types.
67
+
68
+ class Vec2i
69
+ {
70
+ public:
71
+ Vec2i(int x_, int y_) : x(x_), y(y_) {}
72
+ int x, y;
73
+ };
74
+
75
+ class Vec3i
76
+ {
77
+ public:
78
+ Vec3i(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
79
+ int x, y, z;
80
+ };
81
+
82
+ //------------------------------------------------------------------------
83
+ // CUDA utilities.
84
+
85
+ #if CR_CUDA
86
+ # define globalThreadIdx (threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * (blockIdx.x + gridDim.x * blockIdx.y)))
87
+ #endif
88
+
89
+ //------------------------------------------------------------------------
90
+ } // namespace CR
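
As a brief illustration of the helpers above, a hypothetical kernel (not part of the upload) can use globalThreadIdx to flatten a 2D block / 2D grid launch into one linear index, and CR_ARRAY_SIZE to count the elements of a static array:

__global__ void fillKernel(CR::U32* out, int count)
{
    int idx = globalThreadIdx;      // threadIdx/blockIdx flattened into a linear index
    if (idx < count)
        out[idx] = (CR::U32)idx;
}

static const CR::S32 batchSizes[] = { 1, 2, 4, 8 };
static const int numBatchSizes = CR_ARRAY_SIZE(batchSizes);   // == 4
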
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_FineRaster.inl ADDED
@@ -0,0 +1,385 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ //------------------------------------------------------------------------
10
+ // Utility funcs.
11
+ //------------------------------------------------------------------------
12
+
13
+ __device__ __inline__ void initTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth)
14
+ {
15
+ tileZMax = CR_DEPTH_MAX;
16
+ tileZUpd = (::min(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]) < tileZMax);
17
+ }
18
+
19
+ __device__ __inline__ void updateTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth, volatile U32* temp)
20
+ {
21
+ // Entry is warp-coherent.
22
+ if (__any_sync(~0u, tileZUpd))
23
+ {
24
+ U32 z = ::max(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]); __syncwarp();
25
+ temp[threadIdx.x + 16] = z; __syncwarp();
26
+ z = ::max(z, temp[threadIdx.x + 16 - 1]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
27
+ z = ::max(z, temp[threadIdx.x + 16 - 2]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
28
+ z = ::max(z, temp[threadIdx.x + 16 - 4]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
29
+ z = ::max(z, temp[threadIdx.x + 16 - 8]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
30
+ z = ::max(z, temp[threadIdx.x + 16 - 16]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
31
+ tileZMax = temp[47];
32
+ tileZUpd = false;
33
+ }
34
+ }
35
+
36
+ //------------------------------------------------------------------------
37
+
38
+ __device__ __inline__ void getTriangle(const CRParams& p, S32& triIdx, S32& dataIdx, uint4& triHeader, S32& segment)
39
+ {
40
+ const CRTriangleHeader* triHeaderPtr = (const CRTriangleHeader*)p.triHeader + blockIdx.z * p.maxSubtris;
41
+ const S32* tileSegData = (const S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
42
+ const S32* tileSegNext = (const S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z;
43
+ const S32* tileSegCount = (const S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
44
+
45
+ if (threadIdx.x >= tileSegCount[segment])
46
+ {
47
+ triIdx = -1;
48
+ dataIdx = -1;
49
+ }
50
+ else
51
+ {
52
+ int subtriIdx = tileSegData[segment * CR_TILE_SEG_SIZE + threadIdx.x];
53
+ triIdx = subtriIdx >> 3;
54
+ dataIdx = triIdx;
55
+ subtriIdx &= 7;
56
+ if (subtriIdx != 7)
57
+ dataIdx = triHeaderPtr[triIdx].misc + subtriIdx;
58
+ triHeader = *((uint4*)triHeaderPtr + dataIdx);
59
+ }
60
+
61
+ // advance to next segment
62
+ segment = tileSegNext[segment];
63
+ }
64
+
65
+ //------------------------------------------------------------------------
66
+
67
+ __device__ __inline__ bool earlyZCull(uint4 triHeader, U32 tileZMax)
68
+ {
69
+ U32 zmin = triHeader.w & 0xFFFFF000;
70
+ return (zmin > tileZMax);
71
+ }
72
+
73
+ //------------------------------------------------------------------------
74
+
75
+ __device__ __inline__ U64 trianglePixelCoverage(const CRParams& p, const uint4& triHeader, int tileX, int tileY, volatile U64* s_cover8x8_lut)
76
+ {
77
+ int baseX = (tileX << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.widthPixelsVp - 1) << (CR_SUBPIXEL_LOG2 - 1));
78
+ int baseY = (tileY << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.heightPixelsVp - 1) << (CR_SUBPIXEL_LOG2 - 1));
79
+
80
+ // extract S16 vertex positions while subtracting tile coordinates
81
+ S32 v0x = sub_s16lo_s16lo(triHeader.x, baseX);
82
+ S32 v0y = sub_s16hi_s16lo(triHeader.x, baseY);
83
+ S32 v01x = sub_s16lo_s16lo(triHeader.y, triHeader.x);
84
+ S32 v01y = sub_s16hi_s16hi(triHeader.y, triHeader.x);
85
+ S32 v20x = sub_s16lo_s16lo(triHeader.x, triHeader.z);
86
+ S32 v20y = sub_s16hi_s16hi(triHeader.x, triHeader.z);
87
+
88
+ // extract flipbits
89
+ U32 f01 = (triHeader.w >> 6) & 0x3C;
90
+ U32 f12 = (triHeader.w >> 2) & 0x3C;
91
+ U32 f20 = (triHeader.w << 2) & 0x3C;
92
+
93
+ // compute per-edge coverage masks
94
+ U64 c01, c12, c20;
95
+ c01 = cover8x8_exact_fast(v0x, v0y, v01x, v01y, f01, s_cover8x8_lut);
96
+ c12 = cover8x8_exact_fast(v0x + v01x, v0y + v01y, -v01x - v20x, -v01y - v20y, f12, s_cover8x8_lut);
97
+ c20 = cover8x8_exact_fast(v0x, v0y, v20x, v20y, f20, s_cover8x8_lut);
98
+
99
+ // combine masks
100
+ return c01 & c12 & c20;
101
+ }
102
+
103
+ //------------------------------------------------------------------------
104
+
105
+ __device__ __inline__ U32 scan32_value(U32 value, volatile U32* temp)
106
+ {
107
+ __syncwarp();
108
+ temp[threadIdx.x + 16] = value; __syncwarp();
109
+ value += temp[threadIdx.x + 16 - 1]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
110
+ value += temp[threadIdx.x + 16 - 2]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
111
+ value += temp[threadIdx.x + 16 - 4]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
112
+ value += temp[threadIdx.x + 16 - 8]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
113
+ value += temp[threadIdx.x + 16 - 16]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
114
+ return value;
115
+ }
116
+
117
+ __device__ __inline__ volatile const U32& scan32_total(volatile U32* temp)
118
+ {
119
+ return temp[47];
120
+ }
121
+
122
+ //------------------------------------------------------------------------
123
+
124
+ __device__ __inline__ S32 findBit(U64 mask, int idx)
125
+ {
126
+ U32 x = getLo(mask);
127
+ int pop = __popc(x);
128
+ bool p = (pop <= idx);
129
+ if (p) x = getHi(mask);
130
+ if (p) idx -= pop;
131
+ int bit = p ? 32 : 0;
132
+
133
+ pop = __popc(x & 0x0000ffffu);
134
+ p = (pop <= idx);
135
+ if (p) x >>= 16;
136
+ if (p) bit += 16;
137
+ if (p) idx -= pop;
138
+
139
+ U32 tmp = x & 0x000000ffu;
140
+ pop = __popc(tmp);
141
+ p = (pop <= idx);
142
+ if (p) tmp = x & 0x0000ff00u;
143
+ if (p) idx -= pop;
144
+
145
+ return findLeadingOne(tmp) + bit - idx;
146
+ }
147
+
148
+ //------------------------------------------------------------------------
149
+ // Single-sample implementation.
150
+ //------------------------------------------------------------------------
151
+
152
+ __device__ __inline__ void executeROP(U32 color, U32 depth, volatile U32* pColor, volatile U32* pDepth, U32 ropMask)
153
+ {
154
+ atomicMin((U32*)pDepth, depth);
155
+ __syncwarp(ropMask);
156
+ bool act = (depth == *pDepth);
157
+ __syncwarp(ropMask);
158
+ U32 actMask = __ballot_sync(ropMask, act);
159
+ if (act)
160
+ {
161
+ *pDepth = 0;
162
+ __syncwarp(actMask);
163
+ atomicMax((U32*)pDepth, threadIdx.x);
164
+ __syncwarp(actMask);
165
+ if (*pDepth == threadIdx.x)
166
+ {
167
+ *pDepth = depth;
168
+ *pColor = color;
169
+ }
170
+ __syncwarp(actMask);
171
+ }
172
+ }
173
+
174
+ //------------------------------------------------------------------------
175
+
176
+ __device__ __inline__ void fineRasterImpl(const CRParams p)
177
+ {
178
+ // for 20 warps:
179
+ __shared__ volatile U64 s_cover8x8_lut[CR_COVER8X8_LUT_SIZE]; // 6KB
180
+ __shared__ volatile U32 s_tileColor [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
181
+ __shared__ volatile U32 s_tileDepth [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
182
+ __shared__ volatile U32 s_tilePeel [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
183
+ __shared__ volatile U32 s_triDataIdx [CR_FINE_MAX_WARPS][64]; // 5KB CRTriangleData index
184
+ __shared__ volatile U64 s_triangleCov [CR_FINE_MAX_WARPS][64]; // 10KB coverage mask
185
+ __shared__ volatile U32 s_triangleFrag[CR_FINE_MAX_WARPS][64]; // 5KB fragment index
186
+ __shared__ volatile U32 s_temp [CR_FINE_MAX_WARPS][80]; // 6.25KB
187
+ // = 47.25KB total
188
+
189
+ CRAtomics& atomics = p.atomics[blockIdx.z];
190
+ const CRTriangleData* triData = (const CRTriangleData*)p.triData + blockIdx.z * p.maxSubtris;
191
+
192
+ const S32* activeTiles = (const S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z;
193
+ const S32* tileFirstSeg = (const S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
194
+
195
+ volatile U32* tileColor = s_tileColor[threadIdx.y];
196
+ volatile U32* tileDepth = s_tileDepth[threadIdx.y];
197
+ volatile U32* tilePeel = s_tilePeel[threadIdx.y];
198
+ volatile U32* triDataIdx = s_triDataIdx[threadIdx.y];
199
+ volatile U64* triangleCov = s_triangleCov[threadIdx.y];
200
+ volatile U32* triangleFrag = s_triangleFrag[threadIdx.y];
201
+ volatile U32* temp = s_temp[threadIdx.y];
202
+
203
+ if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs || atomics.numTileSegs > p.maxTileSegs)
204
+ return;
205
+
206
+ temp[threadIdx.x] = 0; // first 16 elements of temp are always zero
207
+ cover8x8_setupLUT(s_cover8x8_lut);
208
+ __syncthreads();
209
+
210
+ // loop over tiles
211
+ for (;;)
212
+ {
213
+ // pick a tile
214
+ if (threadIdx.x == 0)
215
+ temp[16] = atomicAdd(&atomics.fineCounter, 1);
216
+ __syncwarp();
217
+ int activeIdx = temp[16];
218
+ if (activeIdx >= atomics.numActiveTiles)
219
+ break;
220
+
221
+ int tileIdx = activeTiles[activeIdx];
222
+ S32 segment = tileFirstSeg[tileIdx];
223
+ int tileY = tileIdx / p.widthTiles;
224
+ int tileX = tileIdx - tileY * p.widthTiles;
225
+ int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1));
226
+ int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2);
227
+
228
+ // initialize per-tile state
229
+ int triRead = 0, triWrite = 0;
230
+ int fragRead = 0, fragWrite = 0;
231
+ if (threadIdx.x == 0)
232
+ triangleFrag[63] = 0; // "previous triangle"
233
+
234
+ // deferred clear => clear tile
235
+ if (p.deferredClear)
236
+ {
237
+ tileColor[threadIdx.x] = p.clearColor;
238
+ tileDepth[threadIdx.x] = p.clearDepth;
239
+ tileColor[threadIdx.x + 32] = p.clearColor;
240
+ tileDepth[threadIdx.x + 32] = p.clearDepth;
241
+ }
242
+ else // otherwise => read tile from framebuffer
243
+ {
244
+ U32* pColor = (U32*)p.colorBuffer + p.strideX * p.strideY * blockIdx.z;
245
+ U32* pDepth = (U32*)p.depthBuffer + p.strideX * p.strideY * blockIdx.z;
246
+ tileColor[threadIdx.x] = pColor[px + p.strideX * py];
247
+ tileDepth[threadIdx.x] = pDepth[px + p.strideX * py];
248
+ tileColor[threadIdx.x + 32] = pColor[px + p.strideX * (py + 4)];
249
+ tileDepth[threadIdx.x + 32] = pDepth[px + p.strideX * (py + 4)];
250
+ }
251
+
252
+ // read peeling inputs if enabled
253
+ if (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling)
254
+ {
255
+ U32* pPeel = (U32*)p.peelBuffer + p.strideX * p.strideY * blockIdx.z;
256
+ tilePeel[threadIdx.x] = pPeel[px + p.strideX * py];
257
+ tilePeel[threadIdx.x + 32] = pPeel[px + p.strideX * (py + 4)];
258
+ }
259
+
260
+ U32 tileZMax;
261
+ bool tileZUpd;
262
+ initTileZMax(tileZMax, tileZUpd, tileDepth);
263
+
264
+ // process fragments
265
+ for(;;)
266
+ {
267
+ // need to queue more fragments?
268
+ if (fragWrite - fragRead < 32 && segment >= 0)
269
+ {
270
+ // update tile z - coherent over warp
271
+ updateTileZMax(tileZMax, tileZUpd, tileDepth, temp);
272
+
273
+ // read triangles
274
+ do
275
+ {
276
+ // read triangle index and data, advance to next segment
277
+ S32 triIdx, dataIdx;
278
+ uint4 triHeader;
279
+ getTriangle(p, triIdx, dataIdx, triHeader, segment);
280
+
281
+ // early z cull
282
+ if (triIdx >= 0 && earlyZCull(triHeader, tileZMax))
283
+ triIdx = -1;
284
+
285
+ // determine coverage
286
+ U64 coverage = trianglePixelCoverage(p, triHeader, tileX, tileY, s_cover8x8_lut);
287
+ S32 pop = (triIdx == -1) ? 0 : __popcll(coverage);
288
+
289
+ // fragment count scan
290
+ U32 frag = scan32_value(pop, temp);
291
+ frag += fragWrite; // frag now holds cumulative fragment count
292
+ fragWrite += scan32_total(temp);
293
+
294
+ // queue non-empty triangles
295
+ U32 goodMask = __ballot_sync(~0u, pop != 0);
296
+ if (pop != 0)
297
+ {
298
+ int idx = (triWrite + __popc(goodMask & getLaneMaskLt())) & 63;
299
+ triDataIdx [idx] = dataIdx;
300
+ triangleFrag[idx] = frag;
301
+ triangleCov [idx] = coverage;
302
+ }
303
+ triWrite += __popc(goodMask);
304
+ }
305
+ while (fragWrite - fragRead < 32 && segment >= 0);
306
+ }
307
+ __syncwarp();
308
+
309
+ // end of segment?
310
+ if (fragRead == fragWrite)
311
+ break;
312
+
313
+ // clear triangle boundaries
314
+ temp[threadIdx.x + 16] = 0;
315
+ __syncwarp();
316
+
317
+ // tag triangle boundaries
318
+ if (triRead + threadIdx.x < triWrite)
319
+ {
320
+ int idx = triangleFrag[(triRead + threadIdx.x) & 63] - fragRead;
321
+ if (idx <= 32)
322
+ temp[idx + 16 - 1] = 1;
323
+ }
324
+ __syncwarp();
325
+
326
+ int ropLaneIdx = threadIdx.x;
327
+ U32 boundaryMask = __ballot_sync(~0u, temp[ropLaneIdx + 16]);
328
+
329
+ // distribute fragments
330
+ bool hasFragment = (ropLaneIdx < fragWrite - fragRead);
331
+ U32 fragmentMask = __ballot_sync(~0u, hasFragment);
332
+ if (hasFragment)
333
+ {
334
+ int triBufIdx = (triRead + __popc(boundaryMask & getLaneMaskLt())) & 63;
335
+ int fragIdx = add_sub(fragRead, ropLaneIdx, triangleFrag[(triBufIdx - 1) & 63]);
336
+ U64 coverage = triangleCov[triBufIdx];
337
+ int pixelInTile = findBit(coverage, fragIdx);
338
+ int dataIdx = triDataIdx[triBufIdx];
339
+
340
+ // determine pixel position
341
+ U32 pixelX = (tileX << CR_TILE_LOG2) + (pixelInTile & 7);
342
+ U32 pixelY = (tileY << CR_TILE_LOG2) + (pixelInTile >> 3);
343
+
344
+ // depth test
345
+ U32 depth = 0;
346
+ uint4 td = *((uint4*)triData + dataIdx * (sizeof(CRTriangleData) >> 4));
347
+
348
+ depth = td.x * pixelX + td.y * pixelY + td.z;
349
+ bool zkill = (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) && (depth <= tilePeel[pixelInTile]);
350
+ if (!zkill)
351
+ {
352
+ U32 oldDepth = tileDepth[pixelInTile];
353
+ if (depth > oldDepth)
354
+ zkill = true;
355
+ else if (oldDepth == tileZMax)
356
+ tileZUpd = true; // we are replacing previous zmax => need to update
357
+ }
358
+
359
+ U32 ropMask = __ballot_sync(fragmentMask, !zkill);
360
+ if (!zkill)
361
+ executeROP(td.w, depth, &tileColor[pixelInTile], &tileDepth[pixelInTile], ropMask);
362
+ }
363
+ // no need to sync, as next up is updateTileZMax that does internal warp sync
364
+
365
+ // update counters
366
+ fragRead = ::min(fragRead + 32, fragWrite);
367
+ triRead += __popc(boundaryMask);
368
+ }
369
+
370
+ // Write tile back to the framebuffer.
371
+ if (true)
372
+ {
373
+ int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1));
374
+ int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2);
375
+ U32* pColor = (U32*)p.colorBuffer + p.strideX * p.strideY * blockIdx.z;
376
+ U32* pDepth = (U32*)p.depthBuffer + p.strideX * p.strideY * blockIdx.z;
377
+ pColor[px + p.strideX * py] = tileColor[threadIdx.x];
378
+ pDepth[px + p.strideX * py] = tileDepth[threadIdx.x];
379
+ pColor[px + p.strideX * (py + 4)] = tileColor[threadIdx.x + 32];
380
+ pDepth[px + p.strideX * (py + 4)] = tileDepth[threadIdx.x + 32];
381
+ }
382
+ }
383
+ }
384
+
385
+ //------------------------------------------------------------------------
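
Because a tile is 8x8 pixels, one 64-bit word holds a full per-triangle coverage mask, and the fragment distribution above uses findBit(coverage, fragIdx) to turn the i-th set bit back into a pixel position inside the tile. A straightforward, non-optimized reference for that mapping (useful mainly for sanity-checking the branch-reduced findBit above) might look like this:

__device__ __inline__ int findBitReference(unsigned long long coverage, int idx)
{
    // Return the position of the idx-th set bit; pixelX = bit & 7, pixelY = bit >> 3.
    for (int bit = 0; bit < 64; bit++)
        if ((coverage >> bit) & 1ull)
            if (idx-- == 0)
                return bit;
    return -1;  // idx exceeds the number of covered samples
}
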
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_PrivateDefs.hpp ADDED
@@ -0,0 +1,153 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #pragma once
10
+ #include "Defs.hpp"
11
+ #include "Constants.hpp"
12
+
13
+ namespace CR
14
+ {
15
+ //------------------------------------------------------------------------
16
+ // Projected triangle.
17
+ //------------------------------------------------------------------------
18
+
19
+ struct CRTriangleHeader
20
+ {
21
+ S16 v0x; // Subpixels relative to viewport center. Valid if triSubtris = 1.
22
+ S16 v0y;
23
+ S16 v1x;
24
+ S16 v1y;
25
+ S16 v2x;
26
+ S16 v2y;
27
+
28
+ U32 misc; // triSubtris=1: (zmin:20, f01:4, f12:4, f20:4), triSubtris>=2: (subtriBase)
29
+ };
30
+
31
+ //------------------------------------------------------------------------
32
+
33
+ struct CRTriangleData
34
+ {
35
+ U32 zx; // zx * sampleX + zy * sampleY + zb = lerp(CR_DEPTH_MIN, CR_DEPTH_MAX, (clipZ / clipW + 1) / 2)
36
+ U32 zy;
37
+ U32 zb;
38
+ U32 id; // Triangle id.
39
+ };
40
+
41
+ //------------------------------------------------------------------------
42
+ // Device-side structures.
43
+ //------------------------------------------------------------------------
44
+
45
+ struct CRAtomics
46
+ {
47
+ // Setup.
48
+ S32 numSubtris; // = numTris
49
+
50
+ // Bin.
51
+ S32 binCounter; // = 0
52
+ S32 numBinSegs; // = 0
53
+
54
+ // Coarse.
55
+ S32 coarseCounter; // = 0
56
+ S32 numTileSegs; // = 0
57
+ S32 numActiveTiles; // = 0
58
+
59
+ // Fine.
60
+ S32 fineCounter; // = 0
61
+ };
62
+
63
+ //------------------------------------------------------------------------
64
+
65
+ struct CRImageParams
66
+ {
67
+ S32 triOffset; // First triangle index to draw.
68
+ S32 triCount; // Number of triangles to draw.
69
+ S32 binBatchSize; // Number of triangles per batch.
70
+ };
71
+
72
+ //------------------------------------------------------------------------
73
+
74
+ struct CRParams
75
+ {
76
+ // Common.
77
+
78
+ CRAtomics* atomics; // Work counters. Per-image.
79
+ S32 numImages; // Batch size.
80
+ S32 totalCount; // In range mode, total number of triangles to render.
81
+ S32 instanceMode; // 0 = range mode, 1 = instance mode.
82
+
83
+ S32 numVertices; // Number of vertices in input buffer, not counting multiples in instance mode.
84
+ S32 numTriangles; // Number of triangles in input buffer.
85
+ void* vertexBuffer; // numVertices * float4(x, y, z, w)
86
+ void* indexBuffer; // numTriangles * int3(vi0, vi1, vi2)
87
+
88
+ S32 widthPixels; // Render buffer size in pixels. Must be multiple of tile size (8x8).
89
+ S32 heightPixels;
90
+ S32 widthPixelsVp; // Viewport size in pixels.
91
+ S32 heightPixelsVp;
92
+ S32 widthBins; // widthPixels / CR_BIN_SIZE
93
+ S32 heightBins; // heightPixels / CR_BIN_SIZE
94
+ S32 numBins; // widthBins * heightBins
95
+
96
+ F32 xs; // Vertex position adjustments for tiled rendering.
97
+ F32 ys;
98
+ F32 xo;
99
+ F32 yo;
100
+
101
+ S32 widthTiles; // widthPixels / CR_TILE_SIZE
102
+ S32 heightTiles; // heightPixels / CR_TILE_SIZE
103
+ S32 numTiles; // widthTiles * heightTiles
104
+
105
+ U32 renderModeFlags;
106
+ S32 deferredClear; // 1 = Clear framebuffer before rendering triangles.
107
+ U32 clearColor;
108
+ U32 clearDepth;
109
+
110
+ // These are uniform across batch.
111
+
112
+ S32 maxSubtris;
113
+ S32 maxBinSegs;
114
+ S32 maxTileSegs;
115
+
116
+ // Setup output / bin input.
117
+
118
+ void* triSubtris; // maxSubtris * U8
119
+ void* triHeader; // maxSubtris * CRTriangleHeader
120
+ void* triData; // maxSubtris * CRTriangleData
121
+
122
+ // Bin output / coarse input.
123
+
124
+ void* binSegData; // maxBinSegs * CR_BIN_SEG_SIZE * S32
125
+ void* binSegNext; // maxBinSegs * S32
126
+ void* binSegCount; // maxBinSegs * S32
127
+ void* binFirstSeg; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 segIdx), -1 = none
128
+ void* binTotal; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 numTris)
129
+
130
+ // Coarse output / fine input.
131
+
132
+ void* tileSegData; // maxTileSegs * CR_TILE_SEG_SIZE * S32
133
+ void* tileSegNext; // maxTileSegs * S32
134
+ void* tileSegCount; // maxTileSegs * S32
135
+ void* activeTiles; // CR_MAXTILES_SQR * (S32 tileIdx)
136
+ void* tileFirstSeg; // CR_MAXTILES_SQR * (S32 segIdx), -1 = none
137
+
138
+ // Surface buffers. Outer tile offset is baked into pointers.
139
+
140
+ void* colorBuffer; // sizePixels.x * sizePixels.y * numImages * U32
141
+ void* depthBuffer; // sizePixels.x * sizePixels.y * numImages * U32
142
+ void* peelBuffer; // sizePixels.x * sizePixels.y * numImages * U32, only if peeling enabled.
143
+ S32 strideX; // horizontal size in pixels
144
+ S32 strideY; // vertical stride in pixels
145
+
146
+ // Per-image parameters for first images are embedded here to avoid extra memcpy for small batches.
147
+
148
+ CRImageParams imageParamsFirst[CR_EMBED_IMAGE_PARAMS];
149
+ const CRImageParams* imageParamsExtra; // After CR_EMBED_IMAGE_PARAMS.
150
+ };
151
+
152
+ //------------------------------------------------------------------------
153
+ }
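
The CRTriangleData comment above fixes the depth encoding: the per-triangle plane zx*x + zy*y + zb, evaluated at a sample position, must equal CR_DEPTH_MIN + (CR_DEPTH_MAX - CR_DEPTH_MIN) * (clipZ/clipW + 1) / 2. A small illustrative host-side helper for that scalar mapping (the actual per-triangle coefficients are fitted by the setup kernel, not by code like this):

static inline CR::U32 encodeDepth(float ndcZ)   // ndcZ = clipZ / clipW, in [-1, 1]
{
    double t = ((double)ndcZ + 1.0) * 0.5;      // [-1, 1] -> [0, 1]
    double d = (double)CR_DEPTH_MIN + t * ((double)CR_DEPTH_MAX - (double)CR_DEPTH_MIN);
    return (CR::U32)d;                          // fixed-point depth consumed by the fine-raster ROP
}
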
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_RasterImpl.cpp ADDED
@@ -0,0 +1,370 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #include "../../framework.h"
10
+ #include "PrivateDefs.hpp"
11
+ #include "Constants.hpp"
12
+ #include "RasterImpl.hpp"
13
+ #include <cuda_runtime.h>
14
+
15
+ using namespace CR;
16
+ using std::min;
17
+ using std::max;
18
+
19
+ //------------------------------------------------------------------------
20
+ // Kernel prototypes and variables.
21
+
22
+ void triangleSetupKernel (const CRParams p);
23
+ void binRasterKernel (const CRParams p);
24
+ void coarseRasterKernel (const CRParams p);
25
+ void fineRasterKernel (const CRParams p);
26
+
27
+ //------------------------------------------------------------------------
28
+
29
+ RasterImpl::RasterImpl(void)
30
+ : m_renderModeFlags (0),
31
+ m_deferredClear (false),
32
+ m_clearColor (0),
33
+ m_vertexPtr (NULL),
34
+ m_indexPtr (NULL),
35
+ m_numVertices (0),
36
+ m_numTriangles (0),
37
+ m_bufferSizesReported (0),
38
+
39
+ m_numImages (0),
40
+ m_bufferSizePixels (0, 0),
41
+ m_bufferSizeVp (0, 0),
42
+ m_sizePixels (0, 0),
43
+ m_sizeVp (0, 0),
44
+ m_offsetPixels (0, 0),
45
+ m_sizeBins (0, 0),
46
+ m_numBins (0),
47
+ m_sizeTiles (0, 0),
48
+ m_numTiles (0),
49
+
50
+ m_numSMs (1),
51
+ m_numCoarseBlocksPerSM (1),
52
+ m_numFineBlocksPerSM (1),
53
+ m_numFineWarpsPerBlock (1),
54
+
55
+ m_maxSubtris (1),
56
+ m_maxBinSegs (1),
57
+ m_maxTileSegs (1)
58
+ {
59
+ // Query relevant device attributes.
60
+
61
+ int currentDevice = 0;
62
+ NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&currentDevice));
63
+ NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&m_numSMs, cudaDevAttrMultiProcessorCount, currentDevice));
64
+ cudaFuncAttributes attr;
65
+ NVDR_CHECK_CUDA_ERROR(cudaFuncGetAttributes(&attr, (void*)fineRasterKernel));
66
+ m_numFineWarpsPerBlock = min(attr.maxThreadsPerBlock / 32, CR_FINE_MAX_WARPS);
67
+ NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numCoarseBlocksPerSM, (void*)coarseRasterKernel, 32 * CR_COARSE_WARPS, 0));
68
+ NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numFineBlocksPerSM, (void*)fineRasterKernel, 32 * m_numFineWarpsPerBlock, 0));
69
+
70
+ // Setup functions.
71
+
72
+ NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)triangleSetupKernel, cudaFuncCachePreferShared));
73
+ NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)binRasterKernel, cudaFuncCachePreferShared));
74
+ NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)coarseRasterKernel, cudaFuncCachePreferShared));
75
+ NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)fineRasterKernel, cudaFuncCachePreferShared));
76
+ }
77
+
78
+ //------------------------------------------------------------------------
79
+
80
+ RasterImpl::~RasterImpl(void)
81
+ {
82
+ // Empty.
83
+ }
84
+
85
+ //------------------------------------------------------------------------
86
+
87
+ void RasterImpl::setBufferSize(Vec3i size)
88
+ {
89
+ // Internal buffer width and height must be divisible by tile size.
90
+ int w = (size.x + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
91
+ int h = (size.y + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
92
+
93
+ m_bufferSizePixels = Vec2i(w, h);
94
+ m_bufferSizeVp = Vec2i(size.x, size.y);
95
+ m_numImages = size.z;
96
+
97
+ m_colorBuffer.reset(w * h * size.z * sizeof(U32));
98
+ m_depthBuffer.reset(w * h * size.z * sizeof(U32));
99
+ }
100
+
101
+ //------------------------------------------------------------------------
102
+
103
+ void RasterImpl::setViewport(Vec2i size, Vec2i offset)
104
+ {
105
+ // Offset must be divisible by tile size.
106
+ NVDR_CHECK((offset.x & (CR_TILE_SIZE - 1)) == 0 && (offset.y & (CR_TILE_SIZE - 1)) == 0, "invalid viewport offset");
107
+
108
+ // Round internal viewport size to multiples of tile size.
109
+ int w = (size.x + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
110
+ int h = (size.y + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
111
+
112
+ m_sizePixels = Vec2i(w, h);
113
+ m_offsetPixels = offset;
114
+ m_sizeVp = Vec2i(size.x, size.y);
115
+ m_sizeTiles.x = m_sizePixels.x >> CR_TILE_LOG2;
116
+ m_sizeTiles.y = m_sizePixels.y >> CR_TILE_LOG2;
117
+ m_numTiles = m_sizeTiles.x * m_sizeTiles.y;
118
+ m_sizeBins.x = (m_sizeTiles.x + CR_BIN_SIZE - 1) >> CR_BIN_LOG2;
119
+ m_sizeBins.y = (m_sizeTiles.y + CR_BIN_SIZE - 1) >> CR_BIN_LOG2;
120
+ m_numBins = m_sizeBins.x * m_sizeBins.y;
121
+ }
122
+
123
+ void RasterImpl::swapDepthAndPeel(void)
124
+ {
125
+ m_peelBuffer.reset(m_depthBuffer.getSize()); // Ensure equal size and valid pointer.
126
+
127
+ void* tmp = m_depthBuffer.getPtr();
128
+ m_depthBuffer.setPtr(m_peelBuffer.getPtr());
129
+ m_peelBuffer.setPtr(tmp);
130
+ }
131
+
132
+ //------------------------------------------------------------------------
133
+
134
+ bool RasterImpl::drawTriangles(const Vec2i* ranges, bool peel, cudaStream_t stream)
135
+ {
136
+ bool instanceMode = (!ranges);
137
+
138
+ int maxSubtrisSlack = 4096; // x 81B = 324KB
139
+ int maxBinSegsSlack = 256; // x 2137B = 534KB
140
+ int maxTileSegsSlack = 4096; // x 136B = 544KB
+
+     // Resize atomics as needed.
+     m_crAtomics    .grow(m_numImages * sizeof(CRAtomics));
+     m_crAtomicsHost.grow(m_numImages * sizeof(CRAtomics));
+
+     // Size of these buffers doesn't depend on input.
+     m_binFirstSeg .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
+     m_binTotal    .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
+     m_activeTiles .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));
+     m_tileFirstSeg.grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));
+
+     // Construct per-image parameters and determine worst-case buffer sizes.
+     m_crImageParamsHost.grow(m_numImages * sizeof(CRImageParams));
+     CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr();
+     for (int i=0; i < m_numImages; i++)
+     {
+         CRImageParams& ip = imageParams[i];
+
+         int roundSize  = CR_BIN_WARPS * 32;
+         int minBatches = CR_BIN_STREAMS_SIZE * 2;
+         int maxRounds  = 32;
+
+         ip.triOffset = instanceMode ? 0 : ranges[i].x;
+         ip.triCount  = instanceMode ? m_numTriangles : ranges[i].y;
+         ip.binBatchSize = min(max(ip.triCount / (roundSize * minBatches), 1), maxRounds) * roundSize;
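+         // Batch size for the bin rasterizer: a whole number of rounds
+         // (CR_BIN_WARPS * 32 triangles each), chosen so the triangles split into
+         // roughly two batches per bin stream, capped at maxRounds rounds per batch.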
+
+         m_maxSubtris  = max(m_maxSubtris,  min(ip.triCount + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE));
+         m_maxBinSegs  = max(m_maxBinSegs,  max(m_numBins * CR_BIN_STREAMS_SIZE, (ip.triCount - 1) / CR_BIN_SEG_SIZE + 1) + maxBinSegsSlack);
+         m_maxTileSegs = max(m_maxTileSegs, max(m_numTiles, (ip.triCount - 1) / CR_TILE_SEG_SIZE + 1) + maxTileSegsSlack);
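+         // Worst-case estimates: enough segments to hold every triangle at
+         // CR_BIN_SEG_SIZE / CR_TILE_SEG_SIZE entries per segment, but never fewer
+         // than one segment per bin per stream (or one per tile), plus the slack above.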
+     }
+
+     // Retry until successful.
+
+     for (;;)
+     {
+         // Allocate buffers.
+         m_triSubtris.reset(m_numImages * m_maxSubtris * sizeof(U8));
+         m_triHeader .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleHeader));
+         m_triData   .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleData));
+
+         m_binSegData .reset(m_numImages * m_maxBinSegs * CR_BIN_SEG_SIZE * sizeof(S32));
+         m_binSegNext .reset(m_numImages * m_maxBinSegs * sizeof(S32));
+         m_binSegCount.reset(m_numImages * m_maxBinSegs * sizeof(S32));
+
+         m_tileSegData .reset(m_numImages * m_maxTileSegs * CR_TILE_SEG_SIZE * sizeof(S32));
+         m_tileSegNext .reset(m_numImages * m_maxTileSegs * sizeof(S32));
+         m_tileSegCount.reset(m_numImages * m_maxTileSegs * sizeof(S32));
+
+         // Report if buffers grow from last time.
+         size_t sizesTotal = getTotalBufferSizes();
+         if (sizesTotal > m_bufferSizesReported)
+         {
+             size_t sizesMB = ((sizesTotal - 1) >> 20) + 1; // Round up.
+             sizesMB = ((sizesMB + 9) / 10) * 10; // 10MB granularity enough in this day and age.
+             LOG(INFO) << "Internal buffers grown to " << sizesMB << " MB";
+             m_bufferSizesReported = sizesMB << 20;
+         }
+
+         // Launch stages. Blocks until everything is done.
+         launchStages(instanceMode, peel, stream);
+
+         // Peeling iteration cannot fail, so no point checking things further.
+         if (peel)
+             break;
+
+         // Atomics after coarse stage are now available.
+         CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr();
+
+         // Success?
+         bool failed = false;
+         for (int i=0; i < m_numImages; i++)
+         {
+             const CRAtomics& a = atomics[i];
+             failed = failed || (a.numSubtris > m_maxSubtris) || (a.numBinSegs > m_maxBinSegs) || (a.numTileSegs > m_maxTileSegs);
+         }
+         if (!failed)
+             break; // Success!
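+         // An overflow means a stage ran out of segment space and the frame is
+         // incomplete, so we enlarge the buffers below and render the frame again.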
+
+         // If we were already at maximum capacity, no can do.
+         if (m_maxSubtris == CR_MAXSUBTRIS_SIZE)
+             return false;
+
+         // Enlarge buffers and try again.
+         for (int i=0; i < m_numImages; i++)
+         {
+             const CRAtomics& a = atomics[i];
+             m_maxSubtris  = max(m_maxSubtris,  min(a.numSubtris + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE));
+             m_maxBinSegs  = max(m_maxBinSegs,  a.numBinSegs + maxBinSegsSlack);
+             m_maxTileSegs = max(m_maxTileSegs, a.numTileSegs + maxTileSegsSlack);
+         }
+     }
+
+     m_deferredClear = false;
+     return true; // Success.
+ }
+
+ //------------------------------------------------------------------------
+
+ size_t RasterImpl::getTotalBufferSizes(void) const
+ {
+     return
+         m_colorBuffer.getSize() + m_depthBuffer.getSize() + // Don't include atomics and image params.
+         m_triSubtris.getSize() + m_triHeader.getSize() + m_triData.getSize() +
+         m_binFirstSeg.getSize() + m_binTotal.getSize() + m_binSegData.getSize() + m_binSegNext.getSize() + m_binSegCount.getSize() +
+         m_activeTiles.getSize() + m_tileFirstSeg.getSize() + m_tileSegData.getSize() + m_tileSegNext.getSize() + m_tileSegCount.getSize();
+ }
+
+ //------------------------------------------------------------------------
+
+ void RasterImpl::launchStages(bool instanceMode, bool peel, cudaStream_t stream)
+ {
+     CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr();
+
+     // Unless peeling, initialize atomics to mostly zero.
+     CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr();
+     if (!peel)
+     {
+         memset(atomics, 0, m_numImages * sizeof(CRAtomics));
+         for (int i=0; i < m_numImages; i++)
+             atomics[i].numSubtris = imageParams[i].triCount;
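+         // Each input triangle counts as one subtriangle until the setup stage splits
+         // it further (e.g. when clipping), so seed the counter with the plain triangle count.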
+     }
+
+     // Copy to device. If peeling, this is the state after coarse raster launch on first iteration.
+     NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomics.getPtr(), atomics, m_numImages * sizeof(CRAtomics), cudaMemcpyHostToDevice, stream));
+
+     // Copy per-image parameters if there are more than fit in the launch parameter block and we haven't done it already.
+     if (!peel && m_numImages > CR_EMBED_IMAGE_PARAMS)
+     {
+         int numImageParamsExtra = m_numImages - CR_EMBED_IMAGE_PARAMS;
+         m_crImageParamsExtra.grow(numImageParamsExtra * sizeof(CRImageParams));
+         NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crImageParamsExtra.getPtr(), imageParams + CR_EMBED_IMAGE_PARAMS, numImageParamsExtra * sizeof(CRImageParams), cudaMemcpyHostToDevice, stream));
+     }
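+     // The first CR_EMBED_IMAGE_PARAMS per-image structs travel inside the kernel
+     // parameter block itself; any remainder is read from this separate device buffer.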
+
+     // Set global parameters.
+     CRParams p;
+     {
+         p.atomics = (CRAtomics*)m_crAtomics.getPtr();
+         p.numImages = m_numImages;
+         p.totalCount = 0; // Only relevant in range mode.
+         p.instanceMode = instanceMode ? 1 : 0;
+
+         p.numVertices = m_numVertices;
+         p.numTriangles = m_numTriangles;
+         p.vertexBuffer = m_vertexPtr;
+         p.indexBuffer = m_indexPtr;
+
+         p.widthPixels = m_sizePixels.x;
+         p.heightPixels = m_sizePixels.y;
+         p.widthPixelsVp = m_sizeVp.x;
+         p.heightPixelsVp = m_sizeVp.y;
+         p.widthBins = m_sizeBins.x;
+         p.heightBins = m_sizeBins.y;
+         p.numBins = m_numBins;
+
+         p.xs = (float)m_bufferSizeVp.x / (float)m_sizeVp.x;
+         p.ys = (float)m_bufferSizeVp.y / (float)m_sizeVp.y;
+         p.xo = (float)(m_bufferSizeVp.x - m_sizeVp.x - 2 * m_offsetPixels.x) / (float)m_sizeVp.x;
+         p.yo = (float)(m_bufferSizeVp.y - m_sizeVp.y - 2 * m_offsetPixels.y) / (float)m_sizeVp.y;
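+         // xs/ys/xo/yo appear to remap clip-space x/y from the requested viewport into
+         // the larger internal buffer, compensating for the tile-rounded buffer size and
+         // the pixel offset set in setViewport().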
+
+         p.widthTiles = m_sizeTiles.x;
+         p.heightTiles = m_sizeTiles.y;
+         p.numTiles = m_numTiles;
+
+         p.renderModeFlags = m_renderModeFlags;
+         p.deferredClear = m_deferredClear ? 1 : 0;
+         p.clearColor = m_clearColor;
+         p.clearDepth = CR_DEPTH_MAX;
+
+         p.maxSubtris = m_maxSubtris;
+         p.maxBinSegs = m_maxBinSegs;
+         p.maxTileSegs = m_maxTileSegs;
+
+         p.triSubtris = m_triSubtris.getPtr();
+         p.triHeader = m_triHeader.getPtr();
+         p.triData = m_triData.getPtr();
+         p.binSegData = m_binSegData.getPtr();
+         p.binSegNext = m_binSegNext.getPtr();
+         p.binSegCount = m_binSegCount.getPtr();
+         p.binFirstSeg = m_binFirstSeg.getPtr();
+         p.binTotal = m_binTotal.getPtr();
+         p.tileSegData = m_tileSegData.getPtr();
+         p.tileSegNext = m_tileSegNext.getPtr();
+         p.tileSegCount = m_tileSegCount.getPtr();
+         p.activeTiles = m_activeTiles.getPtr();
+         p.tileFirstSeg = m_tileFirstSeg.getPtr();
+
+         p.strideX = m_bufferSizePixels.x;
+         p.strideY = m_bufferSizePixels.y;
+
+         size_t byteOffset = ((size_t)m_offsetPixels.x + (size_t)m_offsetPixels.y * (size_t)p.strideX) * sizeof(U32);
+         p.colorBuffer = m_colorBuffer.getPtr(byteOffset);
+         p.depthBuffer = m_depthBuffer.getPtr(byteOffset);
+         p.peelBuffer = (m_renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) ? m_peelBuffer.getPtr(byteOffset) : 0;
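+         // byteOffset points the color/depth/peel pointers at the viewport origin inside
+         // the full internal buffer; strideX/strideY are the internal buffer dimensions
+         // in pixels and therefore have to be assigned before the offset is computed.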
+
+         memcpy(&p.imageParamsFirst, imageParams, min(m_numImages, CR_EMBED_IMAGE_PARAMS) * sizeof(CRImageParams));
+         p.imageParamsExtra = (CRImageParams*)m_crImageParamsExtra.getPtr();
+     }
+
+     // Setup block sizes.
+     dim3 brBlock(32, CR_BIN_WARPS);
+     dim3 crBlock(32, CR_COARSE_WARPS);
+     dim3 frBlock(32, m_numFineWarpsPerBlock);
+     void* args[] = {&p};
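+     // Blocks are 32 threads wide (one warp per block row). For most launches grid.z
+     // selects the image; the range-mode setup launch below instead covers all images
+     // with a single grid.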
+
+     // Launch stages from setup to coarse and copy atomics to host only if this is not a single-tile peeling iteration.
+     if (!peel)
+     {
+         if (instanceMode)
+         {
+             int setupBlocks = (m_numTriangles - 1) / (32 * CR_SETUP_WARPS) + 1;
+             NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, m_numImages), dim3(32, CR_SETUP_WARPS), args, 0, stream));
+         }
+         else
+         {
+             for (int i=0; i < m_numImages; i++)
+                 p.totalCount += imageParams[i].triCount;
+             int setupBlocks = (p.totalCount - 1) / (32 * CR_SETUP_WARPS) + 1;
+             NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, 1), dim3(32, CR_SETUP_WARPS), args, 0, stream));
+         }
+         NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)binRasterKernel, dim3(CR_BIN_STREAMS_SIZE, 1, m_numImages), brBlock, args, 0, stream));
+         NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)coarseRasterKernel, dim3(m_numSMs * m_numCoarseBlocksPerSM, 1, m_numImages), crBlock, args, 0, stream));
+         NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomicsHost.getPtr(), m_crAtomics.getPtr(), sizeof(CRAtomics) * m_numImages, cudaMemcpyDeviceToHost, stream));
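+         // Queued on the same stream, so once cudaStreamSynchronize() below returns,
+         // the host-side atomics are valid for the overflow check in drawTriangles().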
+     }
+
+     // Fine rasterizer is launched always.
+     NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)fineRasterKernel, dim3(m_numSMs * m_numFineBlocksPerSM, 1, m_numImages), frBlock, args, 0, stream));
+     NVDR_CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
+ }
+
+ //------------------------------------------------------------------------