geyik1 committed
Commit 111ba01 · verified
1 Parent(s): 207d5db

Upload 174 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +5 -0
  2. 3d.mp4 +3 -0
  3. README.md +11 -0
  4. app.py +2 -0
  5. assets/example_image/assets_example_image_image - 2024-12-08T120910.945.webp +0 -0
  6. assets/example_image/assets_example_image_image - 2024-12-08T133209.680.webp +0 -0
  7. assets/example_image/assets_example_image_image - 2024-12-08T133232.481.webp +0 -0
  8. assets/example_image/assets_example_image_image - 2024-12-08T133327.828.webp +0 -0
  9. assets/example_image/assets_example_image_image - 2024-12-08T133551.674.webp +0 -0
  10. assets/example_image/assets_example_image_image - 2024-12-08T133554.085.webp +0 -0
  11. assets/example_image/assets_example_image_image - 2024-12-08T133942.986.webp +0 -0
  12. assets/example_image/assets_example_image_image - 2024-12-08T133945.143.webp +0 -0
  13. assets/example_image/assets_example_image_image - 2024-12-08T134251.217.webp +0 -0
  14. assets/example_image/assets_example_image_image - 2024-12-08T134253.975.webp +0 -0
  15. assets/example_image/assets_example_image_image - 2024-12-08T134602.793.webp +0 -0
  16. assets/example_image/assets_example_image_image - 2024-12-08T134606.919.webp +0 -0
  17. assets/example_image/assets_example_image_image - 2024-12-09T050638.566.webp +0 -0
  18. assets/example_image/assets_example_image_image - 2024-12-09T102148.803.webp +0 -0
  19. assets/example_image/assets_example_image_image - 2024-12-09T124050.873.webp +0 -0
  20. assets/example_image/assets_example_image_image - 2024-12-09T125348.492.webp +0 -0
  21. assets/example_image/assets_example_image_image - 2024-12-09T125709.810.webp +0 -0
  22. assets/example_image/assets_example_image_image - 2024-12-09T125745.419.webp +0 -0
  23. assets/example_image/assets_example_image_image - 2024-12-09T131128.626.webp +0 -0
  24. assets/example_image/assets_example_image_image - 2024-12-09T174905.915.webp +0 -0
  25. assets/example_image/assets_example_image_image - 2024-12-09T184202.582.webp +0 -0
  26. assets/example_image/assets_example_image_image - 2024-12-09T184251.254.webp +3 -0
  27. assets/example_image/assets_example_image_image - 2024-12-09T184336.200.webp +0 -0
  28. assets/example_image/assets_example_image_image - 2024-12-09T184407.431.webp +0 -0
  29. assets/example_image/assets_example_image_image - 2024-12-09T184511.907.webp +3 -0
  30. assets/example_image/assets_example_image_image - 2024-12-09T184535.205.webp +0 -0
  31. assets/example_image/assets_example_image_image - 2024-12-09T184804.224.webp +0 -0
  32. assets/example_image/assets_example_image_image - 2024-12-10T033838.708.webp +0 -0
  33. assets/example_image/assets_example_image_image - 2024-12-10T034054.527.webp +0 -0
  34. assets/example_image/assets_example_image_image - 2024-12-10T034505.337.webp +0 -0
  35. extensions/extensions_nvdiffrast_LICENSE.txt +97 -0
  36. extensions/extensions_nvdiffrast_README.md +42 -0
  37. extensions/extensions_nvdiffrast_run_sample.sh +52 -0
  38. extensions/extensions_nvdiffrast_setup copy.py +51 -0
  39. extensions/extensions_nvdiffrast_setup.py +82 -0
  40. extensions/nvdiffrast/common/cudaraster/extensions_nvdiffrast_nvdiffrast_common_cudaraster_CudaRaster.hpp +63 -0
  41. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_BinRaster.inl +423 -0
  42. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.cpp +94 -0
  43. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.hpp +55 -0
  44. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CoarseRaster.inl +730 -0
  45. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Constants.hpp +73 -0
  46. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CudaRaster.cpp +79 -0
  47. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Defs.hpp +90 -0
  48. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_FineRaster.inl +385 -0
  49. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_PrivateDefs.hpp +153 -0
  50. extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_RasterImpl.cpp +370 -0
.gitattributes ADDED
@@ -0,0 +1,5 @@
+ 3d.mp4 filter=lfs diff=lfs merge=lfs -text
+ assets/example_image/assets_example_image_image[[:space:]]-[[:space:]]2024-12-09T184251.254.webp filter=lfs diff=lfs merge=lfs -text
+ assets/example_image/assets_example_image_image[[:space:]]-[[:space:]]2024-12-09T184511.907.webp filter=lfs diff=lfs merge=lfs -text
+ wheels/nvdiffrast-0.3.3-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
+ wheels/wheels_diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
3d.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c3282465210bac76f44b605956139679ed774c8bad9be686707d1b770961371
+ size 21309978
README.md ADDED
@@ -0,0 +1,11 @@
+ ---
+ title: SORA 3D
+ emoji: 🏢🏆
+ colorFrom: indigo
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 4.44.1
+ app_file: app.py
+ pinned: false
+ short_description: Create top-quality 3D(.GLB) models from text or images
+ ---
app.py ADDED
@@ -0,0 +1,2 @@
+ import os
+ exec(os.environ.get('APP'))
assets/example_image/assets_example_image_image - 2024-12-08T120910.945.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T133209.680.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T133232.481.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T133327.828.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T133551.674.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T133554.085.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T133942.986.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T133945.143.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T134251.217.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T134253.975.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T134602.793.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-08T134606.919.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T050638.566.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T102148.803.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T124050.873.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T125348.492.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T125709.810.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T125745.419.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T131128.626.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T174905.915.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T184202.582.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T184251.254.webp ADDED

Git LFS Details

  • SHA256: 04a741b7588b46f6f885987fa3330d51f671d7f372eedf3cc007e69fd1a2e3e9
  • Pointer size: 131 Bytes
  • Size of remote file: 113 kB
assets/example_image/assets_example_image_image - 2024-12-09T184336.200.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T184407.431.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T184511.907.webp ADDED

Git LFS Details

  • SHA256: f5cbfa61ca24164cafbd695aa6f12b617196a64f913e5c9964fad60a74dedda6
  • Pointer size: 131 Bytes
  • Size of remote file: 101 kB
assets/example_image/assets_example_image_image - 2024-12-09T184535.205.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-09T184804.224.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-10T033838.708.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-10T034054.527.webp ADDED
assets/example_image/assets_example_image_image - 2024-12-10T034505.337.webp ADDED
extensions/extensions_nvdiffrast_LICENSE.txt ADDED
@@ -0,0 +1,97 @@
1
+ Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
2
+
3
+
4
+ Nvidia Source Code License (1-Way Commercial)
5
+
6
+ =======================================================================
7
+
8
+ 1. Definitions
9
+
10
+ "Licensor" means any person or entity that distributes its Work.
11
+
12
+ "Software" means the original work of authorship made available under
13
+ this License.
14
+
15
+ "Work" means the Software and any additions to or derivative works of
16
+ the Software that are made available under this License.
17
+
18
+ The terms "reproduce," "reproduction," "derivative works," and
19
+ "distribution" have the meaning as provided under U.S. copyright law;
20
+ provided, however, that for the purposes of this License, derivative
21
+ works shall not include works that remain separable from, or merely
22
+ link (or bind by name) to the interfaces of, the Work.
23
+
24
+ Works, including the Software, are "made available" under this License
25
+ by including in or with the Work either (a) a copyright notice
26
+ referencing the applicability of this License to the Work, or (b) a
27
+ copy of this License.
28
+
29
+ 2. License Grants
30
+
31
+ 2.1 Copyright Grant. Subject to the terms and conditions of this
32
+ License, each Licensor grants to you a perpetual, worldwide,
33
+ non-exclusive, royalty-free, copyright license to reproduce,
34
+ prepare derivative works of, publicly display, publicly perform,
35
+ sublicense and distribute its Work and any resulting derivative
36
+ works in any form.
37
+
38
+ 3. Limitations
39
+
40
+ 3.1 Redistribution. You may reproduce or distribute the Work only
41
+ if (a) you do so under this License, (b) you include a complete
42
+ copy of this License with your distribution, and (c) you retain
43
+ without modification any copyright, patent, trademark, or
44
+ attribution notices that are present in the Work.
45
+
46
+ 3.2 Derivative Works. You may specify that additional or different
47
+ terms apply to the use, reproduction, and distribution of your
48
+ derivative works of the Work ("Your Terms") only if (a) Your Terms
49
+ provide that the use limitation in Section 3.3 applies to your
50
+ derivative works, and (b) you identify the specific derivative
51
+ works that are subject to Your Terms. Notwithstanding Your Terms,
52
+ this License (including the redistribution requirements in Section
53
+ 3.1) will continue to apply to the Work itself.
54
+
55
+ 3.3 Use Limitation. The Work and any derivative works thereof only
56
+ may be used or intended for use non-commercially. The Work or
57
+ derivative works thereof may be used or intended for use by Nvidia
58
+ or its affiliates commercially or non-commercially. As used herein,
59
+ "non-commercially" means for research or evaluation purposes only
60
+ and not for any direct or indirect monetary gain.
61
+
62
+ 3.4 Patent Claims. If you bring or threaten to bring a patent claim
63
+ against any Licensor (including any claim, cross-claim or
64
+ counterclaim in a lawsuit) to enforce any patents that you allege
65
+ are infringed by any Work, then your rights under this License from
66
+ such Licensor (including the grant in Section 2.1) will terminate
67
+ immediately.
68
+
69
+ 3.5 Trademarks. This License does not grant any rights to use any
70
+ Licensor's or its affiliates' names, logos, or trademarks, except
71
+ as necessary to reproduce the notices described in this License.
72
+
73
+ 3.6 Termination. If you violate any term of this License, then your
74
+ rights under this License (including the grant in Section 2.1) will
75
+ terminate immediately.
76
+
77
+ 4. Disclaimer of Warranty.
78
+
79
+ THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
80
+ KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
81
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
82
+ NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
83
+ THIS LICENSE.
84
+
85
+ 5. Limitation of Liability.
86
+
87
+ EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
88
+ THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
89
+ SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
90
+ INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
91
+ OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
92
+ (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
93
+ LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
94
+ COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
95
+ THE POSSIBILITY OF SUCH DAMAGES.
96
+
97
+ =======================================================================
extensions/extensions_nvdiffrast_README.md ADDED
@@ -0,0 +1,42 @@
+ ## Nvdiffrast – Modular Primitives for High-Performance Differentiable Rendering
+
+ ![Teaser image](./docs/img/teaser.png)
+
+ **Modular Primitives for High-Performance Differentiable Rendering**<br>
+ Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila<br>
+ [http://arxiv.org/abs/2011.03277](http://arxiv.org/abs/2011.03277)
+
+ Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering.
+ Please refer to &#x261E;&#x261E; [nvdiffrast documentation](https://nvlabs.github.io/nvdiffrast) &#x261C;&#x261C; for more information.
+
+ ## Licenses
+
+ Copyright &copy; 2020&ndash;2024, NVIDIA Corporation. All rights reserved.
+
+ This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt).
+
+ For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/)
+
+ We do not currently accept outside code contributions in the form of pull requests.
+
+ Environment map stored as part of `samples/data/envphong.npz` is derived from a Wave Engine
+ [sample material](https://github.com/WaveEngine/Samples-2.5/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap)
+ originally shared under
+ [MIT License](https://github.com/WaveEngine/Samples-2.5/blob/master/LICENSE.md).
+ Mesh and texture stored as part of `samples/data/earth.npz` are derived from
+ [3D Earth Photorealistic 2K](https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125)
+ model originally made available under
+ [TurboSquid 3D Model License](https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license).
+
+ ## Citation
+
+ ```
+ @article{Laine2020diffrast,
+   title   = {Modular Primitives for High-Performance Differentiable Rendering},
+   author  = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila},
+   journal = {ACM Transactions on Graphics},
+   year    = {2020},
+   volume  = {39},
+   number  = {6}
+ }
+ ```
extensions/extensions_nvdiffrast_run_sample.sh ADDED
@@ -0,0 +1,52 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
6
+ # and proprietary rights in and to this software, related documentation
7
+ # and any modifications thereto. Any use, reproduction, disclosure or
8
+ # distribution of this software and related documentation without an express
9
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
10
+
11
+ function print_help {
12
+ echo "Usage: `basename $0` [--build-container] <python_file>"
13
+ echo ""
14
+ echo "Option --build-container will build the Docker container based on"
15
+ echo "docker/Dockerfile and tag the image with gltorch:latest."
16
+ echo ""
17
+ echo "Example: `basename $0` samples/torch/envphong.py"
18
+ }
19
+
20
+ build_container=0
21
+ sample=""
22
+ while [[ "$#" -gt 0 ]]; do
23
+ case $1 in
24
+ --build-container) build_container=1;;
25
+ -h|--help) print_help; exit 0 ;;
26
+ --*) echo "Unknown parameter passed: $1"; exit 1 ;;
27
+ *) sample="$1"; shift; break;
28
+ esac
29
+ shift
30
+ done
31
+
32
+ rest=$@
33
+
34
+ # Build the docker container
35
+ if [ "$build_container" = "1" ]; then
36
+ docker build --tag gltorch:latest -f docker/Dockerfile .
37
+ fi
38
+
39
+ if [ ! -f "$sample" ]; then
40
+ echo
41
+ echo "No python sample given or file '$sample' not found. Exiting."
42
+ exit 1
43
+ fi
44
+
45
+ image="gltorch:latest"
46
+
47
+ echo "Using container image: $image"
48
+ echo "Running command: $sample $rest"
49
+
50
+ # Run a sample with docker
51
+ docker run --rm -it --gpus all --user $(id -u):$(id -g) \
52
+ -v `pwd`:/app --workdir /app -e TORCH_EXTENSIONS_DIR=/app/tmp $image python3 $sample $rest
extensions/extensions_nvdiffrast_setup copy.py ADDED
@@ -0,0 +1,51 @@
1
+ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ import nvdiffrast
10
+ import setuptools
11
+ import os
12
+
13
+ with open("README.md", "r") as fh:
14
+ long_description = fh.read()
15
+
16
+ setuptools.setup(
17
+ name="nvdiffrast",
18
+ version=nvdiffrast.__version__,
19
+ author="Samuli Laine",
20
+ author_email="slaine@nvidia.com",
21
+ description="nvdiffrast - modular primitives for high-performance differentiable rendering",
22
+ long_description=long_description,
23
+ long_description_content_type="text/markdown",
24
+ url="https://github.com/NVlabs/nvdiffrast",
25
+ packages=setuptools.find_packages(),
26
+ package_data={
27
+ 'nvdiffrast': [
28
+ 'common/*.h',
29
+ 'common/*.inl',
30
+ 'common/*.cu',
31
+ 'common/*.cpp',
32
+ 'common/cudaraster/*.hpp',
33
+ 'common/cudaraster/impl/*.cpp',
34
+ 'common/cudaraster/impl/*.hpp',
35
+ 'common/cudaraster/impl/*.inl',
36
+ 'common/cudaraster/impl/*.cu',
37
+ 'lib/*.h',
38
+ 'torch/*.h',
39
+ 'torch/*.inl',
40
+ 'torch/*.cpp',
41
+ 'tensorflow/*.cu',
42
+ ] + (['lib/*.lib'] if os.name == 'nt' else [])
43
+ },
44
+ include_package_data=True,
45
+ install_requires=['numpy'], # note: can't require torch here as it will install torch even for a TensorFlow container
46
+ classifiers=[
47
+ "Programming Language :: Python :: 3",
48
+ "Operating System :: OS Independent",
49
+ ],
50
+ python_requires='>=3.6',
51
+ )
extensions/extensions_nvdiffrast_setup.py ADDED
@@ -0,0 +1,82 @@
1
+ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ import nvdiffrast
10
+ import setuptools
11
+ import os
12
+ from torch.utils.cpp_extension import CUDAExtension, BuildExtension
13
+
14
+
15
+ with open("README.md", "r") as fh:
16
+ long_description = fh.read()
17
+
18
+ setuptools.setup(
19
+ name="nvdiffrast",
20
+ version=nvdiffrast.__version__,
21
+ author="Samuli Laine",
22
+ author_email="slaine@nvidia.com",
23
+ description="nvdiffrast - modular primitives for high-performance differentiable rendering",
24
+ long_description=long_description,
25
+ long_description_content_type="text/markdown",
26
+ url="https://github.com/NVlabs/nvdiffrast",
27
+ packages=setuptools.find_packages(),
28
+ # package_data={
29
+ # 'nvdiffrast': [
30
+ # 'common/*.h',
31
+ # 'common/*.inl',
32
+ # 'common/*.cu',
33
+ # 'common/*.cpp',
34
+ # 'common/cudaraster/*.hpp',
35
+ # 'common/cudaraster/impl/*.cpp',
36
+ # 'common/cudaraster/impl/*.hpp',
37
+ # 'common/cudaraster/impl/*.inl',
38
+ # 'common/cudaraster/impl/*.cu',
39
+ # 'lib/*.h',
40
+ # 'torch/*.h',
41
+ # 'torch/*.inl',
42
+ # 'torch/*.cpp',
43
+ # 'tensorflow/*.cu',
44
+ # ] + (['lib/*.lib'] if os.name == 'nt' else [])
45
+ # },
46
+ # include_package_data=True,
47
+ ext_modules=[
48
+ CUDAExtension(
49
+ name="nvdiffrast.torch._C",
50
+ sources=[
51
+ 'nvdiffrast/common/cudaraster/impl/Buffer.cpp',
52
+ 'nvdiffrast/common/cudaraster/impl/CudaRaster.cpp',
53
+ 'nvdiffrast/common/cudaraster/impl/RasterImpl_.cu',
54
+ 'nvdiffrast/common/cudaraster/impl/RasterImpl.cpp',
55
+ 'nvdiffrast/common/common.cpp',
56
+ 'nvdiffrast/common/rasterize.cu',
57
+ 'nvdiffrast/common/interpolate.cu',
58
+ 'nvdiffrast/common/texture_.cu',
59
+ 'nvdiffrast/common/texture.cpp',
60
+ 'nvdiffrast/common/antialias.cu',
61
+ 'nvdiffrast/torch/torch_bindings.cpp',
62
+ 'nvdiffrast/torch/torch_rasterize.cpp',
63
+ 'nvdiffrast/torch/torch_interpolate.cpp',
64
+ 'nvdiffrast/torch/torch_texture.cpp',
65
+ 'nvdiffrast/torch/torch_antialias.cpp',
66
+ ],
67
+ extra_compile_args={
68
+ 'cxx': ['-DNVDR_TORCH'],
69
+ 'nvcc': ['-DNVDR_TORCH', '-lineinfo'],
70
+ },
71
+ )
72
+ ],
73
+ cmdclass={
74
+ 'build_ext': BuildExtension
75
+ },
76
+ install_requires=['numpy'], # note: can't require torch here as it will install torch even for a TensorFlow container
77
+ classifiers=[
78
+ "Programming Language :: Python :: 3",
79
+ "Operating System :: OS Independent",
80
+ ],
81
+ python_requires='>=3.6',
82
+ )
extensions/nvdiffrast/common/cudaraster/extensions_nvdiffrast_nvdiffrast_common_cudaraster_CudaRaster.hpp ADDED
@@ -0,0 +1,63 @@
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+ //
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
+ // and proprietary rights in and to this software, related documentation
+ // and any modifications thereto. Any use, reproduction, disclosure or
+ // distribution of this software and related documentation without an express
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ #pragma once
+
+ //------------------------------------------------------------------------
+ // This is a slimmed-down and modernized version of the original
+ // CudaRaster codebase that accompanied the HPG 2011 paper
+ // "High-Performance Software Rasterization on GPUs" by Laine and Karras.
+ // Modifications have been made to accommodate post-Volta execution model
+ // with warp divergence. Support for shading, blending, quad rendering,
+ // and supersampling have been removed as unnecessary for nvdiffrast.
+ //------------------------------------------------------------------------
+
+ namespace CR
+ {
+
+ class RasterImpl;
+
+ //------------------------------------------------------------------------
+ // Interface class to isolate user from implementation details.
+ //------------------------------------------------------------------------
+
+ class CudaRaster
+ {
+ public:
+     enum
+     {
+         RenderModeFlag_EnableBackfaceCulling = 1 << 0,  // Enable backface culling.
+         RenderModeFlag_EnableDepthPeeling    = 1 << 1,  // Enable depth peeling. Must have a peel buffer set.
+     };
+
+ public:
+     CudaRaster              (void);
+     ~CudaRaster             (void);
+
+     void  setBufferSize     (int width, int height, int numImages);            // Width and height are internally rounded up to multiples of tile size (8x8) for buffer sizes.
+     void  setViewport       (int width, int height, int offsetX, int offsetY); // Tiled rendering viewport setup.
+     void  setRenderModeFlags(unsigned int renderModeFlags);                    // Affects all subsequent calls to drawTriangles(). Defaults to zero.
+     void  deferredClear     (unsigned int clearColor);                         // Clears color and depth buffers during next call to drawTriangles().
+     void  setVertexBuffer   (void* vertices, int numVertices);                 // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w).
+     void  setIndexBuffer    (void* indices, int numTriangles);                 // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
+     bool  drawTriangles     (const int* ranges, bool peel, cudaStream_t stream); // Ranges (offsets and counts) as #triangles entries, not as bytes. If NULL, draw all triangles. Returns false in case of internal overflow.
+     void* getColorBuffer    (void);                                            // GPU pointer managed by CudaRaster.
+     void* getDepthBuffer    (void);                                            // GPU pointer managed by CudaRaster.
+     void  swapDepthAndPeel  (void);                                            // Swap depth and peeling buffers.
+
+ private:
+     CudaRaster           (const CudaRaster&); // forbidden
+     CudaRaster& operator=(const CudaRaster&); // forbidden
+
+ private:
+     RasterImpl* m_impl;  // Opaque pointer to implementation.
+ };
+
+ //------------------------------------------------------------------------
+ } // namespace CR
+
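The inline comments in `CudaRaster.hpp` above spell out the intended call order: size the buffers, set the viewport and render-mode flags, hand over caller-managed vertex/index buffers, then call `drawTriangles()` and read back the color/depth pointers. Below is a minimal host-side sketch of that sequence, assuming the CUDA runtime headers and a single-image batch; it is illustrative only and not part of this commit.

```cpp
// Minimal usage sketch (assumption, not code from this commit): drive CR::CudaRaster
// following the call order documented in CudaRaster.hpp.
#include <cuda_runtime.h>
#include "CudaRaster.hpp"

void rasterizeOnce(const float4* d_clipVerts, int numVerts,
                   const uint4* d_tris, int numTris,
                   int width, int height, cudaStream_t stream)
{
    CR::CudaRaster cr;

    // Buffers are rounded up internally to 8x8 tiles; one image in the batch.
    cr.setBufferSize(width, height, /*numImages=*/1);
    cr.setViewport(width, height, /*offsetX=*/0, /*offsetY=*/0);
    cr.setRenderModeFlags(CR::CudaRaster::RenderModeFlag_EnableBackfaceCulling);
    cr.deferredClear(0u);                        // clear color/depth on the next draw

    // Caller-managed GPU pointers: clip-space float4 positions, uint4 triangles.
    cr.setVertexBuffer((void*)d_clipVerts, numVerts);
    cr.setIndexBuffer((void*)d_tris, numTris);

    // ranges == NULL draws all triangles; false return signals internal overflow.
    if (!cr.drawTriangles(nullptr, /*peel=*/false, stream))
    {
        // A real caller would grow internal buffers and retry here.
    }

    void* d_color = cr.getColorBuffer();         // owned by CudaRaster
    void* d_depth = cr.getDepthBuffer();         // owned by CudaRaster
    (void)d_color; (void)d_depth;
}
```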
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_BinRaster.inl ADDED
@@ -0,0 +1,423 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ //------------------------------------------------------------------------
10
+
11
+ __device__ __inline__ void binRasterImpl(const CRParams p)
12
+ {
13
+ __shared__ volatile U32 s_broadcast [CR_BIN_WARPS + 16];
14
+ __shared__ volatile S32 s_outOfs [CR_MAXBINS_SQR];
15
+ __shared__ volatile S32 s_outTotal [CR_MAXBINS_SQR];
16
+ __shared__ volatile S32 s_overIndex [CR_MAXBINS_SQR];
17
+ __shared__ volatile S32 s_outMask [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
18
+ __shared__ volatile S32 s_outCount [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
19
+ __shared__ volatile S32 s_triBuf [CR_BIN_WARPS*32*4]; // triangle ring buffer
20
+ __shared__ volatile U32 s_batchPos;
21
+ __shared__ volatile U32 s_bufCount;
22
+ __shared__ volatile U32 s_overTotal;
23
+ __shared__ volatile U32 s_allocBase;
24
+
25
+ const CRImageParams& ip = getImageParams(p, blockIdx.z);
26
+ CRAtomics& atomics = p.atomics[blockIdx.z];
27
+ const U8* triSubtris = (const U8*)p.triSubtris + p.maxSubtris * blockIdx.z;
28
+ const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
29
+
30
+ S32* binFirstSeg = (S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
31
+ S32* binTotal = (S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
32
+ S32* binSegData = (S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
33
+ S32* binSegNext = (S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
34
+ S32* binSegCount = (S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
35
+
36
+ if (atomics.numSubtris > p.maxSubtris)
37
+ return;
38
+
39
+ // per-thread state
40
+ int thrInBlock = threadIdx.x + threadIdx.y * 32;
41
+ int batchPos = 0;
42
+
43
+ // first 16 elements of s_broadcast are always zero
44
+ if (thrInBlock < 16)
45
+ s_broadcast[thrInBlock] = 0;
46
+
47
+ // initialize output linked lists and offsets
48
+ if (thrInBlock < p.numBins)
49
+ {
50
+ binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = -1;
51
+ s_outOfs[thrInBlock] = -CR_BIN_SEG_SIZE;
52
+ s_outTotal[thrInBlock] = 0;
53
+ }
54
+
55
+ // repeat until done
56
+ for(;;)
57
+ {
58
+ // get batch
59
+ if (thrInBlock == 0)
60
+ s_batchPos = atomicAdd(&atomics.binCounter, ip.binBatchSize);
61
+ __syncthreads();
62
+ batchPos = s_batchPos;
63
+
64
+ // all batches done?
65
+ if (batchPos >= ip.triCount)
66
+ break;
67
+
68
+ // per-thread state
69
+ int bufIndex = 0;
70
+ int bufCount = 0;
71
+ int batchEnd = min(batchPos + ip.binBatchSize, ip.triCount);
72
+
73
+ // loop over batch as long as we have triangles in it
74
+ do
75
+ {
76
+ // read more triangles
77
+ while (bufCount < CR_BIN_WARPS*32 && batchPos < batchEnd)
78
+ {
79
+ // get subtriangle count
80
+
81
+ int triIdx = batchPos + thrInBlock;
82
+ int num = 0;
83
+ if (triIdx < batchEnd)
84
+ num = triSubtris[triIdx];
85
+
86
+ // cumulative sum of subtriangles within each warp
87
+ U32 myIdx = __popc(__ballot_sync(~0u, num & 1) & getLaneMaskLt());
88
+ if (__any_sync(~0u, num > 1))
89
+ {
90
+ myIdx += __popc(__ballot_sync(~0u, num & 2) & getLaneMaskLt()) * 2;
91
+ myIdx += __popc(__ballot_sync(~0u, num & 4) & getLaneMaskLt()) * 4;
92
+ }
93
+ if (threadIdx.x == 31) // Do not assume that last thread in warp wins the write.
94
+ s_broadcast[threadIdx.y + 16] = myIdx + num;
95
+ __syncthreads();
96
+
97
+ // cumulative sum of per-warp subtriangle counts
98
+ // Note: cannot have more than 32 warps or this needs to sync between each step.
99
+ bool act = (thrInBlock < CR_BIN_WARPS);
100
+ U32 actMask = __ballot_sync(~0u, act);
101
+ if (threadIdx.y == 0 && act)
102
+ {
103
+ volatile U32* ptr = &s_broadcast[thrInBlock + 16];
104
+ U32 val = *ptr;
105
+ #if (CR_BIN_WARPS > 1)
106
+ val += ptr[-1]; __syncwarp(actMask);
107
+ *ptr = val; __syncwarp(actMask);
108
+ #endif
109
+ #if (CR_BIN_WARPS > 2)
110
+ val += ptr[-2]; __syncwarp(actMask);
111
+ *ptr = val; __syncwarp(actMask);
112
+ #endif
113
+ #if (CR_BIN_WARPS > 4)
114
+ val += ptr[-4]; __syncwarp(actMask);
115
+ *ptr = val; __syncwarp(actMask);
116
+ #endif
117
+ #if (CR_BIN_WARPS > 8)
118
+ val += ptr[-8]; __syncwarp(actMask);
119
+ *ptr = val; __syncwarp(actMask);
120
+ #endif
121
+ #if (CR_BIN_WARPS > 16)
122
+ val += ptr[-16]; __syncwarp(actMask);
123
+ *ptr = val; __syncwarp(actMask);
124
+ #endif
125
+
126
+ // initially assume that we consume everything
127
+ // only last active thread does the writes
128
+ if (threadIdx.x == CR_BIN_WARPS - 1)
129
+ {
130
+ s_batchPos = batchPos + CR_BIN_WARPS * 32;
131
+ s_bufCount = bufCount + val;
132
+ }
133
+ }
134
+ __syncthreads();
135
+
136
+ // skip if no subtriangles
137
+ if (num)
138
+ {
139
+ // calculate write position for first subtriangle
140
+ U32 pos = bufCount + myIdx + s_broadcast[threadIdx.y + 16 - 1];
141
+
142
+ // only write if entire triangle fits
143
+ if (pos + num <= CR_ARRAY_SIZE(s_triBuf))
144
+ {
145
+ pos += bufIndex; // adjust for current start position
146
+ pos &= CR_ARRAY_SIZE(s_triBuf)-1;
147
+ if (num == 1)
148
+ s_triBuf[pos] = triIdx * 8 + 7; // single triangle
149
+ else
150
+ {
151
+ for (int i=0; i < num; i++)
152
+ {
153
+ s_triBuf[pos] = triIdx * 8 + i;
154
+ pos++;
155
+ pos &= CR_ARRAY_SIZE(s_triBuf)-1;
156
+ }
157
+ }
158
+ } else if (pos <= CR_ARRAY_SIZE(s_triBuf))
159
+ {
160
+ // this triangle is the first that failed, overwrite total count and triangle count
161
+ s_batchPos = batchPos + thrInBlock;
162
+ s_bufCount = pos;
163
+ }
164
+ }
165
+
166
+ // update triangle counts
167
+ __syncthreads();
168
+ batchPos = s_batchPos;
169
+ bufCount = s_bufCount;
170
+ }
171
+
172
+ // make every warp clear its output buffers
173
+ for (int i=threadIdx.x; i < p.numBins; i += 32)
174
+ s_outMask[threadIdx.y][i] = 0;
175
+ __syncwarp();
176
+
177
+ // choose our triangle
178
+ uint4 triData = make_uint4(0, 0, 0, 0);
179
+ if (thrInBlock < bufCount)
180
+ {
181
+ U32 triPos = bufIndex + thrInBlock;
182
+ triPos &= CR_ARRAY_SIZE(s_triBuf)-1;
183
+
184
+ // find triangle
185
+ int triIdx = s_triBuf[triPos];
186
+ int dataIdx = triIdx >> 3;
187
+ int subtriIdx = triIdx & 7;
188
+ if (subtriIdx != 7)
189
+ dataIdx = triHeader[dataIdx].misc + subtriIdx;
190
+
191
+ // read triangle
192
+
193
+ triData = *(((const uint4*)triHeader) + dataIdx);
194
+ }
195
+
196
+ // setup bounding box and edge functions, and rasterize
197
+ S32 lox, loy, hix, hiy;
198
+ bool hasTri = (thrInBlock < bufCount);
199
+ U32 hasTriMask = __ballot_sync(~0u, hasTri);
200
+ if (hasTri)
201
+ {
202
+ S32 v0x = add_s16lo_s16lo(triData.x, p.widthPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
203
+ S32 v0y = add_s16hi_s16lo(triData.x, p.heightPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
204
+ S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
205
+ S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
206
+ S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
207
+ S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
208
+ int binLog = CR_BIN_LOG2 + CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
209
+ lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
210
+ loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
211
+ hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
212
+ hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
213
+
214
+ U32 bit = 1 << threadIdx.x;
215
+ #if __CUDA_ARCH__ >= 700
216
+ bool multi = (hix != lox || hiy != loy);
217
+ if (!__any_sync(hasTriMask, multi))
218
+ {
219
+ int binIdx = lox + p.widthBins * loy;
220
+ U32 mask = __match_any_sync(hasTriMask, binIdx);
221
+ s_outMask[threadIdx.y][binIdx] = mask;
222
+ __syncwarp(hasTriMask);
223
+ } else
224
+ #endif
225
+ {
226
+ bool complex = (hix > lox+1 || hiy > loy+1);
227
+ if (!__any_sync(hasTriMask, complex))
228
+ {
229
+ int binIdx = lox + p.widthBins * loy;
230
+ atomicOr((U32*)&s_outMask[threadIdx.y][binIdx], bit);
231
+ if (hix > lox) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + 1], bit);
232
+ if (hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins], bit);
233
+ if (hix > lox && hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins + 1], bit);
234
+ } else
235
+ {
236
+ S32 d12x = d02x - d01x, d12y = d02y - d01y;
237
+ v0x -= lox << binLog, v0y -= loy << binLog;
238
+
239
+ S32 t01 = v0x * d01y - v0y * d01x;
240
+ S32 t02 = v0y * d02x - v0x * d02y;
241
+ S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
242
+ S32 b01 = add_sub(t01 >> binLog, max(d01x, 0), min(d01y, 0));
243
+ S32 b02 = add_sub(t02 >> binLog, max(d02y, 0), min(d02x, 0));
244
+ S32 b12 = add_sub(t12 >> binLog, max(d12x, 0), min(d12y, 0));
245
+
246
+ int width = hix - lox + 1;
247
+ d01x += width * d01y;
248
+ d02x += width * d02y;
249
+ d12x += width * d12y;
250
+
251
+ U8* currPtr = (U8*)&s_outMask[threadIdx.y][lox + loy * p.widthBins];
252
+ U8* skipPtr = (U8*)&s_outMask[threadIdx.y][(hix + 1) + loy * p.widthBins];
253
+ U8* endPtr = (U8*)&s_outMask[threadIdx.y][lox + (hiy + 1) * p.widthBins];
254
+ int stride = p.widthBins * 4;
255
+ int ptrYInc = stride - width * 4;
256
+
257
+ do
258
+ {
259
+ if (b01 >= 0 && b02 >= 0 && b12 >= 0)
260
+ atomicOr((U32*)currPtr, bit);
261
+ currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
262
+ if (currPtr == skipPtr)
263
+ currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += stride;
264
+ }
265
+ while (currPtr != endPtr);
266
+ }
267
+ }
268
+ }
269
+
270
+ // count per-bin contributions
271
+ if (thrInBlock == 0)
272
+ s_overTotal = 0; // overflow counter
273
+
274
+ // ensure that out masks are done
275
+ __syncthreads();
276
+
277
+ int overIndex = -1;
278
+ bool act = (thrInBlock < p.numBins);
279
+ U32 actMask = __ballot_sync(~0u, act);
280
+ if (act)
281
+ {
282
+ U8* srcPtr = (U8*)&s_outMask[0][thrInBlock];
283
+ U8* dstPtr = (U8*)&s_outCount[0][thrInBlock];
284
+ int total = 0;
285
+ for (int i = 0; i < CR_BIN_WARPS; i++)
286
+ {
287
+ total += __popc(*(U32*)srcPtr);
288
+ *(U32*)dstPtr = total;
289
+ srcPtr += (CR_MAXBINS_SQR + 1) * 4;
290
+ dstPtr += (CR_MAXBINS_SQR + 1) * 4;
291
+ }
292
+
293
+ // overflow => request a new segment
294
+ int ofs = s_outOfs[thrInBlock];
295
+ bool ovr = (((ofs - 1) >> CR_BIN_SEG_LOG2) != (((ofs - 1) + total) >> CR_BIN_SEG_LOG2));
296
+ U32 ovrMask = __ballot_sync(actMask, ovr);
297
+ if (ovr)
298
+ {
299
+ overIndex = __popc(ovrMask & getLaneMaskLt());
300
+ if (overIndex == 0)
301
+ s_broadcast[threadIdx.y + 16] = atomicAdd((U32*)&s_overTotal, __popc(ovrMask));
302
+ __syncwarp(ovrMask);
303
+ overIndex += s_broadcast[threadIdx.y + 16];
304
+ s_overIndex[thrInBlock] = overIndex;
305
+ }
306
+ }
307
+
308
+ // sync after overTotal is ready
309
+ __syncthreads();
310
+
311
+ // at least one segment overflowed => allocate segments
312
+ U32 overTotal = s_overTotal;
313
+ U32 allocBase = 0;
314
+ if (overTotal > 0)
315
+ {
316
+ // allocate memory
317
+ if (thrInBlock == 0)
318
+ {
319
+ U32 allocBase = atomicAdd(&atomics.numBinSegs, overTotal);
320
+ s_allocBase = (allocBase + overTotal <= p.maxBinSegs) ? allocBase : 0;
321
+ }
322
+ __syncthreads();
323
+ allocBase = s_allocBase;
324
+
325
+ // did my bin overflow?
326
+ if (overIndex != -1)
327
+ {
328
+ // calculate new segment index
329
+ int segIdx = allocBase + overIndex;
330
+
331
+ // add to linked list
332
+ if (s_outOfs[thrInBlock] < 0)
333
+ binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = segIdx;
334
+ else
335
+ binSegNext[(s_outOfs[thrInBlock] - 1) >> CR_BIN_SEG_LOG2] = segIdx;
336
+
337
+ // defaults
338
+ binSegNext [segIdx] = -1;
339
+ binSegCount[segIdx] = CR_BIN_SEG_SIZE;
340
+ }
341
+ }
342
+
343
+ // concurrent emission -- each warp handles its own triangle
344
+ if (thrInBlock < bufCount)
345
+ {
346
+ int triPos = (bufIndex + thrInBlock) & (CR_ARRAY_SIZE(s_triBuf) - 1);
347
+ int currBin = lox + loy * p.widthBins;
348
+ int skipBin = (hix + 1) + loy * p.widthBins;
349
+ int endBin = lox + (hiy + 1) * p.widthBins;
350
+ int binYInc = p.widthBins - (hix - lox + 1);
351
+
352
+ // loop over triangle's bins
353
+ do
354
+ {
355
+ U32 outMask = s_outMask[threadIdx.y][currBin];
356
+ if (outMask & (1<<threadIdx.x))
357
+ {
358
+ int idx = __popc(outMask & getLaneMaskLt());
359
+ if (threadIdx.y > 0)
360
+ idx += s_outCount[threadIdx.y-1][currBin];
361
+
362
+ int base = s_outOfs[currBin];
363
+ int free = (-base) & (CR_BIN_SEG_SIZE - 1);
364
+ if (idx >= free)
365
+ idx += ((allocBase + s_overIndex[currBin]) << CR_BIN_SEG_LOG2) - free;
366
+ else
367
+ idx += base;
368
+
369
+ binSegData[idx] = s_triBuf[triPos];
370
+ }
371
+
372
+ currBin++;
373
+ if (currBin == skipBin)
374
+ currBin += binYInc, skipBin += p.widthBins;
375
+ }
376
+ while (currBin != endBin);
377
+ }
378
+
379
+ // wait all triangles to finish, then replace overflown segment offsets
380
+ __syncthreads();
381
+ if (thrInBlock < p.numBins)
382
+ {
383
+ U32 total = s_outCount[CR_BIN_WARPS - 1][thrInBlock];
384
+ U32 oldOfs = s_outOfs[thrInBlock];
385
+ if (overIndex == -1)
386
+ s_outOfs[thrInBlock] = oldOfs + total;
387
+ else
388
+ {
389
+ int addr = oldOfs + total;
390
+ addr = ((addr - 1) & (CR_BIN_SEG_SIZE - 1)) + 1;
391
+ addr += (allocBase + overIndex) << CR_BIN_SEG_LOG2;
392
+ s_outOfs[thrInBlock] = addr;
393
+ }
394
+ s_outTotal[thrInBlock] += total;
395
+ }
396
+
397
+ // these triangles are now done
398
+ int count = ::min(bufCount, CR_BIN_WARPS * 32);
399
+ bufCount -= count;
400
+ bufIndex += count;
401
+ bufIndex &= CR_ARRAY_SIZE(s_triBuf)-1;
402
+ }
403
+ while (bufCount > 0 || batchPos < batchEnd);
404
+
405
+ // flush all bins
406
+ if (thrInBlock < p.numBins)
407
+ {
408
+ int ofs = s_outOfs[thrInBlock];
409
+ if (ofs & (CR_BIN_SEG_SIZE-1))
410
+ {
411
+ int seg = ofs >> CR_BIN_SEG_LOG2;
412
+ binSegCount[seg] = ofs & (CR_BIN_SEG_SIZE-1);
413
+ s_outOfs[thrInBlock] = (ofs + CR_BIN_SEG_SIZE - 1) & -CR_BIN_SEG_SIZE;
414
+ }
415
+ }
416
+ }
417
+
418
+ // output totals
419
+ if (thrInBlock < p.numBins)
420
+ binTotal[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = s_outTotal[thrInBlock];
421
+ }
422
+
423
+ //------------------------------------------------------------------------
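For readers skimming the kernel above: each triangle's fixed-point vertex positions are first reduced to a conservative range of bins (`lox`/`loy`/`hix`/`hiy`, clamped to the bin grid), small footprints are recorded directly with atomics, and only larger footprints fall through to the edge-function walk before emission. The following is a simplified CPU-side sketch of just that bounding-box step, with illustrative constants standing in for the values defined in `Constants.hpp`; it is not code from this commit.

```cpp
// Simplified CPU-side sketch (assumption) of the bin bounding-box step in
// binRasterImpl(): fixed-point subpixel coordinates are shifted down to bin
// coordinates and clamped to the bin grid.
#include <algorithm>
#include <cstdio>

// Illustrative constants; the real values come from Constants.hpp.
constexpr int SUBPIXEL_LOG2 = 4;   // subpixels per pixel = 16
constexpr int TILE_LOG2     = 3;   // 8x8 pixel tiles
constexpr int BIN_LOG2      = 4;   // 16x16 tiles per bin

struct BinRect { int lox, loy, hix, hiy; };

// v0/v1/v2 are 2D vertex positions in subpixel units (origin at the viewport corner,
// a simplification of the kernel's centered coordinate setup).
BinRect triangleBins(int v0x, int v0y, int v1x, int v1y, int v2x, int v2y,
                     int widthBins, int heightBins)
{
    const int binLog = BIN_LOG2 + TILE_LOG2 + SUBPIXEL_LOG2;
    auto clampBin = [](int v, int hi) { return std::min(std::max(v, 0), hi); };

    BinRect r;
    r.lox = clampBin(std::min({v0x, v1x, v2x}) >> binLog, widthBins  - 1);
    r.loy = clampBin(std::min({v0y, v1y, v2y}) >> binLog, heightBins - 1);
    r.hix = clampBin(std::max({v0x, v1x, v2x}) >> binLog, widthBins  - 1);
    r.hiy = clampBin(std::max({v0y, v1y, v2y}) >> binLog, heightBins - 1);
    return r;   // the kernel then marks bins (lox..hix) x (loy..hiy), refining
                // larger footprints with edge functions before emitting.
}

int main()
{
    BinRect r = triangleBins(100, 200, 5000, 300, 2500, 6000, 8, 8);
    std::printf("bins x:[%d,%d] y:[%d,%d]\n", r.lox, r.hix, r.loy, r.hiy);
}
```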
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.cpp ADDED
@@ -0,0 +1,94 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #include "../../framework.h"
10
+ #include "Buffer.hpp"
11
+
12
+ using namespace CR;
13
+
14
+ //------------------------------------------------------------------------
15
+ // GPU buffer.
16
+ //------------------------------------------------------------------------
17
+
18
+ Buffer::Buffer(void)
19
+ : m_gpuPtr(NULL),
20
+ m_bytes (0)
21
+ {
22
+ // empty
23
+ }
24
+
25
+ Buffer::~Buffer(void)
26
+ {
27
+ if (m_gpuPtr)
28
+ cudaFree(m_gpuPtr); // Don't throw an exception.
29
+ }
30
+
31
+ void Buffer::reset(size_t bytes)
32
+ {
33
+ if (bytes == m_bytes)
34
+ return;
35
+
36
+ if (m_gpuPtr)
37
+ {
38
+ NVDR_CHECK_CUDA_ERROR(cudaFree(m_gpuPtr));
39
+ m_gpuPtr = NULL;
40
+ }
41
+
42
+ if (bytes > 0)
43
+ NVDR_CHECK_CUDA_ERROR(cudaMalloc(&m_gpuPtr, bytes));
44
+
45
+ m_bytes = bytes;
46
+ }
47
+
48
+ void Buffer::grow(size_t bytes)
49
+ {
50
+ if (bytes > m_bytes)
51
+ reset(bytes);
52
+ }
53
+
54
+ //------------------------------------------------------------------------
55
+ // Host buffer with page-locked memory.
56
+ //------------------------------------------------------------------------
57
+
58
+ HostBuffer::HostBuffer(void)
59
+ : m_hostPtr(NULL),
60
+ m_bytes (0)
61
+ {
62
+ // empty
63
+ }
64
+
65
+ HostBuffer::~HostBuffer(void)
66
+ {
67
+ if (m_hostPtr)
68
+ cudaFreeHost(m_hostPtr); // Don't throw an exception.
69
+ }
70
+
71
+ void HostBuffer::reset(size_t bytes)
72
+ {
73
+ if (bytes == m_bytes)
74
+ return;
75
+
76
+ if (m_hostPtr)
77
+ {
78
+ NVDR_CHECK_CUDA_ERROR(cudaFreeHost(m_hostPtr));
79
+ m_hostPtr = NULL;
80
+ }
81
+
82
+ if (bytes > 0)
83
+ NVDR_CHECK_CUDA_ERROR(cudaMallocHost(&m_hostPtr, bytes));
84
+
85
+ m_bytes = bytes;
86
+ }
87
+
88
+ void HostBuffer::grow(size_t bytes)
89
+ {
90
+ if (bytes > m_bytes)
91
+ reset(bytes);
92
+ }
93
+
94
+ //------------------------------------------------------------------------
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.hpp ADDED
@@ -0,0 +1,55 @@
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+ //
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
+ // and proprietary rights in and to this software, related documentation
+ // and any modifications thereto. Any use, reproduction, disclosure or
+ // distribution of this software and related documentation without an express
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ #pragma once
+ #include "Defs.hpp"
+
+ namespace CR
+ {
+ //------------------------------------------------------------------------
+
+ class Buffer
+ {
+ public:
+     Buffer          (void);
+     ~Buffer         (void);
+
+     void    reset   (size_t bytes);
+     void    grow    (size_t bytes);
+     void*   getPtr  (size_t offset = 0) { return (void*)(((uintptr_t)m_gpuPtr) + offset); }
+     size_t  getSize (void) const        { return m_bytes; }
+
+     void    setPtr  (void* ptr)         { m_gpuPtr = ptr; }
+
+ private:
+     void*   m_gpuPtr;
+     size_t  m_bytes;
+ };
+
+ //------------------------------------------------------------------------
+
+ class HostBuffer
+ {
+ public:
+     HostBuffer      (void);
+     ~HostBuffer     (void);
+
+     void    reset   (size_t bytes);
+     void    grow    (size_t bytes);
+     void*   getPtr  (void)              { return m_hostPtr; }
+     size_t  getSize (void) const        { return m_bytes; }
+
+     void    setPtr  (void* ptr)         { m_hostPtr = ptr; }
+
+ private:
+     void*   m_hostPtr;
+     size_t  m_bytes;
+ };
+
+ //------------------------------------------------------------------------
+ }
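`Buffer` and `HostBuffer` above wrap `cudaMalloc`/`cudaMallocHost` with a grow-only policy: `reset()` frees and reallocates, while `grow()` touches the allocation only when the requested size exceeds the current one (see `Buffer.cpp` earlier in this diff), so per-frame reuse avoids repeated allocation. Below is a standalone sketch of the same pattern against the plain CUDA runtime; the class is illustrative, error checking is elided, and nothing here is code from the commit.

```cpp
// Standalone sketch (assumption) of the grow-only allocation pattern used by
// CR::Buffer: reset() frees and reallocates, grow() reallocates only when the
// request exceeds the current capacity.
#include <cuda_runtime.h>
#include <cstdio>

class GpuScratch
{
public:
    ~GpuScratch() { if (m_ptr) cudaFree(m_ptr); }

    void reset(size_t bytes)
    {
        if (bytes == m_bytes) return;
        if (m_ptr) { cudaFree(m_ptr); m_ptr = nullptr; }
        if (bytes > 0) cudaMalloc(&m_ptr, bytes);   // real code checks the CUDA error
        m_bytes = bytes;
    }

    void   grow(size_t bytes) { if (bytes > m_bytes) reset(bytes); }
    void*  ptr() const        { return m_ptr; }
    size_t size() const       { return m_bytes; }

private:
    void*  m_ptr   = nullptr;
    size_t m_bytes = 0;
};

int main()
{
    GpuScratch buf;
    buf.grow(1 << 20);   // allocates 1 MiB
    buf.grow(1 << 16);   // no-op: capacity already sufficient
    std::printf("capacity: %zu bytes\n", buf.size());
}
```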
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CoarseRaster.inl ADDED
@@ -0,0 +1,730 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ //------------------------------------------------------------------------
10
+
11
+ __device__ __inline__ int globalTileIdx(int tileInBin, int widthTiles)
12
+ {
13
+ int tileX = tileInBin & (CR_BIN_SIZE - 1);
14
+ int tileY = tileInBin >> CR_BIN_LOG2;
15
+ return tileX + tileY * widthTiles;
16
+ }
17
+
18
+ //------------------------------------------------------------------------
19
+
20
+ __device__ __inline__ void coarseRasterImpl(const CRParams p)
21
+ {
22
+ // Common.
23
+
24
+ __shared__ volatile U32 s_workCounter;
25
+ __shared__ volatile U32 s_scanTemp [CR_COARSE_WARPS][48]; // 3KB
26
+
27
+ // Input.
28
+
29
+ __shared__ volatile U32 s_binOrder [CR_MAXBINS_SQR]; // 1KB
30
+ __shared__ volatile S32 s_binStreamCurrSeg [CR_BIN_STREAMS_SIZE]; // 0KB
31
+ __shared__ volatile S32 s_binStreamFirstTri [CR_BIN_STREAMS_SIZE]; // 0KB
32
+ __shared__ volatile S32 s_triQueue [CR_COARSE_QUEUE_SIZE]; // 4KB
33
+ __shared__ volatile S32 s_triQueueWritePos;
34
+ __shared__ volatile U32 s_binStreamSelectedOfs;
35
+ __shared__ volatile U32 s_binStreamSelectedSize;
36
+
37
+ // Output.
38
+
39
+ __shared__ volatile U32 s_warpEmitMask [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions
40
+ __shared__ volatile U32 s_warpEmitPrefixSum [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions
41
+ __shared__ volatile U32 s_tileEmitPrefixSum [CR_BIN_SQR + 1]; // 1KB, zero at the beginning
42
+ __shared__ volatile U32 s_tileAllocPrefixSum[CR_BIN_SQR + 1]; // 1KB, zero at the beginning
43
+ __shared__ volatile S32 s_tileStreamCurrOfs [CR_BIN_SQR]; // 1KB
44
+ __shared__ volatile U32 s_firstAllocSeg;
45
+ __shared__ volatile U32 s_firstActiveIdx;
46
+
47
+ // Pointers and constants.
48
+
49
+ CRAtomics& atomics = p.atomics[blockIdx.z];
50
+ const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
51
+ const S32* binFirstSeg = (const S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
52
+ const S32* binTotal = (const S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
53
+ const S32* binSegData = (const S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
54
+ const S32* binSegNext = (const S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
55
+ const S32* binSegCount = (const S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
56
+ S32* activeTiles = (S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z;
57
+ S32* tileFirstSeg = (S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
58
+ S32* tileSegData = (S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
59
+ S32* tileSegNext = (S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z;
60
+ S32* tileSegCount = (S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
61
+
62
+ int tileLog = CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
63
+ int thrInBlock = threadIdx.x + threadIdx.y * 32;
64
+ int emitShift = CR_BIN_LOG2 * 2 + 5; // We scan ((numEmits << emitShift) | numAllocs) over tiles.
65
+
66
+ if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs)
67
+ return;
68
+
69
+ // Initialize sharedmem arrays.
70
+
71
+ if (thrInBlock == 0)
72
+ {
73
+ s_tileEmitPrefixSum[0] = 0;
74
+ s_tileAllocPrefixSum[0] = 0;
75
+ }
76
+ s_scanTemp[threadIdx.y][threadIdx.x] = 0;
77
+
78
+ // Sort bins in descending order of triangle count.
79
+
80
+ for (int binIdx = thrInBlock; binIdx < p.numBins; binIdx += CR_COARSE_WARPS * 32)
81
+ {
82
+ int count = 0;
83
+ for (int i = 0; i < CR_BIN_STREAMS_SIZE; i++)
84
+ count += binTotal[(binIdx << CR_BIN_STREAMS_LOG2) + i];
85
+ s_binOrder[binIdx] = (~count << (CR_MAXBINS_LOG2 * 2)) | binIdx;
86
+ }
87
+
88
+ __syncthreads();
89
+ sortShared(s_binOrder, p.numBins);
90
+
91
+ // Process each bin by one block.
92
+
93
+ for (;;)
94
+ {
95
+ // Pick a bin for the block.
96
+
97
+ if (thrInBlock == 0)
98
+ s_workCounter = atomicAdd(&atomics.coarseCounter, 1);
99
+ __syncthreads();
100
+
101
+ int workCounter = s_workCounter;
102
+ if (workCounter >= p.numBins)
103
+ break;
104
+
105
+ U32 binOrder = s_binOrder[workCounter];
106
+ bool binEmpty = ((~binOrder >> (CR_MAXBINS_LOG2 * 2)) == 0);
107
+ if (binEmpty && !p.deferredClear)
108
+ break;
109
+
110
+ int binIdx = binOrder & (CR_MAXBINS_SQR - 1);
111
+
112
+ // Initialize input/output streams.
113
+
114
+ int triQueueWritePos = 0;
115
+ int triQueueReadPos = 0;
116
+
117
+ if (thrInBlock < CR_BIN_STREAMS_SIZE)
118
+ {
119
+ int segIdx = binFirstSeg[(binIdx << CR_BIN_STREAMS_LOG2) + thrInBlock];
120
+ s_binStreamCurrSeg[thrInBlock] = segIdx;
121
+ s_binStreamFirstTri[thrInBlock] = (segIdx == -1) ? ~0u : binSegData[segIdx << CR_BIN_SEG_LOG2];
122
+ }
123
+
124
+ for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
125
+ s_tileStreamCurrOfs[tileInBin] = -CR_TILE_SEG_SIZE;
126
+
127
+ // Initialize per-bin state.
128
+
129
+ int binY = idiv_fast(binIdx, p.widthBins);
130
+ int binX = binIdx - binY * p.widthBins;
131
+ int originX = (binX << (CR_BIN_LOG2 + tileLog)) - (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
132
+ int originY = (binY << (CR_BIN_LOG2 + tileLog)) - (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
133
+ int maxTileXInBin = ::min(p.widthTiles - (binX << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
134
+ int maxTileYInBin = ::min(p.heightTiles - (binY << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
135
+ int binTileIdx = (binX + binY * p.widthTiles) << CR_BIN_LOG2;
136
+
137
+ // Entire block: Merge input streams and process triangles.
138
+
139
+ if (!binEmpty)
140
+ do
141
+ {
142
+ //------------------------------------------------------------------------
143
+ // Merge.
144
+ //------------------------------------------------------------------------
145
+
146
+ // Entire block: Not enough triangles => merge and queue segments.
147
+ // NOTE: The bin exit criterion assumes that we queue more triangles than we actually need.
148
+
149
+ while (triQueueWritePos - triQueueReadPos <= CR_COARSE_WARPS * 32)
150
+ {
151
+ // First warp: Choose the segment with the lowest initial triangle index.
152
+
153
+ bool hasStream = (thrInBlock < CR_BIN_STREAMS_SIZE);
154
+ U32 hasStreamMask = __ballot_sync(~0u, hasStream);
155
+ if (hasStream)
156
+ {
157
+ // Find the stream with the lowest triangle index.
158
+
159
+ U32 firstTri = s_binStreamFirstTri[thrInBlock];
160
+ U32 t = firstTri;
161
+ volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
162
+
163
+ #if (CR_BIN_STREAMS_SIZE > 1)
164
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-1]); __syncwarp(hasStreamMask);
165
+ #endif
166
+ #if (CR_BIN_STREAMS_SIZE > 2)
167
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-2]); __syncwarp(hasStreamMask);
168
+ #endif
169
+ #if (CR_BIN_STREAMS_SIZE > 4)
170
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-4]); __syncwarp(hasStreamMask);
171
+ #endif
172
+ #if (CR_BIN_STREAMS_SIZE > 8)
173
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-8]); __syncwarp(hasStreamMask);
174
+ #endif
175
+ #if (CR_BIN_STREAMS_SIZE > 16)
176
+ v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-16]); __syncwarp(hasStreamMask);
177
+ #endif
178
+ v[0] = t; __syncwarp(hasStreamMask);
179
+
180
+ // Consume and broadcast.
181
+
182
+ bool first = (s_scanTemp[0][CR_BIN_STREAMS_SIZE - 1 + 16] == firstTri);
183
+ U32 firstMask = __ballot_sync(hasStreamMask, first);
184
+ if (first && (firstMask >> threadIdx.x) == 1u)
185
+ {
186
+ int segIdx = s_binStreamCurrSeg[thrInBlock];
187
+ s_binStreamSelectedOfs = segIdx << CR_BIN_SEG_LOG2;
188
+ if (segIdx != -1)
189
+ {
190
+ int segSize = binSegCount[segIdx];
191
+ int segNext = binSegNext[segIdx];
192
+ s_binStreamSelectedSize = segSize;
193
+ s_triQueueWritePos = triQueueWritePos + segSize;
194
+ s_binStreamCurrSeg[thrInBlock] = segNext;
195
+ s_binStreamFirstTri[thrInBlock] = (segNext == -1) ? ~0u : binSegData[segNext << CR_BIN_SEG_LOG2];
196
+ }
197
+ }
198
+ }
199
+
200
+ // No more segments => break.
201
+
202
+ __syncthreads();
203
+ triQueueWritePos = s_triQueueWritePos;
204
+ int segOfs = s_binStreamSelectedOfs;
205
+ if (segOfs < 0)
206
+ break;
207
+
208
+ int segSize = s_binStreamSelectedSize;
209
+ __syncthreads();
210
+
211
+ // Fetch triangles into the queue.
212
+
213
+ for (int idxInSeg = CR_COARSE_WARPS * 32 - 1 - thrInBlock; idxInSeg < segSize; idxInSeg += CR_COARSE_WARPS * 32)
214
+ {
215
+ S32 triIdx = binSegData[segOfs + idxInSeg];
216
+ s_triQueue[(triQueueWritePos - segSize + idxInSeg) & (CR_COARSE_QUEUE_SIZE - 1)] = triIdx;
217
+ }
218
+ }
219
+
220
+ // All threads: Clear emit masks.
221
+
222
+ for (int maskIdx = thrInBlock; maskIdx < CR_COARSE_WARPS * CR_BIN_SQR; maskIdx += CR_COARSE_WARPS * 32)
223
+ s_warpEmitMask[maskIdx >> (CR_BIN_LOG2 * 2)][maskIdx & (CR_BIN_SQR - 1)] = 0;
224
+
225
+ __syncthreads();
226
+
227
+ //------------------------------------------------------------------------
228
+ // Raster.
229
+ //------------------------------------------------------------------------
230
+
231
+ // Triangle per thread: Read from the queue.
232
+
233
+ int triIdx = -1;
234
+ if (triQueueReadPos + thrInBlock < triQueueWritePos)
235
+ triIdx = s_triQueue[(triQueueReadPos + thrInBlock) & (CR_COARSE_QUEUE_SIZE - 1)];
236
+
237
+ uint4 triData = make_uint4(0, 0, 0, 0);
238
+ if (triIdx != -1)
239
+ {
240
+ int dataIdx = triIdx >> 3;
241
+ int subtriIdx = triIdx & 7;
242
+ if (subtriIdx != 7)
243
+ dataIdx = triHeader[dataIdx].misc + subtriIdx;
244
+ triData = *((uint4*)triHeader + dataIdx);
245
+ }
246
+
247
+ // 32 triangles per warp: Record emits (= tile intersections).
248
+
249
+ if (__any_sync(~0u, triIdx != -1))
250
+ {
251
+ S32 v0x = sub_s16lo_s16lo(triData.x, originX);
252
+ S32 v0y = sub_s16hi_s16lo(triData.x, originY);
253
+ S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
254
+ S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
255
+ S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
256
+ S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
257
+
258
+ // Compute tile-based AABB.
259
+
260
+ int lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
261
+ int loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
262
+ int hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
263
+ int hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
264
+ int sizex = add_sub(hix, 1, lox);
265
+ int sizey = add_sub(hiy, 1, loy);
266
+ int area = sizex * sizey;
267
+
268
+ // Miscellaneous init.
269
+
270
+ U8* currPtr = (U8*)&s_warpEmitMask[threadIdx.y][lox + (loy << CR_BIN_LOG2)];
271
+ int ptrYInc = CR_BIN_SIZE * 4 - (sizex << 2);
272
+ U32 maskBit = 1 << threadIdx.x;
273
+
274
+ // Case A: All AABBs are small => record the full AABB using atomics.
275
+
276
+ if (__all_sync(~0u, sizex <= 2 && sizey <= 2))
277
+ {
278
+ if (triIdx != -1)
279
+ {
280
+ atomicOr((U32*)currPtr, maskBit);
281
+ if (sizex == 2) atomicOr((U32*)(currPtr + 4), maskBit);
282
+ if (sizey == 2) atomicOr((U32*)(currPtr + CR_BIN_SIZE * 4), maskBit);
283
+ if (sizex == 2 && sizey == 2) atomicOr((U32*)(currPtr + 4 + CR_BIN_SIZE * 4), maskBit);
284
+ }
285
+ }
286
+ else
287
+ {
288
+ // Compute warp-AABB (scan-32).
289
+
290
+ U32 aabbMask = add_sub(2 << hix, 0x20000 << hiy, 1 << lox) - (0x10000 << loy);
291
+ if (triIdx == -1)
292
+ aabbMask = 0;
293
+
294
+ volatile U32* v = &s_scanTemp[threadIdx.y][threadIdx.x + 16];
295
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-1]; __syncwarp();
296
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-2]; __syncwarp();
297
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-4]; __syncwarp();
298
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-8]; __syncwarp();
299
+ v[0] = aabbMask; __syncwarp(); aabbMask |= v[-16]; __syncwarp();
300
+ v[0] = aabbMask; __syncwarp(); aabbMask = s_scanTemp[threadIdx.y][47];
301
+
302
+ U32 maskX = aabbMask & 0xFFFF;
303
+ U32 maskY = aabbMask >> 16;
304
+ int wlox = findLeadingOne(maskX ^ (maskX - 1));
305
+ int wloy = findLeadingOne(maskY ^ (maskY - 1));
306
+ int whix = findLeadingOne(maskX);
307
+ int whiy = findLeadingOne(maskY);
308
+ int warea = (add_sub(whix, 1, wlox)) * (add_sub(whiy, 1, wloy));
309
+
310
+ // Initialize edge functions.
311
+
312
+ S32 d12x = d02x - d01x;
313
+ S32 d12y = d02y - d01y;
314
+ v0x -= lox << tileLog;
315
+ v0y -= loy << tileLog;
316
+
317
+ S32 t01 = v0x * d01y - v0y * d01x;
318
+ S32 t02 = v0y * d02x - v0x * d02y;
319
+ S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
320
+ S32 b01 = add_sub(t01 >> tileLog, ::max(d01x, 0), ::min(d01y, 0));
321
+ S32 b02 = add_sub(t02 >> tileLog, ::max(d02y, 0), ::min(d02x, 0));
322
+ S32 b12 = add_sub(t12 >> tileLog, ::max(d12x, 0), ::min(d12y, 0));
323
+
324
+ d01x += sizex * d01y;
325
+ d02x += sizex * d02y;
326
+ d12x += sizex * d12y;
327
+
328
+ // Case B: Warp-AABB is not much larger than largest AABB => Check tiles in warp-AABB, record using ballots.
329
+ if (__any_sync(~0u, warea * 4 <= area * 8))
330
+ {
331
+ // Not sure if this is any faster than Case C after all the post-Volta ballot mask tracking.
332
+ bool act = (triIdx != -1);
333
+ U32 actMask = __ballot_sync(~0u, act);
334
+ if (act)
335
+ {
336
+ for (int y = wloy; y <= whiy; y++)
337
+ {
338
+ bool yIn = (y >= loy && y <= hiy);
339
+ U32 yMask = __ballot_sync(actMask, yIn);
340
+ if (yIn)
341
+ {
342
+ for (int x = wlox; x <= whix; x++)
343
+ {
344
+ bool xyIn = (x >= lox && x <= hix);
345
+ U32 xyMask = __ballot_sync(yMask, xyIn);
346
+ if (xyIn)
347
+ {
348
+ U32 res = __ballot_sync(xyMask, b01 >= 0 && b02 >= 0 && b12 >= 0);
349
+ if (threadIdx.x == 31 - __clz(xyMask))
350
+ *(U32*)currPtr = res;
351
+ currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
352
+ }
353
+ }
354
+ currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x;
355
+ }
356
+ }
357
+ }
358
+ }
359
+
360
+ // Case C: General case => Check tiles in AABB, record using atomics.
361
+
362
+ else
363
+ {
364
+ if (triIdx != -1)
365
+ {
366
+ U8* skipPtr = currPtr + (sizex << 2);
367
+ U8* endPtr = currPtr + (sizey << (CR_BIN_LOG2 + 2));
368
+ do
369
+ {
370
+ if (b01 >= 0 && b02 >= 0 && b12 >= 0)
371
+ atomicOr((U32*)currPtr, maskBit);
372
+ currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
373
+ if (currPtr == skipPtr)
374
+ currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += CR_BIN_SIZE * 4;
375
+ }
376
+ while (currPtr != endPtr);
377
+ }
378
+ }
379
+ }
380
+ }
381
+
382
+ __syncthreads();
383
+
384
+ //------------------------------------------------------------------------
385
+ // Count.
386
+ //------------------------------------------------------------------------
387
+
388
+ // Tile per thread: Initialize prefix sums.
389
+
390
+ for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
391
+ {
392
+ int tileInBin = tileInBin_base + thrInBlock;
393
+ bool act = (tileInBin < CR_BIN_SQR);
394
+ U32 actMask = __ballot_sync(~0u, act);
395
+ if (act)
396
+ {
397
+ // Compute prefix sum of emits over warps.
398
+
399
+ U8* srcPtr = (U8*)&s_warpEmitMask[0][tileInBin];
400
+ U8* dstPtr = (U8*)&s_warpEmitPrefixSum[0][tileInBin];
401
+ int tileEmits = 0;
402
+ for (int i = 0; i < CR_COARSE_WARPS; i++)
403
+ {
404
+ tileEmits += __popc(*(U32*)srcPtr);
405
+ *(U32*)dstPtr = tileEmits;
406
+ srcPtr += (CR_BIN_SQR + 1) * 4;
407
+ dstPtr += (CR_BIN_SQR + 1) * 4;
408
+ }
409
+
410
+ // Determine the number of segments to allocate.
411
+
412
+ int spaceLeft = -s_tileStreamCurrOfs[tileInBin] & (CR_TILE_SEG_SIZE - 1);
413
+ int tileAllocs = (tileEmits - spaceLeft + CR_TILE_SEG_SIZE - 1) >> CR_TILE_SEG_LOG2;
414
+ volatile U32* v = &s_tileEmitPrefixSum[tileInBin + 1];
415
+
416
+ // All counters within the warp are small => compute prefix sum using ballot.
417
+
418
+ if (!__any_sync(actMask, tileEmits >= 2))
419
+ {
420
+ U32 m = getLaneMaskLe();
421
+ *v = (__popc(__ballot_sync(actMask, tileEmits & 1) & m) << emitShift) | __popc(__ballot_sync(actMask, tileAllocs & 1) & m);
422
+ }
423
+
424
+ // Otherwise => scan-32 within the warp.
425
+
426
+ else
427
+ {
428
+ U32 sum = (tileEmits << emitShift) | tileAllocs;
429
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 1) sum += v[-1]; __syncwarp(actMask);
430
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 2) sum += v[-2]; __syncwarp(actMask);
431
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 4) sum += v[-4]; __syncwarp(actMask);
432
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 8) sum += v[-8]; __syncwarp(actMask);
433
+ *v = sum; __syncwarp(actMask); if (threadIdx.x >= 16) sum += v[-16]; __syncwarp(actMask);
434
+ *v = sum; __syncwarp(actMask);
435
+ }
436
+ }
437
+ }
438
+
439
+ // First warp: Scan-8.
440
+
441
+ __syncthreads();
442
+
443
+ bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
444
+ U32 scan8Mask = __ballot_sync(~0u, scan8);
445
+ if (scan8)
446
+ {
447
+ int sum = s_tileEmitPrefixSum[(thrInBlock << 5) + 32];
448
+ volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
449
+ v[0] = sum; __syncwarp(scan8Mask);
450
+ #if (CR_BIN_SQR > 1 * 32)
451
+ sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
452
+ #endif
453
+ #if (CR_BIN_SQR > 2 * 32)
454
+ sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
455
+ #endif
456
+ #if (CR_BIN_SQR > 4 * 32)
457
+ sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
458
+ #endif
459
+ }
460
+
461
+ __syncthreads();
462
+
463
+ // Tile per thread: Finalize prefix sums.
464
+ // Single thread: Allocate segments.
465
+
466
+ for (int tileInBin = thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
467
+ {
468
+ int sum = s_tileEmitPrefixSum[tileInBin + 1] + s_scanTemp[0][(tileInBin >> 5) + 15];
469
+ int numEmits = sum >> emitShift;
470
+ int numAllocs = sum & ((1 << emitShift) - 1);
471
+ s_tileEmitPrefixSum[tileInBin + 1] = numEmits;
472
+ s_tileAllocPrefixSum[tileInBin + 1] = numAllocs;
473
+
474
+ if (tileInBin == CR_BIN_SQR - 1 && numAllocs != 0)
475
+ {
476
+ int t = atomicAdd(&atomics.numTileSegs, numAllocs);
477
+ s_firstAllocSeg = (t + numAllocs <= p.maxTileSegs) ? t : 0;
478
+ }
479
+ }
480
+
481
+ __syncthreads();
482
+ int firstAllocSeg = s_firstAllocSeg;
483
+ int totalEmits = s_tileEmitPrefixSum[CR_BIN_SQR];
484
+ int totalAllocs = s_tileAllocPrefixSum[CR_BIN_SQR];
485
+
486
+ //------------------------------------------------------------------------
487
+ // Emit.
488
+ //------------------------------------------------------------------------
489
+
490
+ // Emit per thread: Write triangle index to globalmem.
491
+
492
+ for (int emitInBin = thrInBlock; emitInBin < totalEmits; emitInBin += CR_COARSE_WARPS * 32)
493
+ {
494
+ // Find tile in bin.
495
+
496
+ U8* tileBase = (U8*)&s_tileEmitPrefixSum[0];
497
+ U8* tilePtr = tileBase;
498
+ U8* ptr;
499
+
500
+ #if (CR_BIN_SQR > 128)
501
+ ptr = tilePtr + 0x80 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
502
+ #endif
503
+ #if (CR_BIN_SQR > 64)
504
+ ptr = tilePtr + 0x40 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
505
+ #endif
506
+ #if (CR_BIN_SQR > 32)
507
+ ptr = tilePtr + 0x20 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
508
+ #endif
509
+ #if (CR_BIN_SQR > 16)
510
+ ptr = tilePtr + 0x10 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
511
+ #endif
512
+ #if (CR_BIN_SQR > 8)
513
+ ptr = tilePtr + 0x08 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
514
+ #endif
515
+ #if (CR_BIN_SQR > 4)
516
+ ptr = tilePtr + 0x04 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
517
+ #endif
518
+ #if (CR_BIN_SQR > 2)
519
+ ptr = tilePtr + 0x02 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
520
+ #endif
521
+ #if (CR_BIN_SQR > 1)
522
+ ptr = tilePtr + 0x01 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
523
+ #endif
524
+
525
+ int tileInBin = (tilePtr - tileBase) >> 2;
526
+ int emitInTile = emitInBin - *(U32*)tilePtr;
527
+
528
+ // Find warp in tile.
529
+
530
+ int warpStep = (CR_BIN_SQR + 1) * 4;
531
+ U8* warpBase = (U8*)&s_warpEmitPrefixSum[0][tileInBin] - warpStep;
532
+ U8* warpPtr = warpBase;
533
+
534
+ #if (CR_COARSE_WARPS > 8)
535
+ ptr = warpPtr + 0x08 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
536
+ #endif
537
+ #if (CR_COARSE_WARPS > 4)
538
+ ptr = warpPtr + 0x04 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
539
+ #endif
540
+ #if (CR_COARSE_WARPS > 2)
541
+ ptr = warpPtr + 0x02 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
542
+ #endif
543
+ #if (CR_COARSE_WARPS > 1)
544
+ ptr = warpPtr + 0x01 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
545
+ #endif
546
+
547
+ int warpInTile = (warpPtr - warpBase) >> (CR_BIN_LOG2 * 2 + 2);
548
+ U32 emitMask = *(U32*)(warpPtr + warpStep + ((U8*)s_warpEmitMask - (U8*)s_warpEmitPrefixSum));
549
+ int emitInWarp = emitInTile - *(U32*)(warpPtr + warpStep) + __popc(emitMask);
550
+
551
+ // Find thread in warp.
552
+
553
+ int threadInWarp = 0;
554
+ int pop = __popc(emitMask & 0xFFFF);
555
+ bool pred = (emitInWarp >= pop);
556
+ if (pred) emitInWarp -= pop;
557
+ if (pred) emitMask >>= 0x10;
558
+ if (pred) threadInWarp += 0x10;
559
+
560
+ pop = __popc(emitMask & 0xFF);
561
+ pred = (emitInWarp >= pop);
562
+ if (pred) emitInWarp -= pop;
563
+ if (pred) emitMask >>= 0x08;
564
+ if (pred) threadInWarp += 0x08;
565
+
566
+ pop = __popc(emitMask & 0xF);
567
+ pred = (emitInWarp >= pop);
568
+ if (pred) emitInWarp -= pop;
569
+ if (pred) emitMask >>= 0x04;
570
+ if (pred) threadInWarp += 0x04;
571
+
572
+ pop = __popc(emitMask & 0x3);
573
+ pred = (emitInWarp >= pop);
574
+ if (pred) emitInWarp -= pop;
575
+ if (pred) emitMask >>= 0x02;
576
+ if (pred) threadInWarp += 0x02;
577
+
578
+ if (emitInWarp >= (emitMask & 1))
579
+ threadInWarp++;
580
+
581
+ // Figure out where to write.
582
+
583
+ int currOfs = s_tileStreamCurrOfs[tileInBin];
584
+ int spaceLeft = -currOfs & (CR_TILE_SEG_SIZE - 1);
585
+ int outOfs = emitInTile;
586
+
587
+ if (outOfs < spaceLeft)
588
+ outOfs += currOfs;
589
+ else
590
+ {
591
+ int allocLo = firstAllocSeg + s_tileAllocPrefixSum[tileInBin];
592
+ outOfs += (allocLo << CR_TILE_SEG_LOG2) - spaceLeft;
593
+ }
594
+
595
+ // Write.
596
+
597
+ int queueIdx = warpInTile * 32 + threadInWarp;
598
+ int triIdx = s_triQueue[(triQueueReadPos + queueIdx) & (CR_COARSE_QUEUE_SIZE - 1)];
599
+
600
+ tileSegData[outOfs] = triIdx;
601
+ }
602
+
603
+ //------------------------------------------------------------------------
604
+ // Patch.
605
+ //------------------------------------------------------------------------
606
+
607
+ // Allocated segment per thread: Initialize next-pointer and count.
608
+
609
+ for (int i = CR_COARSE_WARPS * 32 - 1 - thrInBlock; i < totalAllocs; i += CR_COARSE_WARPS * 32)
610
+ {
611
+ int segIdx = firstAllocSeg + i;
612
+ tileSegNext[segIdx] = segIdx + 1;
613
+ tileSegCount[segIdx] = CR_TILE_SEG_SIZE;
614
+ }
615
+
616
+ // Tile per thread: Fix previous segment's next-pointer and update s_tileStreamCurrOfs.
617
+
618
+ __syncthreads();
619
+ for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
620
+ {
621
+ int oldOfs = s_tileStreamCurrOfs[tileInBin];
622
+ int newOfs = oldOfs + s_warpEmitPrefixSum[CR_COARSE_WARPS - 1][tileInBin];
623
+ int allocLo = s_tileAllocPrefixSum[tileInBin];
624
+ int allocHi = s_tileAllocPrefixSum[tileInBin + 1];
625
+
626
+ if (allocLo != allocHi)
627
+ {
628
+ S32* nextPtr = &tileSegNext[(oldOfs - 1) >> CR_TILE_SEG_LOG2];
629
+ if (oldOfs < 0)
630
+ nextPtr = &tileFirstSeg[binTileIdx + globalTileIdx(tileInBin, p.widthTiles)];
631
+ *nextPtr = firstAllocSeg + allocLo;
632
+
633
+ newOfs--;
634
+ newOfs &= CR_TILE_SEG_SIZE - 1;
635
+ newOfs |= (firstAllocSeg + allocHi - 1) << CR_TILE_SEG_LOG2;
636
+ newOfs++;
637
+ }
638
+ s_tileStreamCurrOfs[tileInBin] = newOfs;
639
+ }
640
+
641
+ // Advance queue read pointer.
642
+ // Queue became empty => bin done.
643
+
644
+ triQueueReadPos += CR_COARSE_WARPS * 32;
645
+ }
646
+ while (triQueueReadPos < triQueueWritePos);
647
+
648
+ // Tile per thread: Fix next-pointer and count of the last segment.
649
+ // 32 tiles per warp: Count active tiles.
650
+
651
+ __syncthreads();
652
+
653
+ for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
654
+ {
655
+ int tileInBin = tileInBin_base + thrInBlock;
656
+ bool act = (tileInBin < CR_BIN_SQR);
657
+ U32 actMask = __ballot_sync(~0u, act);
658
+ if (act)
659
+ {
660
+ int tileX = tileInBin & (CR_BIN_SIZE - 1);
661
+ int tileY = tileInBin >> CR_BIN_LOG2;
662
+ bool force = (p.deferredClear & tileX <= maxTileXInBin & tileY <= maxTileYInBin);
663
+
664
+ int ofs = s_tileStreamCurrOfs[tileInBin];
665
+ int segIdx = (ofs - 1) >> CR_TILE_SEG_LOG2;
666
+ int segCount = ofs & (CR_TILE_SEG_SIZE - 1);
667
+
668
+ if (ofs >= 0)
669
+ tileSegNext[segIdx] = -1;
670
+ else if (force)
671
+ {
672
+ s_tileStreamCurrOfs[tileInBin] = 0;
673
+ tileFirstSeg[binTileIdx + tileX + tileY * p.widthTiles] = -1;
674
+ }
675
+
676
+ if (segCount != 0)
677
+ tileSegCount[segIdx] = segCount;
678
+
679
+ U32 res = __ballot_sync(actMask, ofs >= 0 | force);
680
+ if (threadIdx.x == 0)
681
+ s_scanTemp[0][(tileInBin >> 5) + 16] = __popc(res);
682
+ }
683
+ }
684
+
685
+ // First warp: Scan-8.
686
+ // One thread: Allocate space for active tiles.
687
+
688
+ __syncthreads();
689
+
690
+ bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
691
+ U32 scan8Mask = __ballot_sync(~0u, scan8);
692
+ if (scan8)
693
+ {
694
+ volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
695
+ U32 sum = v[0];
696
+ #if (CR_BIN_SQR > 1 * 32)
697
+ sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
698
+ #endif
699
+ #if (CR_BIN_SQR > 2 * 32)
700
+ sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
701
+ #endif
702
+ #if (CR_BIN_SQR > 4 * 32)
703
+ sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
704
+ #endif
705
+
706
+ if (thrInBlock == CR_BIN_SQR / 32 - 1)
707
+ s_firstActiveIdx = atomicAdd(&atomics.numActiveTiles, sum);
708
+ }
709
+
710
+ // Tile per thread: Output active tiles.
711
+
712
+ __syncthreads();
713
+
714
+ for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
715
+ {
716
+ int tileInBin = tileInBin_base + thrInBlock;
717
+ bool act = (tileInBin < CR_BIN_SQR) && (s_tileStreamCurrOfs[tileInBin] >= 0);
718
+ U32 actMask = __ballot_sync(~0u, act);
719
+ if (act)
720
+ {
721
+ int activeIdx = s_firstActiveIdx;
722
+ activeIdx += s_scanTemp[0][(tileInBin >> 5) + 15];
723
+ activeIdx += __popc(actMask & getLaneMaskLt());
724
+ activeTiles[activeIdx] = binTileIdx + globalTileIdx(tileInBin, p.widthTiles);
725
+ }
726
+ }
727
+ }
728
+ }
729
+
730
+ //------------------------------------------------------------------------
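
The Count phase above takes a shortcut when every per-tile counter in a warp is 0 or 1: instead of a log2(32)-step shared-memory scan, it builds the inclusive prefix sum from a single ballot and a popcount of the lanes at or below the current one. A minimal standalone CUDA sketch of that idiom, assuming full-warp participation (the laneMaskLe helper below is an illustrative stand-in for the getLaneMaskLe utility the kernel calls):

__device__ __inline__ unsigned int laneMaskLe()
{
    // Mask with bits 0..laneId set, i.e. all lanes at or below the current lane.
    return 0xFFFFFFFFu >> (31 - (threadIdx.x & 31));
}

__device__ __inline__ int inclusivePrefixSumOfBit(int bit)        // bit is 0 or 1
{
    unsigned int ballot = __ballot_sync(0xFFFFFFFFu, bit != 0);
    return __popc(ballot & laneMaskLe());                         // inclusive prefix sum over the warp
}
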
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Constants.hpp ADDED
@@ -0,0 +1,73 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #pragma once
10
+
11
+ //------------------------------------------------------------------------
12
+
13
+ #define CR_MAXVIEWPORT_LOG2 11 // ViewportSize / PixelSize.
14
+ #define CR_SUBPIXEL_LOG2 4 // PixelSize / SubpixelSize.
15
+
16
+ #define CR_MAXBINS_LOG2 4 // ViewportSize / BinSize.
17
+ #define CR_BIN_LOG2 4 // BinSize / TileSize.
18
+ #define CR_TILE_LOG2 3 // TileSize / PixelSize.
19
+
20
+ #define CR_COVER8X8_LUT_SIZE 768 // 64-bit entries.
21
+ #define CR_FLIPBIT_FLIP_Y 2
22
+ #define CR_FLIPBIT_FLIP_X 3
23
+ #define CR_FLIPBIT_SWAP_XY 4
24
+ #define CR_FLIPBIT_COMPL 5
25
+
26
+ #define CR_BIN_STREAMS_LOG2 4
27
+ #define CR_BIN_SEG_LOG2 9 // 32-bit entries.
28
+ #define CR_TILE_SEG_LOG2 5 // 32-bit entries.
29
+
30
+ #define CR_MAXSUBTRIS_LOG2 24 // Triangle structs. Dictated by CoarseRaster.
31
+ #define CR_COARSE_QUEUE_LOG2 10 // Triangles.
32
+
33
+ #define CR_SETUP_WARPS 2
34
+ #define CR_SETUP_OPT_BLOCKS 8
35
+ #define CR_BIN_WARPS 16
36
+ #define CR_COARSE_WARPS 16 // Must be a power of two.
37
+ #define CR_FINE_MAX_WARPS 20
38
+
39
+ #define CR_EMBED_IMAGE_PARAMS 32 // Number of per-image parameter structs embedded in kernel launch parameter block.
40
+
41
+ //------------------------------------------------------------------------
42
+
43
+ #define CR_MAXVIEWPORT_SIZE (1 << CR_MAXVIEWPORT_LOG2)
44
+ #define CR_SUBPIXEL_SIZE (1 << CR_SUBPIXEL_LOG2)
45
+ #define CR_SUBPIXEL_SQR (1 << (CR_SUBPIXEL_LOG2 * 2))
46
+
47
+ #define CR_MAXBINS_SIZE (1 << CR_MAXBINS_LOG2)
48
+ #define CR_MAXBINS_SQR (1 << (CR_MAXBINS_LOG2 * 2))
49
+ #define CR_BIN_SIZE (1 << CR_BIN_LOG2)
50
+ #define CR_BIN_SQR (1 << (CR_BIN_LOG2 * 2))
51
+
52
+ #define CR_MAXTILES_LOG2 (CR_MAXBINS_LOG2 + CR_BIN_LOG2)
53
+ #define CR_MAXTILES_SIZE (1 << CR_MAXTILES_LOG2)
54
+ #define CR_MAXTILES_SQR (1 << (CR_MAXTILES_LOG2 * 2))
55
+ #define CR_TILE_SIZE (1 << CR_TILE_LOG2)
56
+ #define CR_TILE_SQR (1 << (CR_TILE_LOG2 * 2))
57
+
58
+ #define CR_BIN_STREAMS_SIZE (1 << CR_BIN_STREAMS_LOG2)
59
+ #define CR_BIN_SEG_SIZE (1 << CR_BIN_SEG_LOG2)
60
+ #define CR_TILE_SEG_SIZE (1 << CR_TILE_SEG_LOG2)
61
+
62
+ #define CR_MAXSUBTRIS_SIZE (1 << CR_MAXSUBTRIS_LOG2)
63
+ #define CR_COARSE_QUEUE_SIZE (1 << CR_COARSE_QUEUE_LOG2)
64
+
65
+ //------------------------------------------------------------------------
66
+ // When evaluating interpolated Z pixel centers, we may introduce an error
67
+ // of (+-CR_LERP_ERROR) ULPs.
68
+
69
+ #define CR_LERP_ERROR(SAMPLES_LOG2) (2200u << (SAMPLES_LOG2))
70
+ #define CR_DEPTH_MIN CR_LERP_ERROR(3)
71
+ #define CR_DEPTH_MAX (CR_U32_MAX - CR_LERP_ERROR(3))
72
+
73
+ //------------------------------------------------------------------------
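
Taken together, the log2 constants above fix the spatial hierarchy of the rasterizer: a tile is 8x8 pixels, a bin is 16x16 tiles (128x128 pixels), and the largest supported viewport is 2048x2048 pixels. A brief sketch that spells out these derived sizes as hypothetical compile-time checks (not part of the header itself):

static_assert((1 << CR_TILE_LOG2) == 8,                    "a tile is 8x8 pixels");
static_assert((1 << CR_BIN_LOG2) == 16,                    "a bin is 16x16 tiles");
static_assert((1 << (CR_BIN_LOG2 + CR_TILE_LOG2)) == 128,  "a bin covers 128x128 pixels");
static_assert((1 << CR_MAXVIEWPORT_LOG2) == 2048,          "maximum viewport is 2048x2048 pixels");
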
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CudaRaster.cpp ADDED
@@ -0,0 +1,79 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #include "Defs.hpp"
10
+ #include "../CudaRaster.hpp"
11
+ #include "RasterImpl.hpp"
12
+
13
+ using namespace CR;
14
+
15
+ //------------------------------------------------------------------------
16
+ // Stub interface implementation.
17
+ //------------------------------------------------------------------------
18
+
19
+ CudaRaster::CudaRaster()
20
+ {
21
+ m_impl = new RasterImpl();
22
+ }
23
+
24
+ CudaRaster::~CudaRaster()
25
+ {
26
+ delete m_impl;
27
+ }
28
+
29
+ void CudaRaster::setBufferSize(int width, int height, int numImages)
30
+ {
31
+ m_impl->setBufferSize(Vec3i(width, height, numImages));
32
+ }
33
+
34
+ void CudaRaster::setViewport(int width, int height, int offsetX, int offsetY)
35
+ {
36
+ m_impl->setViewport(Vec2i(width, height), Vec2i(offsetX, offsetY));
37
+ }
38
+
39
+ void CudaRaster::setRenderModeFlags(U32 flags)
40
+ {
41
+ m_impl->setRenderModeFlags(flags);
42
+ }
43
+
44
+ void CudaRaster::deferredClear(U32 clearColor)
45
+ {
46
+ m_impl->deferredClear(clearColor);
47
+ }
48
+
49
+ void CudaRaster::setVertexBuffer(void* vertices, int numVertices)
50
+ {
51
+ m_impl->setVertexBuffer(vertices, numVertices);
52
+ }
53
+
54
+ void CudaRaster::setIndexBuffer(void* indices, int numTriangles)
55
+ {
56
+ m_impl->setIndexBuffer(indices, numTriangles);
57
+ }
58
+
59
+ bool CudaRaster::drawTriangles(const int* ranges, bool peel, cudaStream_t stream)
60
+ {
61
+ return m_impl->drawTriangles((const Vec2i*)ranges, peel, stream);
62
+ }
63
+
64
+ void* CudaRaster::getColorBuffer(void)
65
+ {
66
+ return m_impl->getColorBuffer();
67
+ }
68
+
69
+ void* CudaRaster::getDepthBuffer(void)
70
+ {
71
+ return m_impl->getDepthBuffer();
72
+ }
73
+
74
+ void CudaRaster::swapDepthAndPeel(void)
75
+ {
76
+ m_impl->swapDepthAndPeel();
77
+ }
78
+
79
+ //------------------------------------------------------------------------
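
The stub above simply forwards to RasterImpl, so the public workflow is: size the buffers, set the viewport, point the rasterizer at clip-space vertices and triangle indices, then draw. A minimal host-side usage sketch under the assumption that d_vertices and d_indices are caller-prepared device pointers (float4 clip-space positions and int3 index triplets, as documented in PrivateDefs.hpp):

CR::CudaRaster cr;
cr.setBufferSize(1024, 1024, 1);                 // internal framebuffer: 1024x1024, one image
cr.setViewport(1024, 1024, 0, 0);                // viewport covers the whole buffer
cr.deferredClear(0u);                            // clear lazily while rasterizing
cr.setVertexBuffer(d_vertices, numVertices);     // float4(x, y, z, w) clip-space positions
cr.setIndexBuffer(d_indices, numTriangles);      // int3(vi0, vi1, vi2) per triangle
bool ok = cr.drawTriangles(nullptr, false, 0);   // nullptr ranges => instance mode, no depth peeling
void* d_color = cr.getColorBuffer();             // U32 per pixel
void* d_depth = cr.getDepthBuffer();             // U32 per pixel

drawTriangles blocks until rasterization completes and, per RasterImpl.cpp below, returns false only when the internal buffers cannot be grown any further (the CR_MAXSUBTRIS_SIZE cap).
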
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Defs.hpp ADDED
@@ -0,0 +1,90 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #pragma once
10
+ #include <cuda_runtime.h>
11
+ #include <cstdint>
12
+
13
+ namespace CR
14
+ {
15
+ //------------------------------------------------------------------------
16
+
17
+ #ifndef NULL
18
+ # define NULL 0
19
+ #endif
20
+
21
+ #ifdef __CUDACC__
22
+ # define CR_CUDA 1
23
+ #else
24
+ # define CR_CUDA 0
25
+ #endif
26
+
27
+ #if CR_CUDA
28
+ # define CR_CUDA_FUNC __device__ __inline__
29
+ # define CR_CUDA_CONST __constant__
30
+ #else
31
+ # define CR_CUDA_FUNC inline
32
+ # define CR_CUDA_CONST static const
33
+ #endif
34
+
35
+ #define CR_UNREF(X) ((void)(X))
36
+ #define CR_ARRAY_SIZE(X) ((int)(sizeof(X) / sizeof((X)[0])))
37
+
38
+ //------------------------------------------------------------------------
39
+
40
+ typedef uint8_t U8;
41
+ typedef uint16_t U16;
42
+ typedef uint32_t U32;
43
+ typedef uint64_t U64;
44
+ typedef int8_t S8;
45
+ typedef int16_t S16;
46
+ typedef int32_t S32;
47
+ typedef int64_t S64;
48
+ typedef float F32;
49
+ typedef double F64;
50
+ typedef void (*FuncPtr)(void);
51
+
52
+ //------------------------------------------------------------------------
53
+
54
+ #define CR_U32_MAX (0xFFFFFFFFu)
55
+ #define CR_S32_MIN (~0x7FFFFFFF)
56
+ #define CR_S32_MAX (0x7FFFFFFF)
57
+ #define CR_U64_MAX ((U64)(S64)-1)
58
+ #define CR_S64_MIN ((S64)-1 << 63)
59
+ #define CR_S64_MAX (~((S64)-1 << 63))
60
+ #define CR_F32_MIN (1.175494351e-38f)
61
+ #define CR_F32_MAX (3.402823466e+38f)
62
+ #define CR_F64_MIN (2.2250738585072014e-308)
63
+ #define CR_F64_MAX (1.7976931348623158e+308)
64
+
65
+ //------------------------------------------------------------------------
66
+ // Misc types.
67
+
68
+ class Vec2i
69
+ {
70
+ public:
71
+ Vec2i(int x_, int y_) : x(x_), y(y_) {}
72
+ int x, y;
73
+ };
74
+
75
+ class Vec3i
76
+ {
77
+ public:
78
+ Vec3i(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
79
+ int x, y, z;
80
+ };
81
+
82
+ //------------------------------------------------------------------------
83
+ // CUDA utilities.
84
+
85
+ #if CR_CUDA
86
+ # define globalThreadIdx (threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * (blockIdx.x + gridDim.x * blockIdx.y)))
87
+ #endif
88
+
89
+ //------------------------------------------------------------------------
90
+ } // namespace CR
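
As a brief illustration of the helpers above, a hypothetical kernel (not part of the upload) can use globalThreadIdx to flatten a 2D block / 2D grid launch into one linear index, and CR_ARRAY_SIZE to count the elements of a static array:

__global__ void fillKernel(CR::U32* out, int count)
{
    int idx = globalThreadIdx;      // threadIdx/blockIdx flattened into a linear index
    if (idx < count)
        out[idx] = (CR::U32)idx;
}

static const CR::S32 batchSizes[] = { 1, 2, 4, 8 };
static const int numBatchSizes = CR_ARRAY_SIZE(batchSizes);   // == 4
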
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_FineRaster.inl ADDED
@@ -0,0 +1,385 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ //------------------------------------------------------------------------
10
+ // Utility funcs.
11
+ //------------------------------------------------------------------------
12
+
13
+ __device__ __inline__ void initTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth)
14
+ {
15
+ tileZMax = CR_DEPTH_MAX;
16
+ tileZUpd = (::min(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]) < tileZMax);
17
+ }
18
+
19
+ __device__ __inline__ void updateTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth, volatile U32* temp)
20
+ {
21
+ // Entry is warp-coherent.
22
+ if (__any_sync(~0u, tileZUpd))
23
+ {
24
+ U32 z = ::max(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]); __syncwarp();
25
+ temp[threadIdx.x + 16] = z; __syncwarp();
26
+ z = ::max(z, temp[threadIdx.x + 16 - 1]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
27
+ z = ::max(z, temp[threadIdx.x + 16 - 2]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
28
+ z = ::max(z, temp[threadIdx.x + 16 - 4]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
29
+ z = ::max(z, temp[threadIdx.x + 16 - 8]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
30
+ z = ::max(z, temp[threadIdx.x + 16 - 16]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
31
+ tileZMax = temp[47];
32
+ tileZUpd = false;
33
+ }
34
+ }
35
+
36
+ //------------------------------------------------------------------------
37
+
38
+ __device__ __inline__ void getTriangle(const CRParams& p, S32& triIdx, S32& dataIdx, uint4& triHeader, S32& segment)
39
+ {
40
+ const CRTriangleHeader* triHeaderPtr = (const CRTriangleHeader*)p.triHeader + blockIdx.z * p.maxSubtris;
41
+ const S32* tileSegData = (const S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
42
+ const S32* tileSegNext = (const S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z;
43
+ const S32* tileSegCount = (const S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
44
+
45
+ if (threadIdx.x >= tileSegCount[segment])
46
+ {
47
+ triIdx = -1;
48
+ dataIdx = -1;
49
+ }
50
+ else
51
+ {
52
+ int subtriIdx = tileSegData[segment * CR_TILE_SEG_SIZE + threadIdx.x];
53
+ triIdx = subtriIdx >> 3;
54
+ dataIdx = triIdx;
55
+ subtriIdx &= 7;
56
+ if (subtriIdx != 7)
57
+ dataIdx = triHeaderPtr[triIdx].misc + subtriIdx;
58
+ triHeader = *((uint4*)triHeaderPtr + dataIdx);
59
+ }
60
+
61
+ // advance to next segment
62
+ segment = tileSegNext[segment];
63
+ }
64
+
65
+ //------------------------------------------------------------------------
66
+
67
+ __device__ __inline__ bool earlyZCull(uint4 triHeader, U32 tileZMax)
68
+ {
69
+ U32 zmin = triHeader.w & 0xFFFFF000;
70
+ return (zmin > tileZMax);
71
+ }
72
+
73
+ //------------------------------------------------------------------------
74
+
75
+ __device__ __inline__ U64 trianglePixelCoverage(const CRParams& p, const uint4& triHeader, int tileX, int tileY, volatile U64* s_cover8x8_lut)
76
+ {
77
+ int baseX = (tileX << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.widthPixelsVp - 1) << (CR_SUBPIXEL_LOG2 - 1));
78
+ int baseY = (tileY << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.heightPixelsVp - 1) << (CR_SUBPIXEL_LOG2 - 1));
79
+
80
+ // extract S16 vertex positions while subtracting tile coordinates
81
+ S32 v0x = sub_s16lo_s16lo(triHeader.x, baseX);
82
+ S32 v0y = sub_s16hi_s16lo(triHeader.x, baseY);
83
+ S32 v01x = sub_s16lo_s16lo(triHeader.y, triHeader.x);
84
+ S32 v01y = sub_s16hi_s16hi(triHeader.y, triHeader.x);
85
+ S32 v20x = sub_s16lo_s16lo(triHeader.x, triHeader.z);
86
+ S32 v20y = sub_s16hi_s16hi(triHeader.x, triHeader.z);
87
+
88
+ // extract flipbits
89
+ U32 f01 = (triHeader.w >> 6) & 0x3C;
90
+ U32 f12 = (triHeader.w >> 2) & 0x3C;
91
+ U32 f20 = (triHeader.w << 2) & 0x3C;
92
+
93
+ // compute per-edge coverage masks
94
+ U64 c01, c12, c20;
95
+ c01 = cover8x8_exact_fast(v0x, v0y, v01x, v01y, f01, s_cover8x8_lut);
96
+ c12 = cover8x8_exact_fast(v0x + v01x, v0y + v01y, -v01x - v20x, -v01y - v20y, f12, s_cover8x8_lut);
97
+ c20 = cover8x8_exact_fast(v0x, v0y, v20x, v20y, f20, s_cover8x8_lut);
98
+
99
+ // combine masks
100
+ return c01 & c12 & c20;
101
+ }
102
+
103
+ //------------------------------------------------------------------------
104
+
105
+ __device__ __inline__ U32 scan32_value(U32 value, volatile U32* temp)
106
+ {
107
+ __syncwarp();
108
+ temp[threadIdx.x + 16] = value; __syncwarp();
109
+ value += temp[threadIdx.x + 16 - 1]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
110
+ value += temp[threadIdx.x + 16 - 2]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
111
+ value += temp[threadIdx.x + 16 - 4]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
112
+ value += temp[threadIdx.x + 16 - 8]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
113
+ value += temp[threadIdx.x + 16 - 16]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
114
+ return value;
115
+ }
116
+
117
+ __device__ __inline__ volatile const U32& scan32_total(volatile U32* temp)
118
+ {
119
+ return temp[47];
120
+ }
121
+
122
+ //------------------------------------------------------------------------
123
+
124
+ __device__ __inline__ S32 findBit(U64 mask, int idx)
125
+ {
126
+ U32 x = getLo(mask);
127
+ int pop = __popc(x);
128
+ bool p = (pop <= idx);
129
+ if (p) x = getHi(mask);
130
+ if (p) idx -= pop;
131
+ int bit = p ? 32 : 0;
132
+
133
+ pop = __popc(x & 0x0000ffffu);
134
+ p = (pop <= idx);
135
+ if (p) x >>= 16;
136
+ if (p) bit += 16;
137
+ if (p) idx -= pop;
138
+
139
+ U32 tmp = x & 0x000000ffu;
140
+ pop = __popc(tmp);
141
+ p = (pop <= idx);
142
+ if (p) tmp = x & 0x0000ff00u;
143
+ if (p) idx -= pop;
144
+
145
+ return findLeadingOne(tmp) + bit - idx;
146
+ }
147
+
148
+ //------------------------------------------------------------------------
149
+ // Single-sample implementation.
150
+ //------------------------------------------------------------------------
151
+
152
+ __device__ __inline__ void executeROP(U32 color, U32 depth, volatile U32* pColor, volatile U32* pDepth, U32 ropMask)
153
+ {
154
+ atomicMin((U32*)pDepth, depth);
155
+ __syncwarp(ropMask);
156
+ bool act = (depth == *pDepth);
157
+ __syncwarp(ropMask);
158
+ U32 actMask = __ballot_sync(ropMask, act);
159
+ if (act)
160
+ {
161
+ *pDepth = 0;
162
+ __syncwarp(actMask);
163
+ atomicMax((U32*)pDepth, threadIdx.x);
164
+ __syncwarp(actMask);
165
+ if (*pDepth == threadIdx.x)
166
+ {
167
+ *pDepth = depth;
168
+ *pColor = color;
169
+ }
170
+ __syncwarp(actMask);
171
+ }
172
+ }
173
+
174
+ //------------------------------------------------------------------------
175
+
176
+ __device__ __inline__ void fineRasterImpl(const CRParams p)
177
+ {
178
+ // for 20 warps:
179
+ __shared__ volatile U64 s_cover8x8_lut[CR_COVER8X8_LUT_SIZE]; // 6KB
180
+ __shared__ volatile U32 s_tileColor [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
181
+ __shared__ volatile U32 s_tileDepth [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
182
+ __shared__ volatile U32 s_tilePeel [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
183
+ __shared__ volatile U32 s_triDataIdx [CR_FINE_MAX_WARPS][64]; // 5KB CRTriangleData index
184
+ __shared__ volatile U64 s_triangleCov [CR_FINE_MAX_WARPS][64]; // 10KB coverage mask
185
+ __shared__ volatile U32 s_triangleFrag[CR_FINE_MAX_WARPS][64]; // 5KB fragment index
186
+ __shared__ volatile U32 s_temp [CR_FINE_MAX_WARPS][80]; // 6.25KB
187
+ // = 47.25KB total
188
+
189
+ CRAtomics& atomics = p.atomics[blockIdx.z];
190
+ const CRTriangleData* triData = (const CRTriangleData*)p.triData + blockIdx.z * p.maxSubtris;
191
+
192
+ const S32* activeTiles = (const S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z;
193
+ const S32* tileFirstSeg = (const S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
194
+
195
+ volatile U32* tileColor = s_tileColor[threadIdx.y];
196
+ volatile U32* tileDepth = s_tileDepth[threadIdx.y];
197
+ volatile U32* tilePeel = s_tilePeel[threadIdx.y];
198
+ volatile U32* triDataIdx = s_triDataIdx[threadIdx.y];
199
+ volatile U64* triangleCov = s_triangleCov[threadIdx.y];
200
+ volatile U32* triangleFrag = s_triangleFrag[threadIdx.y];
201
+ volatile U32* temp = s_temp[threadIdx.y];
202
+
203
+ if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs || atomics.numTileSegs > p.maxTileSegs)
204
+ return;
205
+
206
+ temp[threadIdx.x] = 0; // first 16 elements of temp are always zero
207
+ cover8x8_setupLUT(s_cover8x8_lut);
208
+ __syncthreads();
209
+
210
+ // loop over tiles
211
+ for (;;)
212
+ {
213
+ // pick a tile
214
+ if (threadIdx.x == 0)
215
+ temp[16] = atomicAdd(&atomics.fineCounter, 1);
216
+ __syncwarp();
217
+ int activeIdx = temp[16];
218
+ if (activeIdx >= atomics.numActiveTiles)
219
+ break;
220
+
221
+ int tileIdx = activeTiles[activeIdx];
222
+ S32 segment = tileFirstSeg[tileIdx];
223
+ int tileY = tileIdx / p.widthTiles;
224
+ int tileX = tileIdx - tileY * p.widthTiles;
225
+ int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1));
226
+ int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2);
227
+
228
+ // initialize per-tile state
229
+ int triRead = 0, triWrite = 0;
230
+ int fragRead = 0, fragWrite = 0;
231
+ if (threadIdx.x == 0)
232
+ triangleFrag[63] = 0; // "previous triangle"
233
+
234
+ // deferred clear => clear tile
235
+ if (p.deferredClear)
236
+ {
237
+ tileColor[threadIdx.x] = p.clearColor;
238
+ tileDepth[threadIdx.x] = p.clearDepth;
239
+ tileColor[threadIdx.x + 32] = p.clearColor;
240
+ tileDepth[threadIdx.x + 32] = p.clearDepth;
241
+ }
242
+ else // otherwise => read tile from framebuffer
243
+ {
244
+ U32* pColor = (U32*)p.colorBuffer + p.strideX * p.strideY * blockIdx.z;
245
+ U32* pDepth = (U32*)p.depthBuffer + p.strideX * p.strideY * blockIdx.z;
246
+ tileColor[threadIdx.x] = pColor[px + p.strideX * py];
247
+ tileDepth[threadIdx.x] = pDepth[px + p.strideX * py];
248
+ tileColor[threadIdx.x + 32] = pColor[px + p.strideX * (py + 4)];
249
+ tileDepth[threadIdx.x + 32] = pDepth[px + p.strideX * (py + 4)];
250
+ }
251
+
252
+ // read peeling inputs if enabled
253
+ if (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling)
254
+ {
255
+ U32* pPeel = (U32*)p.peelBuffer + p.strideX * p.strideY * blockIdx.z;
256
+ tilePeel[threadIdx.x] = pPeel[px + p.strideX * py];
257
+ tilePeel[threadIdx.x + 32] = pPeel[px + p.strideX * (py + 4)];
258
+ }
259
+
260
+ U32 tileZMax;
261
+ bool tileZUpd;
262
+ initTileZMax(tileZMax, tileZUpd, tileDepth);
263
+
264
+ // process fragments
265
+ for(;;)
266
+ {
267
+ // need to queue more fragments?
268
+ if (fragWrite - fragRead < 32 && segment >= 0)
269
+ {
270
+ // update tile z - coherent over warp
271
+ updateTileZMax(tileZMax, tileZUpd, tileDepth, temp);
272
+
273
+ // read triangles
274
+ do
275
+ {
276
+ // read triangle index and data, advance to next segment
277
+ S32 triIdx, dataIdx;
278
+ uint4 triHeader;
279
+ getTriangle(p, triIdx, dataIdx, triHeader, segment);
280
+
281
+ // early z cull
282
+ if (triIdx >= 0 && earlyZCull(triHeader, tileZMax))
283
+ triIdx = -1;
284
+
285
+ // determine coverage
286
+ U64 coverage = trianglePixelCoverage(p, triHeader, tileX, tileY, s_cover8x8_lut);
287
+ S32 pop = (triIdx == -1) ? 0 : __popcll(coverage);
288
+
289
+ // fragment count scan
290
+ U32 frag = scan32_value(pop, temp);
291
+ frag += fragWrite; // frag now holds cumulative fragment count
292
+ fragWrite += scan32_total(temp);
293
+
294
+ // queue non-empty triangles
295
+ U32 goodMask = __ballot_sync(~0u, pop != 0);
296
+ if (pop != 0)
297
+ {
298
+ int idx = (triWrite + __popc(goodMask & getLaneMaskLt())) & 63;
299
+ triDataIdx [idx] = dataIdx;
300
+ triangleFrag[idx] = frag;
301
+ triangleCov [idx] = coverage;
302
+ }
303
+ triWrite += __popc(goodMask);
304
+ }
305
+ while (fragWrite - fragRead < 32 && segment >= 0);
306
+ }
307
+ __syncwarp();
308
+
309
+ // end of segment?
310
+ if (fragRead == fragWrite)
311
+ break;
312
+
313
+ // clear triangle boundaries
314
+ temp[threadIdx.x + 16] = 0;
315
+ __syncwarp();
316
+
317
+ // tag triangle boundaries
318
+ if (triRead + threadIdx.x < triWrite)
319
+ {
320
+ int idx = triangleFrag[(triRead + threadIdx.x) & 63] - fragRead;
321
+ if (idx <= 32)
322
+ temp[idx + 16 - 1] = 1;
323
+ }
324
+ __syncwarp();
325
+
326
+ int ropLaneIdx = threadIdx.x;
327
+ U32 boundaryMask = __ballot_sync(~0u, temp[ropLaneIdx + 16]);
328
+
329
+ // distribute fragments
330
+ bool hasFragment = (ropLaneIdx < fragWrite - fragRead);
331
+ U32 fragmentMask = __ballot_sync(~0u, hasFragment);
332
+ if (hasFragment)
333
+ {
334
+ int triBufIdx = (triRead + __popc(boundaryMask & getLaneMaskLt())) & 63;
335
+ int fragIdx = add_sub(fragRead, ropLaneIdx, triangleFrag[(triBufIdx - 1) & 63]);
336
+ U64 coverage = triangleCov[triBufIdx];
337
+ int pixelInTile = findBit(coverage, fragIdx);
338
+ int dataIdx = triDataIdx[triBufIdx];
339
+
340
+ // determine pixel position
341
+ U32 pixelX = (tileX << CR_TILE_LOG2) + (pixelInTile & 7);
342
+ U32 pixelY = (tileY << CR_TILE_LOG2) + (pixelInTile >> 3);
343
+
344
+ // depth test
345
+ U32 depth = 0;
346
+ uint4 td = *((uint4*)triData + dataIdx * (sizeof(CRTriangleData) >> 4));
347
+
348
+ depth = td.x * pixelX + td.y * pixelY + td.z;
349
+ bool zkill = (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) && (depth <= tilePeel[pixelInTile]);
350
+ if (!zkill)
351
+ {
352
+ U32 oldDepth = tileDepth[pixelInTile];
353
+ if (depth > oldDepth)
354
+ zkill = true;
355
+ else if (oldDepth == tileZMax)
356
+ tileZUpd = true; // we are replacing previous zmax => need to update
357
+ }
358
+
359
+ U32 ropMask = __ballot_sync(fragmentMask, !zkill);
360
+ if (!zkill)
361
+ executeROP(td.w, depth, &tileColor[pixelInTile], &tileDepth[pixelInTile], ropMask);
362
+ }
363
+ // no need to sync, as next up is updateTileZMax that does internal warp sync
364
+
365
+ // update counters
366
+ fragRead = ::min(fragRead + 32, fragWrite);
367
+ triRead += __popc(boundaryMask);
368
+ }
369
+
370
+ // Write tile back to the framebuffer.
371
+ if (true)
372
+ {
373
+ int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1));
374
+ int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2);
375
+ U32* pColor = (U32*)p.colorBuffer + p.strideX * p.strideY * blockIdx.z;
376
+ U32* pDepth = (U32*)p.depthBuffer + p.strideX * p.strideY * blockIdx.z;
377
+ pColor[px + p.strideX * py] = tileColor[threadIdx.x];
378
+ pDepth[px + p.strideX * py] = tileDepth[threadIdx.x];
379
+ pColor[px + p.strideX * (py + 4)] = tileColor[threadIdx.x + 32];
380
+ pDepth[px + p.strideX * (py + 4)] = tileDepth[threadIdx.x + 32];
381
+ }
382
+ }
383
+ }
384
+
385
+ //------------------------------------------------------------------------
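
Because a tile is 8x8 pixels, one 64-bit word holds a full per-triangle coverage mask, and the fragment distribution above uses findBit(coverage, fragIdx) to turn the i-th set bit back into a pixel position inside the tile. A straightforward, non-optimized reference for that mapping (useful mainly for sanity-checking the branch-reduced findBit above) might look like this:

__device__ __inline__ int findBitReference(unsigned long long coverage, int idx)
{
    // Return the position of the idx-th set bit; pixelX = bit & 7, pixelY = bit >> 3.
    for (int bit = 0; bit < 64; bit++)
        if ((coverage >> bit) & 1ull)
            if (idx-- == 0)
                return bit;
    return -1;  // idx exceeds the number of covered samples
}
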
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_PrivateDefs.hpp ADDED
@@ -0,0 +1,153 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #pragma once
10
+ #include "Defs.hpp"
11
+ #include "Constants.hpp"
12
+
13
+ namespace CR
14
+ {
15
+ //------------------------------------------------------------------------
16
+ // Projected triangle.
17
+ //------------------------------------------------------------------------
18
+
19
+ struct CRTriangleHeader
20
+ {
21
+ S16 v0x; // Subpixels relative to viewport center. Valid if triSubtris = 1.
22
+ S16 v0y;
23
+ S16 v1x;
24
+ S16 v1y;
25
+ S16 v2x;
26
+ S16 v2y;
27
+
28
+ U32 misc; // triSubtris=1: (zmin:20, f01:4, f12:4, f20:4), triSubtris>=2: (subtriBase)
29
+ };
30
+
31
+ //------------------------------------------------------------------------
32
+
33
+ struct CRTriangleData
34
+ {
35
+ U32 zx; // zx * sampleX + zy * sampleY + zb = lerp(CR_DEPTH_MIN, CR_DEPTH_MAX, (clipZ / clipW + 1) / 2)
36
+ U32 zy;
37
+ U32 zb;
38
+ U32 id; // Triangle id.
39
+ };
40
+
41
+ //------------------------------------------------------------------------
42
+ // Device-side structures.
43
+ //------------------------------------------------------------------------
44
+
45
+ struct CRAtomics
46
+ {
47
+ // Setup.
48
+ S32 numSubtris; // = numTris
49
+
50
+ // Bin.
51
+ S32 binCounter; // = 0
52
+ S32 numBinSegs; // = 0
53
+
54
+ // Coarse.
55
+ S32 coarseCounter; // = 0
56
+ S32 numTileSegs; // = 0
57
+ S32 numActiveTiles; // = 0
58
+
59
+ // Fine.
60
+ S32 fineCounter; // = 0
61
+ };
62
+
63
+ //------------------------------------------------------------------------
64
+
65
+ struct CRImageParams
66
+ {
67
+ S32 triOffset; // First triangle index to draw.
68
+ S32 triCount; // Number of triangles to draw.
69
+ S32 binBatchSize; // Number of triangles per batch.
70
+ };
71
+
72
+ //------------------------------------------------------------------------
73
+
74
+ struct CRParams
75
+ {
76
+ // Common.
77
+
78
+ CRAtomics* atomics; // Work counters. Per-image.
79
+ S32 numImages; // Batch size.
80
+ S32 totalCount; // In range mode, total number of triangles to render.
81
+ S32 instanceMode; // 0 = range mode, 1 = instance mode.
82
+
83
+ S32 numVertices; // Number of vertices in input buffer, not counting multiples in instance mode.
84
+ S32 numTriangles; // Number of triangles in input buffer.
85
+ void* vertexBuffer; // numVertices * float4(x, y, z, w)
86
+ void* indexBuffer; // numTriangles * int3(vi0, vi1, vi2)
87
+
88
+ S32 widthPixels; // Render buffer size in pixels. Must be multiple of tile size (8x8).
89
+ S32 heightPixels;
90
+ S32 widthPixelsVp; // Viewport size in pixels.
91
+ S32 heightPixelsVp;
92
+ S32 widthBins; // widthPixels / CR_BIN_SIZE
93
+ S32 heightBins; // heightPixels / CR_BIN_SIZE
94
+ S32 numBins; // widthBins * heightBins
95
+
96
+ F32 xs; // Vertex position adjustments for tiled rendering.
97
+ F32 ys;
98
+ F32 xo;
99
+ F32 yo;
100
+
101
+ S32 widthTiles; // widthPixels / CR_TILE_SIZE
102
+ S32 heightTiles; // heightPixels / CR_TILE_SIZE
103
+ S32 numTiles; // widthTiles * heightTiles
104
+
105
+ U32 renderModeFlags;
106
+ S32 deferredClear; // 1 = Clear framebuffer before rendering triangles.
107
+ U32 clearColor;
108
+ U32 clearDepth;
109
+
110
+ // These are uniform across batch.
111
+
112
+ S32 maxSubtris;
113
+ S32 maxBinSegs;
114
+ S32 maxTileSegs;
115
+
116
+ // Setup output / bin input.
117
+
118
+ void* triSubtris; // maxSubtris * U8
119
+ void* triHeader; // maxSubtris * CRTriangleHeader
120
+ void* triData; // maxSubtris * CRTriangleData
121
+
122
+ // Bin output / coarse input.
123
+
124
+ void* binSegData; // maxBinSegs * CR_BIN_SEG_SIZE * S32
125
+ void* binSegNext; // maxBinSegs * S32
126
+ void* binSegCount; // maxBinSegs * S32
127
+ void* binFirstSeg; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 segIdx), -1 = none
128
+ void* binTotal; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 numTris)
129
+
130
+ // Coarse output / fine input.
131
+
132
+ void* tileSegData; // maxTileSegs * CR_TILE_SEG_SIZE * S32
133
+ void* tileSegNext; // maxTileSegs * S32
134
+ void* tileSegCount; // maxTileSegs * S32
135
+ void* activeTiles; // CR_MAXTILES_SQR * (S32 tileIdx)
136
+ void* tileFirstSeg; // CR_MAXTILES_SQR * (S32 segIdx), -1 = none
137
+
138
+ // Surface buffers. Outer tile offset is baked into pointers.
139
+
140
+ void* colorBuffer; // sizePixels.x * sizePixels.y * numImages * U32
141
+ void* depthBuffer; // sizePixels.x * sizePixels.y * numImages * U32
142
+ void* peelBuffer; // sizePixels.x * sizePixels.y * numImages * U32, only if peeling enabled.
143
+ S32 strideX; // horizontal size in pixels
144
+ S32 strideY; // vertical stride in pixels
145
+
146
+ // Per-image parameters for first images are embedded here to avoid extra memcpy for small batches.
147
+
148
+ CRImageParams imageParamsFirst[CR_EMBED_IMAGE_PARAMS];
149
+ const CRImageParams* imageParamsExtra; // After CR_EMBED_IMAGE_PARAMS.
150
+ };
151
+
152
+ //------------------------------------------------------------------------
153
+ }
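
The CRTriangleData comment above fixes the depth encoding: the per-triangle plane zx*x + zy*y + zb, evaluated at a sample position, must equal CR_DEPTH_MIN + (CR_DEPTH_MAX - CR_DEPTH_MIN) * (clipZ/clipW + 1) / 2. A small illustrative host-side helper for that scalar mapping (the actual per-triangle coefficients are fitted by the setup kernel, not by code like this):

static inline CR::U32 encodeDepth(float ndcZ)   // ndcZ = clipZ / clipW, in [-1, 1]
{
    double t = ((double)ndcZ + 1.0) * 0.5;      // [-1, 1] -> [0, 1]
    double d = (double)CR_DEPTH_MIN + t * ((double)CR_DEPTH_MAX - (double)CR_DEPTH_MIN);
    return (CR::U32)d;                          // fixed-point depth consumed by the fine-raster ROP
}
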
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_RasterImpl.cpp ADDED
@@ -0,0 +1,370 @@
1
+ // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
+ //
3
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ // and proprietary rights in and to this software, related documentation
5
+ // and any modifications thereto. Any use, reproduction, disclosure or
6
+ // distribution of this software and related documentation without an express
7
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ #include "../../framework.h"
10
+ #include "PrivateDefs.hpp"
11
+ #include "Constants.hpp"
12
+ #include "RasterImpl.hpp"
13
+ #include <cuda_runtime.h>
14
+
15
+ using namespace CR;
16
+ using std::min;
17
+ using std::max;
18
+
19
+ //------------------------------------------------------------------------
20
+ // Kernel prototypes and variables.
21
+
22
+ void triangleSetupKernel (const CRParams p);
23
+ void binRasterKernel (const CRParams p);
24
+ void coarseRasterKernel (const CRParams p);
25
+ void fineRasterKernel (const CRParams p);
26
+
27
+ //------------------------------------------------------------------------
28
+
29
+ RasterImpl::RasterImpl(void)
30
+ : m_renderModeFlags (0),
31
+ m_deferredClear (false),
32
+ m_clearColor (0),
33
+ m_vertexPtr (NULL),
34
+ m_indexPtr (NULL),
35
+ m_numVertices (0),
36
+ m_numTriangles (0),
37
+ m_bufferSizesReported (0),
38
+
39
+ m_numImages (0),
40
+ m_bufferSizePixels (0, 0),
41
+ m_bufferSizeVp (0, 0),
42
+ m_sizePixels (0, 0),
43
+ m_sizeVp (0, 0),
44
+ m_offsetPixels (0, 0),
45
+ m_sizeBins (0, 0),
46
+ m_numBins (0),
47
+ m_sizeTiles (0, 0),
48
+ m_numTiles (0),
49
+
50
+ m_numSMs (1),
51
+ m_numCoarseBlocksPerSM (1),
52
+ m_numFineBlocksPerSM (1),
53
+ m_numFineWarpsPerBlock (1),
54
+
55
+ m_maxSubtris (1),
56
+ m_maxBinSegs (1),
57
+ m_maxTileSegs (1)
58
+ {
59
+ // Query relevant device attributes.
60
+
61
+ int currentDevice = 0;
62
+ NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&currentDevice));
63
+ NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&m_numSMs, cudaDevAttrMultiProcessorCount, currentDevice));
64
+ cudaFuncAttributes attr;
65
+ NVDR_CHECK_CUDA_ERROR(cudaFuncGetAttributes(&attr, (void*)fineRasterKernel));
66
+ m_numFineWarpsPerBlock = min(attr.maxThreadsPerBlock / 32, CR_FINE_MAX_WARPS);
67
+ NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numCoarseBlocksPerSM, (void*)coarseRasterKernel, 32 * CR_COARSE_WARPS, 0));
68
+ NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numFineBlocksPerSM, (void*)fineRasterKernel, 32 * m_numFineWarpsPerBlock, 0));
69
+
70
+ // Setup functions.
71
+
72
+ NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)triangleSetupKernel, cudaFuncCachePreferShared));
73
+ NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)binRasterKernel, cudaFuncCachePreferShared));
74
+ NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)coarseRasterKernel, cudaFuncCachePreferShared));
75
+ NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)fineRasterKernel, cudaFuncCachePreferShared));
76
+ }
77
+
78
+ //------------------------------------------------------------------------
79
+
80
+ RasterImpl::~RasterImpl(void)
81
+ {
82
+ // Empty.
83
+ }
84
+
85
+ //------------------------------------------------------------------------
86
+
87
+ void RasterImpl::setBufferSize(Vec3i size)
88
+ {
89
+ // Internal buffer width and height must be divisible by tile size.
90
+ int w = (size.x + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
91
+ int h = (size.y + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
92
+
93
+ m_bufferSizePixels = Vec2i(w, h);
94
+ m_bufferSizeVp = Vec2i(size.x, size.y);
95
+ m_numImages = size.z;
96
+
97
+ m_colorBuffer.reset(w * h * size.z * sizeof(U32));
98
+ m_depthBuffer.reset(w * h * size.z * sizeof(U32));
99
+ }
100
+
101
+ //------------------------------------------------------------------------
102
+
103
+ void RasterImpl::setViewport(Vec2i size, Vec2i offset)
104
+ {
105
+ // Offset must be divisible by tile size.
106
+ NVDR_CHECK((offset.x & (CR_TILE_SIZE - 1)) == 0 && (offset.y & (CR_TILE_SIZE - 1)) == 0, "invalid viewport offset");
107
+
108
+ // Round internal viewport size to multiples of tile size.
109
+ int w = (size.x + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
110
+ int h = (size.y + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
111
+
112
+ m_sizePixels = Vec2i(w, h);
113
+ m_offsetPixels = offset;
114
+ m_sizeVp = Vec2i(size.x, size.y);
115
+ m_sizeTiles.x = m_sizePixels.x >> CR_TILE_LOG2;
116
+ m_sizeTiles.y = m_sizePixels.y >> CR_TILE_LOG2;
117
+ m_numTiles = m_sizeTiles.x * m_sizeTiles.y;
118
+ m_sizeBins.x = (m_sizeTiles.x + CR_BIN_SIZE - 1) >> CR_BIN_LOG2;
119
+ m_sizeBins.y = (m_sizeTiles.y + CR_BIN_SIZE - 1) >> CR_BIN_LOG2;
120
+ m_numBins = m_sizeBins.x * m_sizeBins.y;
121
+ }
122
+
123
+ void RasterImpl::swapDepthAndPeel(void)
124
+ {
125
+ m_peelBuffer.reset(m_depthBuffer.getSize()); // Ensure equal size and valid pointer.
126
+
127
+ void* tmp = m_depthBuffer.getPtr();
128
+ m_depthBuffer.setPtr(m_peelBuffer.getPtr());
129
+ m_peelBuffer.setPtr(tmp);
130
+ }
131
+
132
+ //------------------------------------------------------------------------
133
+
134
+ bool RasterImpl::drawTriangles(const Vec2i* ranges, bool peel, cudaStream_t stream)
135
+ {
136
+ bool instanceMode = (!ranges);
137
+
138
+ int maxSubtrisSlack = 4096; // x 81B = 324KB
139
+ int maxBinSegsSlack = 256; // x 2137B = 534KB
140
+ int maxTileSegsSlack = 4096; // x 136B = 544KB
+
+     // Resize atomics as needed.
+     m_crAtomics    .grow(m_numImages * sizeof(CRAtomics));
+     m_crAtomicsHost.grow(m_numImages * sizeof(CRAtomics));
+
+     // Size of these buffers doesn't depend on input.
+     m_binFirstSeg .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
+     m_binTotal    .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
+     m_activeTiles .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));
+     m_tileFirstSeg.grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));
+
+     // Construct per-image parameters and determine worst-case buffer sizes.
+     m_crImageParamsHost.grow(m_numImages * sizeof(CRImageParams));
+     CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr();
+     for (int i=0; i < m_numImages; i++)
+     {
+         CRImageParams& ip = imageParams[i];
+
+         int roundSize  = CR_BIN_WARPS * 32;
+         int minBatches = CR_BIN_STREAMS_SIZE * 2;
+         int maxRounds  = 32;
+
+         ip.triOffset = instanceMode ? 0 : ranges[i].x;
+         ip.triCount  = instanceMode ? m_numTriangles : ranges[i].y;
+         ip.binBatchSize = min(max(ip.triCount / (roundSize * minBatches), 1), maxRounds) * roundSize;
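+         // Batch size for the bin rasterizer: a whole number of rounds
+         // (CR_BIN_WARPS * 32 triangles each), chosen so the triangles split into
+         // roughly two batches per bin stream, capped at maxRounds rounds per batch.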
+
+         m_maxSubtris  = max(m_maxSubtris,  min(ip.triCount + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE));
+         m_maxBinSegs  = max(m_maxBinSegs,  max(m_numBins * CR_BIN_STREAMS_SIZE, (ip.triCount - 1) / CR_BIN_SEG_SIZE + 1) + maxBinSegsSlack);
+         m_maxTileSegs = max(m_maxTileSegs, max(m_numTiles, (ip.triCount - 1) / CR_TILE_SEG_SIZE + 1) + maxTileSegsSlack);
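+         // Worst-case estimates: enough segments to hold every triangle at
+         // CR_BIN_SEG_SIZE / CR_TILE_SEG_SIZE entries per segment, but never fewer
+         // than one segment per bin per stream (or one per tile), plus the slack above.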
+     }
+
+     // Retry until successful.
+
+     for (;;)
+     {
+         // Allocate buffers.
+         m_triSubtris.reset(m_numImages * m_maxSubtris * sizeof(U8));
+         m_triHeader .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleHeader));
+         m_triData   .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleData));
+
+         m_binSegData .reset(m_numImages * m_maxBinSegs * CR_BIN_SEG_SIZE * sizeof(S32));
+         m_binSegNext .reset(m_numImages * m_maxBinSegs * sizeof(S32));
+         m_binSegCount.reset(m_numImages * m_maxBinSegs * sizeof(S32));
+
+         m_tileSegData .reset(m_numImages * m_maxTileSegs * CR_TILE_SEG_SIZE * sizeof(S32));
+         m_tileSegNext .reset(m_numImages * m_maxTileSegs * sizeof(S32));
+         m_tileSegCount.reset(m_numImages * m_maxTileSegs * sizeof(S32));
+
+         // Report if buffers grow from last time.
+         size_t sizesTotal = getTotalBufferSizes();
+         if (sizesTotal > m_bufferSizesReported)
+         {
+             size_t sizesMB = ((sizesTotal - 1) >> 20) + 1; // Round up.
+             sizesMB = ((sizesMB + 9) / 10) * 10; // 10MB granularity enough in this day and age.
+             LOG(INFO) << "Internal buffers grown to " << sizesMB << " MB";
+             m_bufferSizesReported = sizesMB << 20;
+         }
+
+         // Launch stages. Blocks until everything is done.
+         launchStages(instanceMode, peel, stream);
+
+         // Peeling iteration cannot fail, so no point checking things further.
+         if (peel)
+             break;
+
+         // Atomics after coarse stage are now available.
+         CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr();
+
+         // Success?
+         bool failed = false;
+         for (int i=0; i < m_numImages; i++)
+         {
+             const CRAtomics& a = atomics[i];
+             failed = failed || (a.numSubtris > m_maxSubtris) || (a.numBinSegs > m_maxBinSegs) || (a.numTileSegs > m_maxTileSegs);
+         }
+         if (!failed)
+             break; // Success!
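+         // An overflow means a stage ran out of segment space and the frame is
+         // incomplete, so we enlarge the buffers below and render the frame again.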
+
+         // If we were already at maximum capacity, no can do.
+         if (m_maxSubtris == CR_MAXSUBTRIS_SIZE)
+             return false;
+
+         // Enlarge buffers and try again.
+         for (int i=0; i < m_numImages; i++)
+         {
+             const CRAtomics& a = atomics[i];
+             m_maxSubtris  = max(m_maxSubtris,  min(a.numSubtris + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE));
+             m_maxBinSegs  = max(m_maxBinSegs,  a.numBinSegs + maxBinSegsSlack);
+             m_maxTileSegs = max(m_maxTileSegs, a.numTileSegs + maxTileSegsSlack);
+         }
+     }
+
+     m_deferredClear = false;
+     return true; // Success.
+ }
+
+ //------------------------------------------------------------------------
+
+ size_t RasterImpl::getTotalBufferSizes(void) const
+ {
+     return
+         m_colorBuffer.getSize() + m_depthBuffer.getSize() + // Don't include atomics and image params.
+         m_triSubtris.getSize() + m_triHeader.getSize() + m_triData.getSize() +
+         m_binFirstSeg.getSize() + m_binTotal.getSize() + m_binSegData.getSize() + m_binSegNext.getSize() + m_binSegCount.getSize() +
+         m_activeTiles.getSize() + m_tileFirstSeg.getSize() + m_tileSegData.getSize() + m_tileSegNext.getSize() + m_tileSegCount.getSize();
+ }
+
+ //------------------------------------------------------------------------
+
+ void RasterImpl::launchStages(bool instanceMode, bool peel, cudaStream_t stream)
+ {
+     CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr();
+
+     // Unless peeling, initialize atomics to mostly zero.
+     CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr();
+     if (!peel)
+     {
+         memset(atomics, 0, m_numImages * sizeof(CRAtomics));
+         for (int i=0; i < m_numImages; i++)
+             atomics[i].numSubtris = imageParams[i].triCount;
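+         // Each input triangle counts as one subtriangle until the setup stage splits
+         // it further (e.g. when clipping), so seed the counter with the plain triangle count.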
+     }
+
+     // Copy to device. If peeling, this is the state after coarse raster launch on first iteration.
+     NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomics.getPtr(), atomics, m_numImages * sizeof(CRAtomics), cudaMemcpyHostToDevice, stream));
+
+     // Copy per-image parameters if there are more than fit in the launch parameter block and we haven't done it already.
+     if (!peel && m_numImages > CR_EMBED_IMAGE_PARAMS)
+     {
+         int numImageParamsExtra = m_numImages - CR_EMBED_IMAGE_PARAMS;
+         m_crImageParamsExtra.grow(numImageParamsExtra * sizeof(CRImageParams));
+         NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crImageParamsExtra.getPtr(), imageParams + CR_EMBED_IMAGE_PARAMS, numImageParamsExtra * sizeof(CRImageParams), cudaMemcpyHostToDevice, stream));
+     }
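+     // The first CR_EMBED_IMAGE_PARAMS per-image structs travel inside the kernel
+     // parameter block itself; any remainder is read from this separate device buffer.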
+
+     // Set global parameters.
+     CRParams p;
+     {
+         p.atomics = (CRAtomics*)m_crAtomics.getPtr();
+         p.numImages = m_numImages;
+         p.totalCount = 0; // Only relevant in range mode.
+         p.instanceMode = instanceMode ? 1 : 0;
+
+         p.numVertices = m_numVertices;
+         p.numTriangles = m_numTriangles;
+         p.vertexBuffer = m_vertexPtr;
+         p.indexBuffer = m_indexPtr;
+
+         p.widthPixels = m_sizePixels.x;
+         p.heightPixels = m_sizePixels.y;
+         p.widthPixelsVp = m_sizeVp.x;
+         p.heightPixelsVp = m_sizeVp.y;
+         p.widthBins = m_sizeBins.x;
+         p.heightBins = m_sizeBins.y;
+         p.numBins = m_numBins;
+
+         p.xs = (float)m_bufferSizeVp.x / (float)m_sizeVp.x;
+         p.ys = (float)m_bufferSizeVp.y / (float)m_sizeVp.y;
+         p.xo = (float)(m_bufferSizeVp.x - m_sizeVp.x - 2 * m_offsetPixels.x) / (float)m_sizeVp.x;
+         p.yo = (float)(m_bufferSizeVp.y - m_sizeVp.y - 2 * m_offsetPixels.y) / (float)m_sizeVp.y;
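+         // xs/ys/xo/yo appear to remap clip-space x/y from the requested viewport into
+         // the larger internal buffer, compensating for the tile-rounded buffer size and
+         // the pixel offset set in setViewport().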
+
+         p.widthTiles = m_sizeTiles.x;
+         p.heightTiles = m_sizeTiles.y;
+         p.numTiles = m_numTiles;
+
+         p.renderModeFlags = m_renderModeFlags;
+         p.deferredClear = m_deferredClear ? 1 : 0;
+         p.clearColor = m_clearColor;
+         p.clearDepth = CR_DEPTH_MAX;
+
+         p.maxSubtris = m_maxSubtris;
+         p.maxBinSegs = m_maxBinSegs;
+         p.maxTileSegs = m_maxTileSegs;
+
+         p.triSubtris = m_triSubtris.getPtr();
+         p.triHeader = m_triHeader.getPtr();
+         p.triData = m_triData.getPtr();
+         p.binSegData = m_binSegData.getPtr();
+         p.binSegNext = m_binSegNext.getPtr();
+         p.binSegCount = m_binSegCount.getPtr();
+         p.binFirstSeg = m_binFirstSeg.getPtr();
+         p.binTotal = m_binTotal.getPtr();
+         p.tileSegData = m_tileSegData.getPtr();
+         p.tileSegNext = m_tileSegNext.getPtr();
+         p.tileSegCount = m_tileSegCount.getPtr();
+         p.activeTiles = m_activeTiles.getPtr();
+         p.tileFirstSeg = m_tileFirstSeg.getPtr();
+
+         p.strideX = m_bufferSizePixels.x;
+         p.strideY = m_bufferSizePixels.y;
+
+         size_t byteOffset = ((size_t)m_offsetPixels.x + (size_t)m_offsetPixels.y * (size_t)p.strideX) * sizeof(U32);
+         p.colorBuffer = m_colorBuffer.getPtr(byteOffset);
+         p.depthBuffer = m_depthBuffer.getPtr(byteOffset);
+         p.peelBuffer = (m_renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) ? m_peelBuffer.getPtr(byteOffset) : 0;
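+         // byteOffset points the color/depth/peel pointers at the viewport origin inside
+         // the full internal buffer; strideX/strideY are the internal buffer dimensions
+         // in pixels and therefore have to be assigned before the offset is computed.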
+
+         memcpy(&p.imageParamsFirst, imageParams, min(m_numImages, CR_EMBED_IMAGE_PARAMS) * sizeof(CRImageParams));
+         p.imageParamsExtra = (CRImageParams*)m_crImageParamsExtra.getPtr();
+     }
+
+     // Setup block sizes.
+     dim3 brBlock(32, CR_BIN_WARPS);
+     dim3 crBlock(32, CR_COARSE_WARPS);
+     dim3 frBlock(32, m_numFineWarpsPerBlock);
+     void* args[] = {&p};
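+     // Blocks are 32 threads wide (one warp per block row). For most launches grid.z
+     // selects the image; the range-mode setup launch below instead covers all images
+     // with a single grid.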
+
+     // Launch stages from setup to coarse and copy atomics to host only if this is not a single-tile peeling iteration.
+     if (!peel)
+     {
+         if (instanceMode)
+         {
+             int setupBlocks = (m_numTriangles - 1) / (32 * CR_SETUP_WARPS) + 1;
+             NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, m_numImages), dim3(32, CR_SETUP_WARPS), args, 0, stream));
+         }
+         else
+         {
+             for (int i=0; i < m_numImages; i++)
+                 p.totalCount += imageParams[i].triCount;
+             int setupBlocks = (p.totalCount - 1) / (32 * CR_SETUP_WARPS) + 1;
+             NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, 1), dim3(32, CR_SETUP_WARPS), args, 0, stream));
+         }
+         NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)binRasterKernel, dim3(CR_BIN_STREAMS_SIZE, 1, m_numImages), brBlock, args, 0, stream));
+         NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)coarseRasterKernel, dim3(m_numSMs * m_numCoarseBlocksPerSM, 1, m_numImages), crBlock, args, 0, stream));
+         NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomicsHost.getPtr(), m_crAtomics.getPtr(), sizeof(CRAtomics) * m_numImages, cudaMemcpyDeviceToHost, stream));
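+         // Queued on the same stream, so once cudaStreamSynchronize() below returns,
+         // the host-side atomics are valid for the overflow check in drawTriangles().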
+     }
+
+     // Fine rasterizer is launched always.
+     NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)fineRasterKernel, dim3(m_numSMs * m_numFineBlocksPerSM, 1, m_numImages), frBlock, args, 0, stream));
+     NVDR_CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
+ }
+
+ //------------------------------------------------------------------------