Upload 174 files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.
- .gitattributes +5 -0
- 3d.mp4 +3 -0
- README.md +11 -0
- app.py +2 -0
- assets/example_image/assets_example_image_image - 2024-12-08T120910.945.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-08T133209.680.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-08T133232.481.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-08T133327.828.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-08T133551.674.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-08T133554.085.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-08T133942.986.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-08T133945.143.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-08T134251.217.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-08T134253.975.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-08T134602.793.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-08T134606.919.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-09T050638.566.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-09T102148.803.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-09T124050.873.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-09T125348.492.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-09T125709.810.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-09T125745.419.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-09T131128.626.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-09T174905.915.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-09T184202.582.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-09T184251.254.webp +3 -0
- assets/example_image/assets_example_image_image - 2024-12-09T184336.200.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-09T184407.431.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-09T184511.907.webp +3 -0
- assets/example_image/assets_example_image_image - 2024-12-09T184535.205.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-09T184804.224.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-10T033838.708.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-10T034054.527.webp +0 -0
- assets/example_image/assets_example_image_image - 2024-12-10T034505.337.webp +0 -0
- extensions/extensions_nvdiffrast_LICENSE.txt +97 -0
- extensions/extensions_nvdiffrast_README.md +42 -0
- extensions/extensions_nvdiffrast_run_sample.sh +52 -0
- extensions/extensions_nvdiffrast_setup copy.py +51 -0
- extensions/extensions_nvdiffrast_setup.py +82 -0
- extensions/nvdiffrast/common/cudaraster/extensions_nvdiffrast_nvdiffrast_common_cudaraster_CudaRaster.hpp +63 -0
- extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_BinRaster.inl +423 -0
- extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.cpp +94 -0
- extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.hpp +55 -0
- extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CoarseRaster.inl +730 -0
- extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Constants.hpp +73 -0
- extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CudaRaster.cpp +79 -0
- extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Defs.hpp +90 -0
- extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_FineRaster.inl +385 -0
- extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_PrivateDefs.hpp +153 -0
- extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_RasterImpl.cpp +370 -0
.gitattributes
ADDED
@@ -0,0 +1,5 @@
+3d.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/example_image/assets_example_image_image[[:space:]]-[[:space:]]2024-12-09T184251.254.webp filter=lfs diff=lfs merge=lfs -text
+assets/example_image/assets_example_image_image[[:space:]]-[[:space:]]2024-12-09T184511.907.webp filter=lfs diff=lfs merge=lfs -text
+wheels/nvdiffrast-0.3.3-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
+wheels/wheels_diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
3d.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c3282465210bac76f44b605956139679ed774c8bad9be686707d1b770961371
+size 21309978
README.md
ADDED
@@ -0,0 +1,11 @@
+---
+title: SORA 3D
+emoji: 🏢🏆
+colorFrom: indigo
+colorTo: blue
+sdk: gradio
+sdk_version: 4.44.1
+app_file: app.py
+pinned: false
+short_description: Create top-quality 3D(.GLB) models from text or images
+---
app.py
ADDED
@@ -0,0 +1,2 @@
+import os
+exec(os.environ.get('APP'))
assets/example_image/assets_example_image_image - 2024-12-08T120910.945.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-08T133209.680.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-08T133232.481.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-08T133327.828.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-08T133551.674.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-08T133554.085.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-08T133942.986.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-08T133945.143.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-08T134251.217.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-08T134253.975.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-08T134602.793.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-08T134606.919.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-09T050638.566.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-09T102148.803.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-09T124050.873.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-09T125348.492.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-09T125709.810.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-09T125745.419.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-09T131128.626.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-09T174905.915.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-09T184202.582.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-09T184251.254.webp  ADDED (binary image, Git LFS)
assets/example_image/assets_example_image_image - 2024-12-09T184336.200.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-09T184407.431.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-09T184511.907.webp  ADDED (binary image, Git LFS)
assets/example_image/assets_example_image_image - 2024-12-09T184535.205.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-09T184804.224.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-10T033838.708.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-10T034054.527.webp  ADDED (binary image)
assets/example_image/assets_example_image_image - 2024-12-10T034505.337.webp  ADDED (binary image)
extensions/extensions_nvdiffrast_LICENSE.txt
ADDED
@@ -0,0 +1,97 @@
+Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
+
+
+Nvidia Source Code License (1-Way Commercial)
+
+=======================================================================
+
+1. Definitions
+
+"Licensor" means any person or entity that distributes its Work.
+
+"Software" means the original work of authorship made available under
+this License.
+
+"Work" means the Software and any additions to or derivative works of
+the Software that are made available under this License.
+
+The terms "reproduce," "reproduction," "derivative works," and
+"distribution" have the meaning as provided under U.S. copyright law;
+provided, however, that for the purposes of this License, derivative
+works shall not include works that remain separable from, or merely
+link (or bind by name) to the interfaces of, the Work.
+
+Works, including the Software, are "made available" under this License
+by including in or with the Work either (a) a copyright notice
+referencing the applicability of this License to the Work, or (b) a
+copy of this License.
+
+2. License Grants
+
+2.1 Copyright Grant. Subject to the terms and conditions of this
+License, each Licensor grants to you a perpetual, worldwide,
+non-exclusive, royalty-free, copyright license to reproduce,
+prepare derivative works of, publicly display, publicly perform,
+sublicense and distribute its Work and any resulting derivative
+works in any form.
+
+3. Limitations
+
+3.1 Redistribution. You may reproduce or distribute the Work only
+if (a) you do so under this License, (b) you include a complete
+copy of this License with your distribution, and (c) you retain
+without modification any copyright, patent, trademark, or
+attribution notices that are present in the Work.
+
+3.2 Derivative Works. You may specify that additional or different
+terms apply to the use, reproduction, and distribution of your
+derivative works of the Work ("Your Terms") only if (a) Your Terms
+provide that the use limitation in Section 3.3 applies to your
+derivative works, and (b) you identify the specific derivative
+works that are subject to Your Terms. Notwithstanding Your Terms,
+this License (including the redistribution requirements in Section
+3.1) will continue to apply to the Work itself.
+
+3.3 Use Limitation. The Work and any derivative works thereof only
+may be used or intended for use non-commercially. The Work or
+derivative works thereof may be used or intended for use by Nvidia
+or its affiliates commercially or non-commercially. As used herein,
+"non-commercially" means for research or evaluation purposes only
+and not for any direct or indirect monetary gain.
+
+3.4 Patent Claims. If you bring or threaten to bring a patent claim
+against any Licensor (including any claim, cross-claim or
+counterclaim in a lawsuit) to enforce any patents that you allege
+are infringed by any Work, then your rights under this License from
+such Licensor (including the grant in Section 2.1) will terminate
+immediately.
+
+3.5 Trademarks. This License does not grant any rights to use any
+Licensor's or its affiliates' names, logos, or trademarks, except
+as necessary to reproduce the notices described in this License.
+
+3.6 Termination. If you violate any term of this License, then your
+rights under this License (including the grant in Section 2.1) will
+terminate immediately.
+
+4. Disclaimer of Warranty.
+
+THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
+NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
+THIS LICENSE.
+
+5. Limitation of Liability.
+
+EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
+THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
+SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
+INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
+OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
+(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
+LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
+COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
+THE POSSIBILITY OF SUCH DAMAGES.
+
+=======================================================================
extensions/extensions_nvdiffrast_README.md
ADDED
@@ -0,0 +1,42 @@
+## Nvdiffrast – Modular Primitives for High-Performance Differentiable Rendering
+
+![Teaser image]()
+
+**Modular Primitives for High-Performance Differentiable Rendering**<br>
+Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila<br>
+[http://arxiv.org/abs/2011.03277](http://arxiv.org/abs/2011.03277)
+
+Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering.
+Please refer to ☞☞ [nvdiffrast documentation](https://nvlabs.github.io/nvdiffrast) ☜☜ for more information.
+
+## Licenses
+
+Copyright © 2020–2024, NVIDIA Corporation. All rights reserved.
+
+This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt).
+
+For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/)
+
+We do not currently accept outside code contributions in the form of pull requests.
+
+Environment map stored as part of `samples/data/envphong.npz` is derived from a Wave Engine
+[sample material](https://github.com/WaveEngine/Samples-2.5/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap)
+originally shared under
+[MIT License](https://github.com/WaveEngine/Samples-2.5/blob/master/LICENSE.md).
+Mesh and texture stored as part of `samples/data/earth.npz` are derived from
+[3D Earth Photorealistic 2K](https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125)
+model originally made available under
+[TurboSquid 3D Model License](https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license).
+
+## Citation
+
+```
+@article{Laine2020diffrast,
+  title   = {Modular Primitives for High-Performance Differentiable Rendering},
+  author  = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila},
+  journal = {ACM Transactions on Graphics},
+  year    = {2020},
+  volume  = {39},
+  number  = {6}
+}
+```
extensions/extensions_nvdiffrast_run_sample.sh
ADDED
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+function print_help {
+    echo "Usage: `basename $0` [--build-container] <python_file>"
+    echo ""
+    echo "Option --build-container will build the Docker container based on"
+    echo "docker/Dockerfile and tag the image with gltorch:latest."
+    echo ""
+    echo "Example: `basename $0` samples/torch/envphong.py"
+}
+
+build_container=0
+sample=""
+while [[ "$#" -gt 0 ]]; do
+    case $1 in
+        --build-container) build_container=1;;
+        -h|--help) print_help; exit 0 ;;
+        --*) echo "Unknown parameter passed: $1"; exit 1 ;;
+        *) sample="$1"; shift; break;
+    esac
+    shift
+done
+
+rest=$@
+
+# Build the docker container
+if [ "$build_container" = "1" ]; then
+    docker build --tag gltorch:latest -f docker/Dockerfile .
+fi
+
+if [ ! -f "$sample" ]; then
+    echo
+    echo "No python sample given or file '$sample' not found. Exiting."
+    exit 1
+fi
+
+image="gltorch:latest"
+
+echo "Using container image: $image"
+echo "Running command: $sample $rest"
+
+# Run a sample with docker
+docker run --rm -it --gpus all --user $(id -u):$(id -g) \
+    -v `pwd`:/app --workdir /app -e TORCH_EXTENSIONS_DIR=/app/tmp $image python3 $sample $rest
extensions/extensions_nvdiffrast_setup copy.py
ADDED
@@ -0,0 +1,51 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import nvdiffrast
+import setuptools
+import os
+
+with open("README.md", "r") as fh:
+    long_description = fh.read()
+
+setuptools.setup(
+    name="nvdiffrast",
+    version=nvdiffrast.__version__,
+    author="Samuli Laine",
+    author_email="slaine@nvidia.com",
+    description="nvdiffrast - modular primitives for high-performance differentiable rendering",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/NVlabs/nvdiffrast",
+    packages=setuptools.find_packages(),
+    package_data={
+        'nvdiffrast': [
+            'common/*.h',
+            'common/*.inl',
+            'common/*.cu',
+            'common/*.cpp',
+            'common/cudaraster/*.hpp',
+            'common/cudaraster/impl/*.cpp',
+            'common/cudaraster/impl/*.hpp',
+            'common/cudaraster/impl/*.inl',
+            'common/cudaraster/impl/*.cu',
+            'lib/*.h',
+            'torch/*.h',
+            'torch/*.inl',
+            'torch/*.cpp',
+            'tensorflow/*.cu',
+        ] + (['lib/*.lib'] if os.name == 'nt' else [])
+    },
+    include_package_data=True,
+    install_requires=['numpy'],  # note: can't require torch here as it will install torch even for a TensorFlow container
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "Operating System :: OS Independent",
+    ],
+    python_requires='>=3.6',
+)
extensions/extensions_nvdiffrast_setup.py
ADDED
@@ -0,0 +1,82 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import nvdiffrast
+import setuptools
+import os
+from torch.utils.cpp_extension import CUDAExtension, BuildExtension
+
+
+with open("README.md", "r") as fh:
+    long_description = fh.read()
+
+setuptools.setup(
+    name="nvdiffrast",
+    version=nvdiffrast.__version__,
+    author="Samuli Laine",
+    author_email="slaine@nvidia.com",
+    description="nvdiffrast - modular primitives for high-performance differentiable rendering",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/NVlabs/nvdiffrast",
+    packages=setuptools.find_packages(),
+    # package_data={
+    #     'nvdiffrast': [
+    #         'common/*.h',
+    #         'common/*.inl',
+    #         'common/*.cu',
+    #         'common/*.cpp',
+    #         'common/cudaraster/*.hpp',
+    #         'common/cudaraster/impl/*.cpp',
+    #         'common/cudaraster/impl/*.hpp',
+    #         'common/cudaraster/impl/*.inl',
+    #         'common/cudaraster/impl/*.cu',
+    #         'lib/*.h',
+    #         'torch/*.h',
+    #         'torch/*.inl',
+    #         'torch/*.cpp',
+    #         'tensorflow/*.cu',
+    #     ] + (['lib/*.lib'] if os.name == 'nt' else [])
+    # },
+    # include_package_data=True,
+    ext_modules=[
+        CUDAExtension(
+            name="nvdiffrast.torch._C",
+            sources=[
+                'nvdiffrast/common/cudaraster/impl/Buffer.cpp',
+                'nvdiffrast/common/cudaraster/impl/CudaRaster.cpp',
+                'nvdiffrast/common/cudaraster/impl/RasterImpl_.cu',
+                'nvdiffrast/common/cudaraster/impl/RasterImpl.cpp',
+                'nvdiffrast/common/common.cpp',
+                'nvdiffrast/common/rasterize.cu',
+                'nvdiffrast/common/interpolate.cu',
+                'nvdiffrast/common/texture_.cu',
+                'nvdiffrast/common/texture.cpp',
+                'nvdiffrast/common/antialias.cu',
+                'nvdiffrast/torch/torch_bindings.cpp',
+                'nvdiffrast/torch/torch_rasterize.cpp',
+                'nvdiffrast/torch/torch_interpolate.cpp',
+                'nvdiffrast/torch/torch_texture.cpp',
+                'nvdiffrast/torch/torch_antialias.cpp',
+            ],
+            extra_compile_args={
+                'cxx': ['-DNVDR_TORCH'],
+                'nvcc': ['-DNVDR_TORCH', '-lineinfo'],
+            },
+        )
+    ],
+    cmdclass={
+        'build_ext': BuildExtension
+    },
+    install_requires=['numpy'],  # note: can't require torch here as it will install torch even for a TensorFlow container
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "Operating System :: OS Independent",
+    ],
+    python_requires='>=3.6',
+)
extensions/nvdiffrast/common/cudaraster/extensions_nvdiffrast_nvdiffrast_common_cudaraster_CudaRaster.hpp
ADDED
@@ -0,0 +1,63 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+
+//------------------------------------------------------------------------
+// This is a slimmed-down and modernized version of the original
+// CudaRaster codebase that accompanied the HPG 2011 paper
+// "High-Performance Software Rasterization on GPUs" by Laine and Karras.
+// Modifications have been made to accommodate post-Volta execution model
+// with warp divergence. Support for shading, blending, quad rendering,
+// and supersampling have been removed as unnecessary for nvdiffrast.
+//------------------------------------------------------------------------
+
+namespace CR
+{
+
+class RasterImpl;
+
+//------------------------------------------------------------------------
+// Interface class to isolate user from implementation details.
+//------------------------------------------------------------------------
+
+class CudaRaster
+{
+public:
+    enum
+    {
+        RenderModeFlag_EnableBackfaceCulling = 1 << 0,  // Enable backface culling.
+        RenderModeFlag_EnableDepthPeeling    = 1 << 1,  // Enable depth peeling. Must have a peel buffer set.
+    };
+
+public:
+                CudaRaster          (void);
+                ~CudaRaster         (void);
+
+    void        setBufferSize       (int width, int height, int numImages);             // Width and height are internally rounded up to multiples of tile size (8x8) for buffer sizes.
+    void        setViewport         (int width, int height, int offsetX, int offsetY);  // Tiled rendering viewport setup.
+    void        setRenderModeFlags  (unsigned int renderModeFlags);                     // Affects all subsequent calls to drawTriangles(). Defaults to zero.
+    void        deferredClear       (unsigned int clearColor);                          // Clears color and depth buffers during next call to drawTriangles().
+    void        setVertexBuffer     (void* vertices, int numVertices);                  // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w).
+    void        setIndexBuffer      (void* indices, int numTriangles);                  // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
+    bool        drawTriangles       (const int* ranges, bool peel, cudaStream_t stream); // Ranges (offsets and counts) as #triangles entries, not as bytes. If NULL, draw all triangles. Returns false in case of internal overflow.
+    void*       getColorBuffer      (void);                                             // GPU pointer managed by CudaRaster.
+    void*       getDepthBuffer      (void);                                             // GPU pointer managed by CudaRaster.
+    void        swapDepthAndPeel    (void);                                             // Swap depth and peeling buffers.
+
+private:
+                CudaRaster          (const CudaRaster&); // forbidden
+    CudaRaster& operator=           (const CudaRaster&); // forbidden
+
+private:
+    RasterImpl* m_impl;             // Opaque pointer to implementation.
+};
+
+//------------------------------------------------------------------------
+} // namespace CR
+
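The header above documents its call contract only in comments, and no host-side example ships in this diff. The following is a minimal, hypothetical usage sketch assembled from those comments alone; the pointer types, buffer sizes, and the retry-on-overflow remark are assumptions for illustration, not code from the upload.

```cpp
// Hypothetical host-side driver for CR::CudaRaster, inferred from the
// header comments above. Not part of the uploaded files.
#include <cuda_runtime.h>
#include "CudaRaster.hpp"

void rasterizeOnce(float4* d_vertices, int numVertices,   // clip-space positions (x, y, z, w)
                   uint4*  d_indices,  int numTriangles,  // (idx0, idx1, idx2, color) per triangle
                   int width, int height, cudaStream_t stream)
{
    CR::CudaRaster cr;
    cr.setBufferSize(width, height, /*numImages=*/1);  // rounded up internally to 8x8 tiles
    cr.setViewport(width, height, /*offsetX=*/0, /*offsetY=*/0);
    cr.deferredClear(0u);                              // clear color + depth on the next draw
    cr.setVertexBuffer(d_vertices, numVertices);       // GPU pointer stays owned by the caller
    cr.setIndexBuffer(d_indices, numTriangles);
    if (!cr.drawTriangles(/*ranges=*/nullptr, /*peel=*/false, stream))
    {
        // false signals internal overflow; a caller would presumably
        // enlarge buffers and retry (that logic lives in RasterImpl).
    }
    void* d_color = cr.getColorBuffer();               // GPU pointers owned by CudaRaster
    void* d_depth = cr.getDepthBuffer();
    (void)d_color; (void)d_depth;
}
```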
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_BinRaster.inl
ADDED
@@ -0,0 +1,423 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+//------------------------------------------------------------------------
+
+__device__ __inline__ void binRasterImpl(const CRParams p)
+{
+    __shared__ volatile U32 s_broadcast [CR_BIN_WARPS + 16];
+    __shared__ volatile S32 s_outOfs    [CR_MAXBINS_SQR];
+    __shared__ volatile S32 s_outTotal  [CR_MAXBINS_SQR];
+    __shared__ volatile S32 s_overIndex [CR_MAXBINS_SQR];
+    __shared__ volatile S32 s_outMask   [CR_BIN_WARPS][CR_MAXBINS_SQR + 1];  // +1 to avoid bank collisions
+    __shared__ volatile S32 s_outCount  [CR_BIN_WARPS][CR_MAXBINS_SQR + 1];  // +1 to avoid bank collisions
+    __shared__ volatile S32 s_triBuf    [CR_BIN_WARPS*32*4];                 // triangle ring buffer
+    __shared__ volatile U32 s_batchPos;
+    __shared__ volatile U32 s_bufCount;
+    __shared__ volatile U32 s_overTotal;
+    __shared__ volatile U32 s_allocBase;
+
+    const CRImageParams&    ip         = getImageParams(p, blockIdx.z);
+    CRAtomics&              atomics    = p.atomics[blockIdx.z];
+    const U8*               triSubtris = (const U8*)p.triSubtris + p.maxSubtris * blockIdx.z;
+    const CRTriangleHeader* triHeader  = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
+
+    S32* binFirstSeg = (S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
+    S32* binTotal    = (S32*)p.binTotal    + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
+    S32* binSegData  = (S32*)p.binSegData  + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
+    S32* binSegNext  = (S32*)p.binSegNext  + p.maxBinSegs * blockIdx.z;
+    S32* binSegCount = (S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
+
+    if (atomics.numSubtris > p.maxSubtris)
+        return;
+
+    // per-thread state
+    int thrInBlock = threadIdx.x + threadIdx.y * 32;
+    int batchPos = 0;
+
+    // first 16 elements of s_broadcast are always zero
+    if (thrInBlock < 16)
+        s_broadcast[thrInBlock] = 0;
+
+    // initialize output linked lists and offsets
+    if (thrInBlock < p.numBins)
+    {
+        binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = -1;
+        s_outOfs[thrInBlock] = -CR_BIN_SEG_SIZE;
+        s_outTotal[thrInBlock] = 0;
+    }
+
+    // repeat until done
+    for(;;)
+    {
+        // get batch
+        if (thrInBlock == 0)
+            s_batchPos = atomicAdd(&atomics.binCounter, ip.binBatchSize);
+        __syncthreads();
+        batchPos = s_batchPos;
+
+        // all batches done?
+        if (batchPos >= ip.triCount)
+            break;
+
+        // per-thread state
+        int bufIndex = 0;
+        int bufCount = 0;
+        int batchEnd = min(batchPos + ip.binBatchSize, ip.triCount);
+
+        // loop over batch as long as we have triangles in it
+        do
+        {
+            // read more triangles
+            while (bufCount < CR_BIN_WARPS*32 && batchPos < batchEnd)
+            {
+                // get subtriangle count
+
+                int triIdx = batchPos + thrInBlock;
+                int num = 0;
+                if (triIdx < batchEnd)
+                    num = triSubtris[triIdx];
+
+                // cumulative sum of subtriangles within each warp
+                U32 myIdx = __popc(__ballot_sync(~0u, num & 1) & getLaneMaskLt());
+                if (__any_sync(~0u, num > 1))
+                {
+                    myIdx += __popc(__ballot_sync(~0u, num & 2) & getLaneMaskLt()) * 2;
+                    myIdx += __popc(__ballot_sync(~0u, num & 4) & getLaneMaskLt()) * 4;
+                }
+                if (threadIdx.x == 31) // Do not assume that last thread in warp wins the write.
+                    s_broadcast[threadIdx.y + 16] = myIdx + num;
+                __syncthreads();
+
+                // cumulative sum of per-warp subtriangle counts
+                // Note: cannot have more than 32 warps or this needs to sync between each step.
+                bool act = (thrInBlock < CR_BIN_WARPS);
+                U32 actMask = __ballot_sync(~0u, act);
+                if (threadIdx.y == 0 && act)
+                {
+                    volatile U32* ptr = &s_broadcast[thrInBlock + 16];
+                    U32 val = *ptr;
+#if (CR_BIN_WARPS > 1)
+                    val += ptr[-1]; __syncwarp(actMask);
+                    *ptr = val; __syncwarp(actMask);
+#endif
+#if (CR_BIN_WARPS > 2)
+                    val += ptr[-2]; __syncwarp(actMask);
+                    *ptr = val; __syncwarp(actMask);
+#endif
+#if (CR_BIN_WARPS > 4)
+                    val += ptr[-4]; __syncwarp(actMask);
+                    *ptr = val; __syncwarp(actMask);
+#endif
+#if (CR_BIN_WARPS > 8)
+                    val += ptr[-8]; __syncwarp(actMask);
+                    *ptr = val; __syncwarp(actMask);
+#endif
+#if (CR_BIN_WARPS > 16)
+                    val += ptr[-16]; __syncwarp(actMask);
+                    *ptr = val; __syncwarp(actMask);
+#endif
+
+                    // initially assume that we consume everything
+                    // only last active thread does the writes
+                    if (threadIdx.x == CR_BIN_WARPS - 1)
+                    {
+                        s_batchPos = batchPos + CR_BIN_WARPS * 32;
+                        s_bufCount = bufCount + val;
+                    }
+                }
+                __syncthreads();
+
+                // skip if no subtriangles
+                if (num)
+                {
+                    // calculate write position for first subtriangle
+                    U32 pos = bufCount + myIdx + s_broadcast[threadIdx.y + 16 - 1];
+
+                    // only write if entire triangle fits
+                    if (pos + num <= CR_ARRAY_SIZE(s_triBuf))
+                    {
+                        pos += bufIndex; // adjust for current start position
+                        pos &= CR_ARRAY_SIZE(s_triBuf)-1;
+                        if (num == 1)
+                            s_triBuf[pos] = triIdx * 8 + 7; // single triangle
+                        else
+                        {
+                            for (int i=0; i < num; i++)
+                            {
+                                s_triBuf[pos] = triIdx * 8 + i;
+                                pos++;
+                                pos &= CR_ARRAY_SIZE(s_triBuf)-1;
+                            }
+                        }
+                    } else if (pos <= CR_ARRAY_SIZE(s_triBuf))
+                    {
+                        // this triangle is the first that failed, overwrite total count and triangle count
+                        s_batchPos = batchPos + thrInBlock;
+                        s_bufCount = pos;
+                    }
+                }
+
+                // update triangle counts
+                __syncthreads();
+                batchPos = s_batchPos;
+                bufCount = s_bufCount;
+            }
+
+            // make every warp clear its output buffers
+            for (int i=threadIdx.x; i < p.numBins; i += 32)
+                s_outMask[threadIdx.y][i] = 0;
+            __syncwarp();
+
+            // choose our triangle
+            uint4 triData = make_uint4(0, 0, 0, 0);
+            if (thrInBlock < bufCount)
+            {
+                U32 triPos = bufIndex + thrInBlock;
+                triPos &= CR_ARRAY_SIZE(s_triBuf)-1;
+
+                // find triangle
+                int triIdx = s_triBuf[triPos];
+                int dataIdx = triIdx >> 3;
+                int subtriIdx = triIdx & 7;
+                if (subtriIdx != 7)
+                    dataIdx = triHeader[dataIdx].misc + subtriIdx;
+
+                // read triangle
+
+                triData = *(((const uint4*)triHeader) + dataIdx);
+            }
+
+            // setup bounding box and edge functions, and rasterize
+            S32 lox, loy, hix, hiy;
+            bool hasTri = (thrInBlock < bufCount);
+            U32 hasTriMask = __ballot_sync(~0u, hasTri);
+            if (hasTri)
+            {
+                S32 v0x = add_s16lo_s16lo(triData.x, p.widthPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
+                S32 v0y = add_s16hi_s16lo(triData.x, p.heightPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
+                S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
+                S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
+                S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
+                S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
+                int binLog = CR_BIN_LOG2 + CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
+                lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
+                loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
+                hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
+                hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
+
+                U32 bit = 1 << threadIdx.x;
+#if __CUDA_ARCH__ >= 700
+                bool multi = (hix != lox || hiy != loy);
+                if (!__any_sync(hasTriMask, multi))
+                {
+                    int binIdx = lox + p.widthBins * loy;
+                    U32 mask = __match_any_sync(hasTriMask, binIdx);
+                    s_outMask[threadIdx.y][binIdx] = mask;
+                    __syncwarp(hasTriMask);
+                } else
+#endif
+                {
+                    bool complex = (hix > lox+1 || hiy > loy+1);
+                    if (!__any_sync(hasTriMask, complex))
+                    {
+                        int binIdx = lox + p.widthBins * loy;
+                        atomicOr((U32*)&s_outMask[threadIdx.y][binIdx], bit);
+                        if (hix > lox)              atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + 1], bit);
+                        if (hiy > loy)              atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins], bit);
+                        if (hix > lox && hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins + 1], bit);
+                    } else
+                    {
+                        S32 d12x = d02x - d01x, d12y = d02y - d01y;
+                        v0x -= lox << binLog, v0y -= loy << binLog;
+
+                        S32 t01 = v0x * d01y - v0y * d01x;
+                        S32 t02 = v0y * d02x - v0x * d02y;
+                        S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
+                        S32 b01 = add_sub(t01 >> binLog, max(d01x, 0), min(d01y, 0));
+                        S32 b02 = add_sub(t02 >> binLog, max(d02y, 0), min(d02x, 0));
+                        S32 b12 = add_sub(t12 >> binLog, max(d12x, 0), min(d12y, 0));
+
+                        int width = hix - lox + 1;
+                        d01x += width * d01y;
+                        d02x += width * d02y;
+                        d12x += width * d12y;
+
+                        U8* currPtr = (U8*)&s_outMask[threadIdx.y][lox + loy * p.widthBins];
+                        U8* skipPtr = (U8*)&s_outMask[threadIdx.y][(hix + 1) + loy * p.widthBins];
+                        U8* endPtr  = (U8*)&s_outMask[threadIdx.y][lox + (hiy + 1) * p.widthBins];
+                        int stride  = p.widthBins * 4;
+                        int ptrYInc = stride - width * 4;
+
+                        do
+                        {
+                            if (b01 >= 0 && b02 >= 0 && b12 >= 0)
+                                atomicOr((U32*)currPtr, bit);
+                            currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
+                            if (currPtr == skipPtr)
+                                currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += stride;
+                        }
+                        while (currPtr != endPtr);
+                    }
+                }
+            }
+
+            // count per-bin contributions
+            if (thrInBlock == 0)
+                s_overTotal = 0; // overflow counter
+
+            // ensure that out masks are done
+            __syncthreads();
+
+            int overIndex = -1;
+            bool act = (thrInBlock < p.numBins);
+            U32 actMask = __ballot_sync(~0u, act);
+            if (act)
+            {
+                U8* srcPtr = (U8*)&s_outMask[0][thrInBlock];
+                U8* dstPtr = (U8*)&s_outCount[0][thrInBlock];
+                int total = 0;
+                for (int i = 0; i < CR_BIN_WARPS; i++)
+                {
+                    total += __popc(*(U32*)srcPtr);
+                    *(U32*)dstPtr = total;
+                    srcPtr += (CR_MAXBINS_SQR + 1) * 4;
+                    dstPtr += (CR_MAXBINS_SQR + 1) * 4;
+                }
+
+                // overflow => request a new segment
+                int ofs = s_outOfs[thrInBlock];
+                bool ovr = (((ofs - 1) >> CR_BIN_SEG_LOG2) != (((ofs - 1) + total) >> CR_BIN_SEG_LOG2));
+                U32 ovrMask = __ballot_sync(actMask, ovr);
+                if (ovr)
+                {
+                    overIndex = __popc(ovrMask & getLaneMaskLt());
+                    if (overIndex == 0)
+                        s_broadcast[threadIdx.y + 16] = atomicAdd((U32*)&s_overTotal, __popc(ovrMask));
+                    __syncwarp(ovrMask);
+                    overIndex += s_broadcast[threadIdx.y + 16];
+                    s_overIndex[thrInBlock] = overIndex;
+                }
+            }
+
+            // sync after overTotal is ready
+            __syncthreads();
+
+            // at least one segment overflowed => allocate segments
+            U32 overTotal = s_overTotal;
+            U32 allocBase = 0;
+            if (overTotal > 0)
+            {
+                // allocate memory
+                if (thrInBlock == 0)
+                {
+                    U32 allocBase = atomicAdd(&atomics.numBinSegs, overTotal);
+                    s_allocBase = (allocBase + overTotal <= p.maxBinSegs) ? allocBase : 0;
+                }
+                __syncthreads();
+                allocBase = s_allocBase;
+
+                // did my bin overflow?
+                if (overIndex != -1)
+                {
+                    // calculate new segment index
+                    int segIdx = allocBase + overIndex;
+
+                    // add to linked list
+                    if (s_outOfs[thrInBlock] < 0)
+                        binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = segIdx;
+                    else
+                        binSegNext[(s_outOfs[thrInBlock] - 1) >> CR_BIN_SEG_LOG2] = segIdx;
+
+                    // defaults
+                    binSegNext [segIdx] = -1;
+                    binSegCount[segIdx] = CR_BIN_SEG_SIZE;
+                }
+            }
+
+            // concurrent emission -- each warp handles its own triangle
+            if (thrInBlock < bufCount)
+            {
+                int triPos = (bufIndex + thrInBlock) & (CR_ARRAY_SIZE(s_triBuf) - 1);
+                int currBin = lox + loy * p.widthBins;
+                int skipBin = (hix + 1) + loy * p.widthBins;
+                int endBin = lox + (hiy + 1) * p.widthBins;
+                int binYInc = p.widthBins - (hix - lox + 1);
+
+                // loop over triangle's bins
+                do
+                {
+                    U32 outMask = s_outMask[threadIdx.y][currBin];
+                    if (outMask & (1<<threadIdx.x))
+                    {
+                        int idx = __popc(outMask & getLaneMaskLt());
+                        if (threadIdx.y > 0)
+                            idx += s_outCount[threadIdx.y-1][currBin];
+
+                        int base = s_outOfs[currBin];
+                        int free = (-base) & (CR_BIN_SEG_SIZE - 1);
+                        if (idx >= free)
+                            idx += ((allocBase + s_overIndex[currBin]) << CR_BIN_SEG_LOG2) - free;
+                        else
+                            idx += base;
+
+                        binSegData[idx] = s_triBuf[triPos];
+                    }
+
+                    currBin++;
+                    if (currBin == skipBin)
+                        currBin += binYInc, skipBin += p.widthBins;
+                }
+                while (currBin != endBin);
+            }
+
+            // wait all triangles to finish, then replace overflown segment offsets
+            __syncthreads();
+            if (thrInBlock < p.numBins)
+            {
+                U32 total = s_outCount[CR_BIN_WARPS - 1][thrInBlock];
+                U32 oldOfs = s_outOfs[thrInBlock];
+                if (overIndex == -1)
+                    s_outOfs[thrInBlock] = oldOfs + total;
+                else
+                {
+                    int addr = oldOfs + total;
+                    addr = ((addr - 1) & (CR_BIN_SEG_SIZE - 1)) + 1;
+                    addr += (allocBase + overIndex) << CR_BIN_SEG_LOG2;
+                    s_outOfs[thrInBlock] = addr;
+                }
+                s_outTotal[thrInBlock] += total;
+            }
+
+            // these triangles are now done
+            int count = ::min(bufCount, CR_BIN_WARPS * 32);
+            bufCount -= count;
+            bufIndex += count;
+            bufIndex &= CR_ARRAY_SIZE(s_triBuf)-1;
+        }
+        while (bufCount > 0 || batchPos < batchEnd);
+
+        // flush all bins
+        if (thrInBlock < p.numBins)
+        {
+            int ofs = s_outOfs[thrInBlock];
+            if (ofs & (CR_BIN_SEG_SIZE-1))
+            {
+                int seg = ofs >> CR_BIN_SEG_LOG2;
+                binSegCount[seg] = ofs & (CR_BIN_SEG_SIZE-1);
+                s_outOfs[thrInBlock] = (ofs + CR_BIN_SEG_SIZE - 1) & -CR_BIN_SEG_SIZE;
+            }
+        }
+    }
+
+    // output totals
+    if (thrInBlock < p.numBins)
+        binTotal[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = s_outTotal[thrInBlock];
+}
+
+//------------------------------------------------------------------------
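The "cumulative sum of subtriangles within each warp" step above builds an exclusive warp prefix sum out of three `__ballot_sync`/`__popc` pairs, one per bit plane of the 0..7 subtriangle count. Below is a standalone sketch of just that trick; `laneMaskLt()` is assumed to mirror the `getLaneMaskLt()` helper used above, whose definition is not part of this diff.

```cuda
// Exclusive warp prefix sum for small (0..7) per-lane counts via
// per-bit-plane ballots, as in binRasterImpl above. Illustrative only.
#include <cstdio>
#include <cuda_runtime.h>

__device__ unsigned laneMaskLt()
{
    unsigned m;
    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(m)); // bits set for lanes below this one
    return m;
}

__global__ void exclusiveScanSmallCounts(const int* counts, int* offsets)
{
    int num = counts[threadIdx.x];   // assumed in 0..7
    unsigned lt = laneMaskLt();
    // Sum each bit plane over the lower lanes, weighted by the bit value:
    // sum_{j<i} num_j = sum_b 2^b * popc(lanes j<i with bit b of num_j set).
    unsigned idx  = __popc(__ballot_sync(~0u, num & 1) & lt);
    idx          += __popc(__ballot_sync(~0u, num & 2) & lt) * 2;
    idx          += __popc(__ballot_sync(~0u, num & 4) & lt) * 4;
    offsets[threadIdx.x] = (int)idx; // exclusive prefix sum
}

int main()
{
    int h_counts[32], h_offsets[32];
    for (int i = 0; i < 32; i++) h_counts[i] = i % 5;
    int *d_counts, *d_offsets;
    cudaMalloc(&d_counts,  sizeof(h_counts));
    cudaMalloc(&d_offsets, sizeof(h_offsets));
    cudaMemcpy(d_counts, h_counts, sizeof(h_counts), cudaMemcpyHostToDevice);
    exclusiveScanSmallCounts<<<1, 32>>>(d_counts, d_offsets);
    cudaMemcpy(h_offsets, d_offsets, sizeof(h_offsets), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 32; i++) printf("%d ", h_offsets[i]); // 0 0 1 3 6 10 10 11 ...
    printf("\n");
    cudaFree(d_counts); cudaFree(d_offsets);
    return 0;
}
```

Three ballots suffice because every per-lane count fits in three bits; the kernel above additionally guards the two upper planes with `__any_sync(~0u, num > 1)` purely as a fast path.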
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.cpp
ADDED
@@ -0,0 +1,94 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "../../framework.h"
+#include "Buffer.hpp"
+
+using namespace CR;
+
+//------------------------------------------------------------------------
+// GPU buffer.
+//------------------------------------------------------------------------
+
+Buffer::Buffer(void)
+:   m_gpuPtr(NULL),
+    m_bytes (0)
+{
+    // empty
+}
+
+Buffer::~Buffer(void)
+{
+    if (m_gpuPtr)
+        cudaFree(m_gpuPtr); // Don't throw an exception.
+}
+
+void Buffer::reset(size_t bytes)
+{
+    if (bytes == m_bytes)
+        return;
+
+    if (m_gpuPtr)
+    {
+        NVDR_CHECK_CUDA_ERROR(cudaFree(m_gpuPtr));
+        m_gpuPtr = NULL;
+    }
+
+    if (bytes > 0)
+        NVDR_CHECK_CUDA_ERROR(cudaMalloc(&m_gpuPtr, bytes));
+
+    m_bytes = bytes;
+}
+
+void Buffer::grow(size_t bytes)
+{
+    if (bytes > m_bytes)
+        reset(bytes);
+}
+
+//------------------------------------------------------------------------
+// Host buffer with page-locked memory.
+//------------------------------------------------------------------------
+
+HostBuffer::HostBuffer(void)
+:   m_hostPtr(NULL),
+    m_bytes  (0)
+{
+    // empty
+}
+
+HostBuffer::~HostBuffer(void)
+{
+    if (m_hostPtr)
+        cudaFreeHost(m_hostPtr); // Don't throw an exception.
+}
+
+void HostBuffer::reset(size_t bytes)
+{
+    if (bytes == m_bytes)
+        return;
+
+    if (m_hostPtr)
+    {
+        NVDR_CHECK_CUDA_ERROR(cudaFreeHost(m_hostPtr));
+        m_hostPtr = NULL;
+    }
+
+    if (bytes > 0)
+        NVDR_CHECK_CUDA_ERROR(cudaMallocHost(&m_hostPtr, bytes));
+
+    m_bytes = bytes;
+}
+
+void HostBuffer::grow(size_t bytes)
+{
+    if (bytes > m_bytes)
+        reset(bytes);
+}
+
+//------------------------------------------------------------------------
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.hpp
ADDED
@@ -0,0 +1,55 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+#include "Defs.hpp"
+
+namespace CR
+{
+//------------------------------------------------------------------------
+
+class Buffer
+{
+public:
+            Buffer      (void);
+            ~Buffer     (void);
+
+    void    reset       (size_t bytes);
+    void    grow        (size_t bytes);
+    void*   getPtr      (size_t offset = 0) { return (void*)(((uintptr_t)m_gpuPtr) + offset); }
+    size_t  getSize     (void) const        { return m_bytes; }
+
+    void    setPtr      (void* ptr)         { m_gpuPtr = ptr; }
+
+private:
+    void*   m_gpuPtr;
+    size_t  m_bytes;
+};
+
+//------------------------------------------------------------------------
+
+class HostBuffer
+{
+public:
+            HostBuffer  (void);
+            ~HostBuffer (void);
+
+    void    reset       (size_t bytes);
+    void    grow        (size_t bytes);
+    void*   getPtr      (void)              { return m_hostPtr; }
+    size_t  getSize     (void) const        { return m_bytes; }
+
+    void    setPtr      (void* ptr)         { m_hostPtr = ptr; }
+
+private:
+    void*   m_hostPtr;
+    size_t  m_bytes;
+};
+
+//------------------------------------------------------------------------
+}
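A quick note on the allocation pattern in `Buffer`/`HostBuffer` above: `grow()` reallocates only when the request exceeds the current size, and since it goes through `reset()`, old contents are not preserved across a reallocation. A small hypothetical usage sketch (not part of the upload):

```cpp
// Illustrative use of CR::Buffer as defined above.
#include "Buffer.hpp"

void bufferExample()
{
    CR::Buffer buf;
    buf.grow(1024);             // cudaMalloc() of 1 KB
    buf.grow(512);              // no-op: allocation already large enough
    buf.grow(4096);             // cudaFree() + cudaMalloc(); contents are NOT carried over
    void* p = buf.getPtr(256);  // GPU pointer at byte offset 256
    (void)p;
}                               // destructor cudaFree()s without throwing
```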
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CoarseRaster.inl
ADDED
@@ -0,0 +1,730 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+//------------------------------------------------------------------------
+
+__device__ __inline__ int globalTileIdx(int tileInBin, int widthTiles)
+{
+    int tileX = tileInBin & (CR_BIN_SIZE - 1);
+    int tileY = tileInBin >> CR_BIN_LOG2;
+    return tileX + tileY * widthTiles;
+}
+
+//------------------------------------------------------------------------
+
+__device__ __inline__ void coarseRasterImpl(const CRParams p)
+{
+    // Common.
+
+    __shared__ volatile U32 s_workCounter;
+    __shared__ volatile U32 s_scanTemp          [CR_COARSE_WARPS][48];              // 3KB
+
+    // Input.
+
+    __shared__ volatile U32 s_binOrder          [CR_MAXBINS_SQR];                   // 1KB
+    __shared__ volatile S32 s_binStreamCurrSeg  [CR_BIN_STREAMS_SIZE];              // 0KB
+    __shared__ volatile S32 s_binStreamFirstTri [CR_BIN_STREAMS_SIZE];              // 0KB
+    __shared__ volatile S32 s_triQueue          [CR_COARSE_QUEUE_SIZE];             // 4KB
+    __shared__ volatile S32 s_triQueueWritePos;
+    __shared__ volatile U32 s_binStreamSelectedOfs;
+    __shared__ volatile U32 s_binStreamSelectedSize;
+
+    // Output.
+
+    __shared__ volatile U32 s_warpEmitMask      [CR_COARSE_WARPS][CR_BIN_SQR + 1];  // 16KB, +1 to avoid bank collisions
+    __shared__ volatile U32 s_warpEmitPrefixSum [CR_COARSE_WARPS][CR_BIN_SQR + 1];  // 16KB, +1 to avoid bank collisions
+    __shared__ volatile U32 s_tileEmitPrefixSum [CR_BIN_SQR + 1];                   // 1KB, zero at the beginning
+    __shared__ volatile U32 s_tileAllocPrefixSum[CR_BIN_SQR + 1];                   // 1KB, zero at the beginning
+    __shared__ volatile S32 s_tileStreamCurrOfs [CR_BIN_SQR];                       // 1KB
+    __shared__ volatile U32 s_firstAllocSeg;
+    __shared__ volatile U32 s_firstActiveIdx;
+
+    // Pointers and constants.
+
+    CRAtomics& atomics = p.atomics[blockIdx.z];
+    const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
+    const S32* binFirstSeg = (const S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
+    const S32* binTotal = (const S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
+    const S32* binSegData = (const S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
+    const S32* binSegNext = (const S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
+    const S32* binSegCount = (const S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
+    S32* activeTiles = (S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z;
+    S32* tileFirstSeg = (S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
+    S32* tileSegData = (S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
+    S32* tileSegNext = (S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z;
+    S32* tileSegCount = (S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
+
+    int tileLog = CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
+    int thrInBlock = threadIdx.x + threadIdx.y * 32;
+    int emitShift = CR_BIN_LOG2 * 2 + 5; // We scan ((numEmits << emitShift) | numAllocs) over tiles.
+
+    if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs)
+        return;
+
+    // Initialize sharedmem arrays.
+
+    if (thrInBlock == 0)
+    {
+        s_tileEmitPrefixSum[0] = 0;
+        s_tileAllocPrefixSum[0] = 0;
+    }
+    s_scanTemp[threadIdx.y][threadIdx.x] = 0;
+
+    // Sort bins in descending order of triangle count.
+
+    for (int binIdx = thrInBlock; binIdx < p.numBins; binIdx += CR_COARSE_WARPS * 32)
+    {
+        int count = 0;
+        for (int i = 0; i < CR_BIN_STREAMS_SIZE; i++)
+            count += binTotal[(binIdx << CR_BIN_STREAMS_LOG2) + i];
+        s_binOrder[binIdx] = (~count << (CR_MAXBINS_LOG2 * 2)) | binIdx;
+    }
+
+    __syncthreads();
+    sortShared(s_binOrder, p.numBins);
+
+    // Process each bin by one block.
+
+    for (;;)
+    {
+        // Pick a bin for the block.
+
+        if (thrInBlock == 0)
+            s_workCounter = atomicAdd(&atomics.coarseCounter, 1);
+        __syncthreads();
+
+        int workCounter = s_workCounter;
+        if (workCounter >= p.numBins)
+            break;
+
+        U32 binOrder = s_binOrder[workCounter];
+        bool binEmpty = ((~binOrder >> (CR_MAXBINS_LOG2 * 2)) == 0);
+        if (binEmpty && !p.deferredClear)
+            break;
+
+        int binIdx = binOrder & (CR_MAXBINS_SQR - 1);
+
+        // Initialize input/output streams.
+
+        int triQueueWritePos = 0;
+        int triQueueReadPos = 0;
+
+        if (thrInBlock < CR_BIN_STREAMS_SIZE)
+        {
+            int segIdx = binFirstSeg[(binIdx << CR_BIN_STREAMS_LOG2) + thrInBlock];
+            s_binStreamCurrSeg[thrInBlock] = segIdx;
+            s_binStreamFirstTri[thrInBlock] = (segIdx == -1) ? ~0u : binSegData[segIdx << CR_BIN_SEG_LOG2];
+        }
+
+        for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
+            s_tileStreamCurrOfs[tileInBin] = -CR_TILE_SEG_SIZE;
+
+        // Initialize per-bin state.
+
+        int binY = idiv_fast(binIdx, p.widthBins);
+        int binX = binIdx - binY * p.widthBins;
+        int originX = (binX << (CR_BIN_LOG2 + tileLog)) - (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
+        int originY = (binY << (CR_BIN_LOG2 + tileLog)) - (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
+        int maxTileXInBin = ::min(p.widthTiles - (binX << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
+        int maxTileYInBin = ::min(p.heightTiles - (binY << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
+        int binTileIdx = (binX + binY * p.widthTiles) << CR_BIN_LOG2;
+
+        // Entire block: Merge input streams and process triangles.
+
+        if (!binEmpty)
+        do
+        {
+            //------------------------------------------------------------------------
+            // Merge.
+            //------------------------------------------------------------------------
+
+            // Entire block: Not enough triangles => merge and queue segments.
+            // NOTE: The bin exit criterion assumes that we queue more triangles than we actually need.
+
+            while (triQueueWritePos - triQueueReadPos <= CR_COARSE_WARPS * 32)
+            {
+                // First warp: Choose the segment with the lowest initial triangle index.
+
+                bool hasStream = (thrInBlock < CR_BIN_STREAMS_SIZE);
+                U32 hasStreamMask = __ballot_sync(~0u, hasStream);
+                if (hasStream)
+                {
+                    // Find the stream with the lowest triangle index.
+
+                    U32 firstTri = s_binStreamFirstTri[thrInBlock];
+                    U32 t = firstTri;
+                    volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
+
+#if (CR_BIN_STREAMS_SIZE > 1)
+                    v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-1]); __syncwarp(hasStreamMask);
+#endif
+#if (CR_BIN_STREAMS_SIZE > 2)
+                    v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-2]); __syncwarp(hasStreamMask);
+#endif
+#if (CR_BIN_STREAMS_SIZE > 4)
+                    v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-4]); __syncwarp(hasStreamMask);
+#endif
+#if (CR_BIN_STREAMS_SIZE > 8)
+                    v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-8]); __syncwarp(hasStreamMask);
+#endif
+#if (CR_BIN_STREAMS_SIZE > 16)
+                    v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-16]); __syncwarp(hasStreamMask);
+#endif
+                    v[0] = t; __syncwarp(hasStreamMask);
+
+                    // Consume and broadcast.
+
+                    bool first = (s_scanTemp[0][CR_BIN_STREAMS_SIZE - 1 + 16] == firstTri);
+                    U32 firstMask = __ballot_sync(hasStreamMask, first);
+                    if (first && (firstMask >> threadIdx.x) == 1u)
+                    {
+                        int segIdx = s_binStreamCurrSeg[thrInBlock];
+                        s_binStreamSelectedOfs = segIdx << CR_BIN_SEG_LOG2;
+                        if (segIdx != -1)
+                        {
+                            int segSize = binSegCount[segIdx];
+                            int segNext = binSegNext[segIdx];
+                            s_binStreamSelectedSize = segSize;
+                            s_triQueueWritePos = triQueueWritePos + segSize;
+                            s_binStreamCurrSeg[thrInBlock] = segNext;
+                            s_binStreamFirstTri[thrInBlock] = (segNext == -1) ? ~0u : binSegData[segNext << CR_BIN_SEG_LOG2];
+                        }
+                    }
+                }
+
+                // No more segments => break.
+
+                __syncthreads();
+                triQueueWritePos = s_triQueueWritePos;
+                int segOfs = s_binStreamSelectedOfs;
+                if (segOfs < 0)
+                    break;
+
+                int segSize = s_binStreamSelectedSize;
+                __syncthreads();
+
+                // Fetch triangles into the queue.
+
+                for (int idxInSeg = CR_COARSE_WARPS * 32 - 1 - thrInBlock; idxInSeg < segSize; idxInSeg += CR_COARSE_WARPS * 32)
+                {
+                    S32 triIdx = binSegData[segOfs + idxInSeg];
+                    s_triQueue[(triQueueWritePos - segSize + idxInSeg) & (CR_COARSE_QUEUE_SIZE - 1)] = triIdx;
+                }
+            }
+
+            // All threads: Clear emit masks.
+
+            for (int maskIdx = thrInBlock; maskIdx < CR_COARSE_WARPS * CR_BIN_SQR; maskIdx += CR_COARSE_WARPS * 32)
+                s_warpEmitMask[maskIdx >> (CR_BIN_LOG2 * 2)][maskIdx & (CR_BIN_SQR - 1)] = 0;
+
+            __syncthreads();
+
+            //------------------------------------------------------------------------
+            // Raster.
+            //------------------------------------------------------------------------
+
+            // Triangle per thread: Read from the queue.
+
+            int triIdx = -1;
+            if (triQueueReadPos + thrInBlock < triQueueWritePos)
+                triIdx = s_triQueue[(triQueueReadPos + thrInBlock) & (CR_COARSE_QUEUE_SIZE - 1)];
+
+            uint4 triData = make_uint4(0, 0, 0, 0);
+            if (triIdx != -1)
+            {
+                int dataIdx = triIdx >> 3;
+                int subtriIdx = triIdx & 7;
+                if (subtriIdx != 7)
+                    dataIdx = triHeader[dataIdx].misc + subtriIdx;
+                triData = *((uint4*)triHeader + dataIdx);
+            }
+
+            // 32 triangles per warp: Record emits (= tile intersections).
+
+            if (__any_sync(~0u, triIdx != -1))
+            {
+                S32 v0x = sub_s16lo_s16lo(triData.x, originX);
+                S32 v0y = sub_s16hi_s16lo(triData.x, originY);
+                S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
+                S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
+                S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
+                S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
+
+                // Compute tile-based AABB.
+
+                int lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
+                int loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
+                int hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
+                int hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
+                int sizex = add_sub(hix, 1, lox);
+                int sizey = add_sub(hiy, 1, loy);
+                int area = sizex * sizey;
+
+                // Miscellaneous init.
+
+                U8* currPtr = (U8*)&s_warpEmitMask[threadIdx.y][lox + (loy << CR_BIN_LOG2)];
+                int ptrYInc = CR_BIN_SIZE * 4 - (sizex << 2);
+                U32 maskBit = 1 << threadIdx.x;
+
+                // Case A: All AABBs are small => record the full AABB using atomics.
+
+                if (__all_sync(~0u, sizex <= 2 && sizey <= 2))
+                {
+                    if (triIdx != -1)
+                    {
+                        atomicOr((U32*)currPtr, maskBit);
+                        if (sizex == 2) atomicOr((U32*)(currPtr + 4), maskBit);
+                        if (sizey == 2) atomicOr((U32*)(currPtr + CR_BIN_SIZE * 4), maskBit);
+                        if (sizex == 2 && sizey == 2) atomicOr((U32*)(currPtr + 4 + CR_BIN_SIZE * 4), maskBit);
+                    }
+                }
+                else
+                {
+                    // Compute warp-AABB (scan-32).
+
+                    U32 aabbMask = add_sub(2 << hix, 0x20000 << hiy, 1 << lox) - (0x10000 << loy);
+                    if (triIdx == -1)
+                        aabbMask = 0;
+
+                    volatile U32* v = &s_scanTemp[threadIdx.y][threadIdx.x + 16];
+                    v[0] = aabbMask; __syncwarp(); aabbMask |= v[-1]; __syncwarp();
+                    v[0] = aabbMask; __syncwarp(); aabbMask |= v[-2]; __syncwarp();
+                    v[0] = aabbMask; __syncwarp(); aabbMask |= v[-4]; __syncwarp();
+                    v[0] = aabbMask; __syncwarp(); aabbMask |= v[-8]; __syncwarp();
+                    v[0] = aabbMask; __syncwarp(); aabbMask |= v[-16]; __syncwarp();
+                    v[0] = aabbMask; __syncwarp(); aabbMask = s_scanTemp[threadIdx.y][47];
+
+                    U32 maskX = aabbMask & 0xFFFF;
+                    U32 maskY = aabbMask >> 16;
+                    int wlox = findLeadingOne(maskX ^ (maskX - 1));
+                    int wloy = findLeadingOne(maskY ^ (maskY - 1));
+                    int whix = findLeadingOne(maskX);
+                    int whiy = findLeadingOne(maskY);
+                    int warea = (add_sub(whix, 1, wlox)) * (add_sub(whiy, 1, wloy));
+
+                    // Initialize edge functions.
+
+                    S32 d12x = d02x - d01x;
+                    S32 d12y = d02y - d01y;
+                    v0x -= lox << tileLog;
+                    v0y -= loy << tileLog;
+
+                    S32 t01 = v0x * d01y - v0y * d01x;
+                    S32 t02 = v0y * d02x - v0x * d02y;
+                    S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
+                    S32 b01 = add_sub(t01 >> tileLog, ::max(d01x, 0), ::min(d01y, 0));
+                    S32 b02 = add_sub(t02 >> tileLog, ::max(d02y, 0), ::min(d02x, 0));
+                    S32 b12 = add_sub(t12 >> tileLog, ::max(d12x, 0), ::min(d12y, 0));
+
+                    d01x += sizex * d01y;
+                    d02x += sizex * d02y;
+                    d12x += sizex * d12y;
+
+                    // Case B: Warp-AABB is not much larger than largest AABB => Check tiles in warp-AABB, record using ballots.
+                    if (__any_sync(~0u, warea * 4 <= area * 8))
+                    {
+                        // Not sure if this is any faster than Case C after all the post-Volta ballot mask tracking.
+                        bool act = (triIdx != -1);
+                        U32 actMask = __ballot_sync(~0u, act);
+                        if (act)
+                        {
+                            for (int y = wloy; y <= whiy; y++)
+                            {
+                                bool yIn = (y >= loy && y <= hiy);
+                                U32 yMask = __ballot_sync(actMask, yIn);
+                                if (yIn)
+                                {
+                                    for (int x = wlox; x <= whix; x++)
+                                    {
+                                        bool xyIn = (x >= lox && x <= hix);
+                                        U32 xyMask = __ballot_sync(yMask, xyIn);
+                                        if (xyIn)
+                                        {
+                                            U32 res = __ballot_sync(xyMask, b01 >= 0 && b02 >= 0 && b12 >= 0);
+                                            if (threadIdx.x == 31 - __clz(xyMask))
+                                                *(U32*)currPtr = res;
+                                            currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
+                                        }
+                                    }
+                                    currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x;
+                                }
+                            }
+                        }
+                    }
+
+                    // Case C: General case => Check tiles in AABB, record using atomics.
+
+                    else
+                    {
+                        if (triIdx != -1)
+                        {
+                            U8* skipPtr = currPtr + (sizex << 2);
+                            U8* endPtr = currPtr + (sizey << (CR_BIN_LOG2 + 2));
+                            do
+                            {
+                                if (b01 >= 0 && b02 >= 0 && b12 >= 0)
+                                    atomicOr((U32*)currPtr, maskBit);
+                                currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
+                                if (currPtr == skipPtr)
+                                    currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += CR_BIN_SIZE * 4;
+                            }
+                            while (currPtr != endPtr);
+                        }
+                    }
+                }
+            }
+
+            __syncthreads();
+
+            //------------------------------------------------------------------------
+            // Count.
+            //------------------------------------------------------------------------
+
+            // Tile per thread: Initialize prefix sums.
+
+            for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
+            {
+                int tileInBin = tileInBin_base + thrInBlock;
+                bool act = (tileInBin < CR_BIN_SQR);
+                U32 actMask = __ballot_sync(~0u, act);
+                if (act)
+                {
+                    // Compute prefix sum of emits over warps.
+
+                    U8* srcPtr = (U8*)&s_warpEmitMask[0][tileInBin];
+                    U8* dstPtr = (U8*)&s_warpEmitPrefixSum[0][tileInBin];
+                    int tileEmits = 0;
+                    for (int i = 0; i < CR_COARSE_WARPS; i++)
+                    {
+                        tileEmits += __popc(*(U32*)srcPtr);
+                        *(U32*)dstPtr = tileEmits;
+                        srcPtr += (CR_BIN_SQR + 1) * 4;
+                        dstPtr += (CR_BIN_SQR + 1) * 4;
+                    }
+
+                    // Determine the number of segments to allocate.
+
+                    int spaceLeft = -s_tileStreamCurrOfs[tileInBin] & (CR_TILE_SEG_SIZE - 1);
+                    int tileAllocs = (tileEmits - spaceLeft + CR_TILE_SEG_SIZE - 1) >> CR_TILE_SEG_LOG2;
+                    volatile U32* v = &s_tileEmitPrefixSum[tileInBin + 1];
+
+                    // All counters within the warp are small => compute prefix sum using ballot.
+
+                    if (!__any_sync(actMask, tileEmits >= 2))
+                    {
+                        U32 m = getLaneMaskLe();
+                        *v = (__popc(__ballot_sync(actMask, tileEmits & 1) & m) << emitShift) | __popc(__ballot_sync(actMask, tileAllocs & 1) & m);
+                    }
+
+                    // Otherwise => scan-32 within the warp.
+
+                    else
+                    {
+                        U32 sum = (tileEmits << emitShift) | tileAllocs;
+                        *v = sum; __syncwarp(actMask); if (threadIdx.x >= 1) sum += v[-1]; __syncwarp(actMask);
+                        *v = sum; __syncwarp(actMask); if (threadIdx.x >= 2) sum += v[-2]; __syncwarp(actMask);
+                        *v = sum; __syncwarp(actMask); if (threadIdx.x >= 4) sum += v[-4]; __syncwarp(actMask);
+                        *v = sum; __syncwarp(actMask); if (threadIdx.x >= 8) sum += v[-8]; __syncwarp(actMask);
+                        *v = sum; __syncwarp(actMask); if (threadIdx.x >= 16) sum += v[-16]; __syncwarp(actMask);
+                        *v = sum; __syncwarp(actMask);
+                    }
+                }
+            }
+
+            // First warp: Scan-8.
+
+            __syncthreads();
+
+            bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
+            U32 scan8Mask = __ballot_sync(~0u, scan8);
+            if (scan8)
+            {
+                int sum = s_tileEmitPrefixSum[(thrInBlock << 5) + 32];
+                volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
+                v[0] = sum; __syncwarp(scan8Mask);
+#if (CR_BIN_SQR > 1 * 32)
+                sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+#endif
+#if (CR_BIN_SQR > 2 * 32)
+                sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+#endif
+#if (CR_BIN_SQR > 4 * 32)
+                sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+#endif
+            }
+
+            __syncthreads();
+
+            // Tile per thread: Finalize prefix sums.
+            // Single thread: Allocate segments.
+
+            for (int tileInBin = thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
+            {
+                int sum = s_tileEmitPrefixSum[tileInBin + 1] + s_scanTemp[0][(tileInBin >> 5) + 15];
+                int numEmits = sum >> emitShift;
+                int numAllocs = sum & ((1 << emitShift) - 1);
+                s_tileEmitPrefixSum[tileInBin + 1] = numEmits;
+                s_tileAllocPrefixSum[tileInBin + 1] = numAllocs;
+
+                if (tileInBin == CR_BIN_SQR - 1 && numAllocs != 0)
+                {
+                    int t = atomicAdd(&atomics.numTileSegs, numAllocs);
+                    s_firstAllocSeg = (t + numAllocs <= p.maxTileSegs) ? t : 0;
+                }
+            }
+
+            __syncthreads();
+            int firstAllocSeg = s_firstAllocSeg;
+            int totalEmits = s_tileEmitPrefixSum[CR_BIN_SQR];
+            int totalAllocs = s_tileAllocPrefixSum[CR_BIN_SQR];
+
+            //------------------------------------------------------------------------
+            // Emit.
+            //------------------------------------------------------------------------
+
+            // Emit per thread: Write triangle index to globalmem.
+
+            for (int emitInBin = thrInBlock; emitInBin < totalEmits; emitInBin += CR_COARSE_WARPS * 32)
+            {
+                // Find tile in bin.
+
+                U8* tileBase = (U8*)&s_tileEmitPrefixSum[0];
+                U8* tilePtr = tileBase;
+                U8* ptr;
+
+#if (CR_BIN_SQR > 128)
+                ptr = tilePtr + 0x80 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+#endif
+#if (CR_BIN_SQR > 64)
+                ptr = tilePtr + 0x40 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+#endif
+#if (CR_BIN_SQR > 32)
+                ptr = tilePtr + 0x20 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+#endif
+#if (CR_BIN_SQR > 16)
+                ptr = tilePtr + 0x10 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+#endif
+#if (CR_BIN_SQR > 8)
+                ptr = tilePtr + 0x08 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+#endif
+#if (CR_BIN_SQR > 4)
+                ptr = tilePtr + 0x04 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+#endif
+#if (CR_BIN_SQR > 2)
+                ptr = tilePtr + 0x02 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+#endif
+#if (CR_BIN_SQR > 1)
+                ptr = tilePtr + 0x01 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
+#endif
+
+                int tileInBin = (tilePtr - tileBase) >> 2;
+                int emitInTile = emitInBin - *(U32*)tilePtr;
+
+                // Find warp in tile.
+
+                int warpStep = (CR_BIN_SQR + 1) * 4;
+                U8* warpBase = (U8*)&s_warpEmitPrefixSum[0][tileInBin] - warpStep;
+                U8* warpPtr = warpBase;
+
+#if (CR_COARSE_WARPS > 8)
+                ptr = warpPtr + 0x08 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
+#endif
+#if (CR_COARSE_WARPS > 4)
+                ptr = warpPtr + 0x04 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
+#endif
+#if (CR_COARSE_WARPS > 2)
+                ptr = warpPtr + 0x02 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
+#endif
+#if (CR_COARSE_WARPS > 1)
+                ptr = warpPtr + 0x01 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
+#endif
+
+                int warpInTile = (warpPtr - warpBase) >> (CR_BIN_LOG2 * 2 + 2);
+                U32 emitMask = *(U32*)(warpPtr + warpStep + ((U8*)s_warpEmitMask - (U8*)s_warpEmitPrefixSum));
+                int emitInWarp = emitInTile - *(U32*)(warpPtr + warpStep) + __popc(emitMask);
+
+                // Find thread in warp.
+
+                int threadInWarp = 0;
+                int pop = __popc(emitMask & 0xFFFF);
+                bool pred = (emitInWarp >= pop);
+                if (pred) emitInWarp -= pop;
+                if (pred) emitMask >>= 0x10;
+                if (pred) threadInWarp += 0x10;
+
+                pop = __popc(emitMask & 0xFF);
+                pred = (emitInWarp >= pop);
+                if (pred) emitInWarp -= pop;
+                if (pred) emitMask >>= 0x08;
+                if (pred) threadInWarp += 0x08;
+
+                pop = __popc(emitMask & 0xF);
+                pred = (emitInWarp >= pop);
+                if (pred) emitInWarp -= pop;
+                if (pred) emitMask >>= 0x04;
+                if (pred) threadInWarp += 0x04;
+
+                pop = __popc(emitMask & 0x3);
+                pred = (emitInWarp >= pop);
+                if (pred) emitInWarp -= pop;
+                if (pred) emitMask >>= 0x02;
+                if (pred) threadInWarp += 0x02;
+
+                if (emitInWarp >= (emitMask & 1))
+                    threadInWarp++;
+
+                // Figure out where to write.
+
+                int currOfs = s_tileStreamCurrOfs[tileInBin];
+                int spaceLeft = -currOfs & (CR_TILE_SEG_SIZE - 1);
+                int outOfs = emitInTile;
+
+                if (outOfs < spaceLeft)
+                    outOfs += currOfs;
+                else
+                {
+                    int allocLo = firstAllocSeg + s_tileAllocPrefixSum[tileInBin];
+                    outOfs += (allocLo << CR_TILE_SEG_LOG2) - spaceLeft;
+                }
+
+                // Write.
+
+                int queueIdx = warpInTile * 32 + threadInWarp;
+                int triIdx = s_triQueue[(triQueueReadPos + queueIdx) & (CR_COARSE_QUEUE_SIZE - 1)];
+
+                tileSegData[outOfs] = triIdx;
+            }
+
+            //------------------------------------------------------------------------
+            // Patch.
+            //------------------------------------------------------------------------
+
+            // Allocated segment per thread: Initialize next-pointer and count.
+
+            for (int i = CR_COARSE_WARPS * 32 - 1 - thrInBlock; i < totalAllocs; i += CR_COARSE_WARPS * 32)
+            {
+                int segIdx = firstAllocSeg + i;
+                tileSegNext[segIdx] = segIdx + 1;
+                tileSegCount[segIdx] = CR_TILE_SEG_SIZE;
+            }
+
+            // Tile per thread: Fix previous segment's next-pointer and update s_tileStreamCurrOfs.
+
+            __syncthreads();
+            for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
+            {
+                int oldOfs = s_tileStreamCurrOfs[tileInBin];
+                int newOfs = oldOfs + s_warpEmitPrefixSum[CR_COARSE_WARPS - 1][tileInBin];
+                int allocLo = s_tileAllocPrefixSum[tileInBin];
+                int allocHi = s_tileAllocPrefixSum[tileInBin + 1];
+
+                if (allocLo != allocHi)
+                {
+                    S32* nextPtr = &tileSegNext[(oldOfs - 1) >> CR_TILE_SEG_LOG2];
+                    if (oldOfs < 0)
+                        nextPtr = &tileFirstSeg[binTileIdx + globalTileIdx(tileInBin, p.widthTiles)];
+                    *nextPtr = firstAllocSeg + allocLo;
+
+                    newOfs--;
+                    newOfs &= CR_TILE_SEG_SIZE - 1;
+                    newOfs |= (firstAllocSeg + allocHi - 1) << CR_TILE_SEG_LOG2;
+                    newOfs++;
+                }
+                s_tileStreamCurrOfs[tileInBin] = newOfs;
+            }
+
+            // Advance queue read pointer.
+            // Queue became empty => bin done.
+
+            triQueueReadPos += CR_COARSE_WARPS * 32;
+        }
+        while (triQueueReadPos < triQueueWritePos);
+
+        // Tile per thread: Fix next-pointer and count of the last segment.
+        // 32 tiles per warp: Count active tiles.
+
+        __syncthreads();
+
+        for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
+        {
+            int tileInBin = tileInBin_base + thrInBlock;
+            bool act = (tileInBin < CR_BIN_SQR);
+            U32 actMask = __ballot_sync(~0u, act);
+            if (act)
+            {
+                int tileX = tileInBin & (CR_BIN_SIZE - 1);
+                int tileY = tileInBin >> CR_BIN_LOG2;
+                bool force = (p.deferredClear & tileX <= maxTileXInBin & tileY <= maxTileYInBin);
+
+                int ofs = s_tileStreamCurrOfs[tileInBin];
+                int segIdx = (ofs - 1) >> CR_TILE_SEG_LOG2;
+                int segCount = ofs & (CR_TILE_SEG_SIZE - 1);
+
+                if (ofs >= 0)
+                    tileSegNext[segIdx] = -1;
+                else if (force)
+                {
+                    s_tileStreamCurrOfs[tileInBin] = 0;
+                    tileFirstSeg[binTileIdx + tileX + tileY * p.widthTiles] = -1;
+                }
+
+                if (segCount != 0)
+                    tileSegCount[segIdx] = segCount;
+
+                U32 res = __ballot_sync(actMask, ofs >= 0 | force);
+                if (threadIdx.x == 0)
+                    s_scanTemp[0][(tileInBin >> 5) + 16] = __popc(res);
+            }
+        }
+
+        // First warp: Scan-8.
+        // One thread: Allocate space for active tiles.
+
+        __syncthreads();
+
+        bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
+        U32 scan8Mask = __ballot_sync(~0u, scan8);
+        if (scan8)
+        {
+            volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
+            U32 sum = v[0];
+#if (CR_BIN_SQR > 1 * 32)
+            sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+#endif
+#if (CR_BIN_SQR > 2 * 32)
+            sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+#endif
+#if (CR_BIN_SQR > 4 * 32)
+            sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
+#endif
+
+            if (thrInBlock == CR_BIN_SQR / 32 - 1)
+                s_firstActiveIdx = atomicAdd(&atomics.numActiveTiles, sum);
+        }
+
+        // Tile per thread: Output active tiles.
+
+        __syncthreads();
+
+        for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
+        {
+            int tileInBin = tileInBin_base + thrInBlock;
+            bool act = (tileInBin < CR_BIN_SQR) && (s_tileStreamCurrOfs[tileInBin] >= 0);
+            U32 actMask = __ballot_sync(~0u, act);
+            if (act)
+            {
+                int activeIdx = s_firstActiveIdx;
+                activeIdx += s_scanTemp[0][(tileInBin >> 5) + 15];
+                activeIdx += __popc(actMask & getLaneMaskLt());
+                activeTiles[activeIdx] = binTileIdx + globalTileIdx(tileInBin, p.widthTiles);
+            }
+        }
+    }
+}
+
+//------------------------------------------------------------------------
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Constants.hpp
ADDED
@@ -0,0 +1,73 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+
+//------------------------------------------------------------------------
+
+#define CR_MAXVIEWPORT_LOG2     11      // ViewportSize / PixelSize.
+#define CR_SUBPIXEL_LOG2        4       // PixelSize / SubpixelSize.
+
+#define CR_MAXBINS_LOG2         4       // ViewportSize / BinSize.
+#define CR_BIN_LOG2             4       // BinSize / TileSize.
+#define CR_TILE_LOG2            3       // TileSize / PixelSize.
+
+#define CR_COVER8X8_LUT_SIZE    768     // 64-bit entries.
+#define CR_FLIPBIT_FLIP_Y       2
+#define CR_FLIPBIT_FLIP_X       3
+#define CR_FLIPBIT_SWAP_XY      4
+#define CR_FLIPBIT_COMPL        5
+
+#define CR_BIN_STREAMS_LOG2     4
+#define CR_BIN_SEG_LOG2         9       // 32-bit entries.
+#define CR_TILE_SEG_LOG2        5       // 32-bit entries.
+
+#define CR_MAXSUBTRIS_LOG2      24      // Triangle structs. Dictated by CoarseRaster.
+#define CR_COARSE_QUEUE_LOG2    10      // Triangles.
+
+#define CR_SETUP_WARPS          2
+#define CR_SETUP_OPT_BLOCKS     8
+#define CR_BIN_WARPS            16
+#define CR_COARSE_WARPS         16      // Must be a power of two.
+#define CR_FINE_MAX_WARPS       20
+
+#define CR_EMBED_IMAGE_PARAMS   32      // Number of per-image parameter structs embedded in kernel launch parameter block.
+
+//------------------------------------------------------------------------
+
+#define CR_MAXVIEWPORT_SIZE     (1 << CR_MAXVIEWPORT_LOG2)
+#define CR_SUBPIXEL_SIZE        (1 << CR_SUBPIXEL_LOG2)
+#define CR_SUBPIXEL_SQR         (1 << (CR_SUBPIXEL_LOG2 * 2))
+
+#define CR_MAXBINS_SIZE         (1 << CR_MAXBINS_LOG2)
+#define CR_MAXBINS_SQR          (1 << (CR_MAXBINS_LOG2 * 2))
+#define CR_BIN_SIZE             (1 << CR_BIN_LOG2)
+#define CR_BIN_SQR              (1 << (CR_BIN_LOG2 * 2))
+
+#define CR_MAXTILES_LOG2        (CR_MAXBINS_LOG2 + CR_BIN_LOG2)
+#define CR_MAXTILES_SIZE        (1 << CR_MAXTILES_LOG2)
+#define CR_MAXTILES_SQR         (1 << (CR_MAXTILES_LOG2 * 2))
+#define CR_TILE_SIZE            (1 << CR_TILE_LOG2)
+#define CR_TILE_SQR             (1 << (CR_TILE_LOG2 * 2))
+
+#define CR_BIN_STREAMS_SIZE     (1 << CR_BIN_STREAMS_LOG2)
+#define CR_BIN_SEG_SIZE         (1 << CR_BIN_SEG_LOG2)
+#define CR_TILE_SEG_SIZE        (1 << CR_TILE_SEG_LOG2)
+
+#define CR_MAXSUBTRIS_SIZE      (1 << CR_MAXSUBTRIS_LOG2)
+#define CR_COARSE_QUEUE_SIZE    (1 << CR_COARSE_QUEUE_LOG2)
+
+//------------------------------------------------------------------------
+// When evaluating interpolated Z pixel centers, we may introduce an error
+// of (+-CR_LERP_ERROR) ULPs.
+
+#define CR_LERP_ERROR(SAMPLES_LOG2) (2200u << (SAMPLES_LOG2))
+#define CR_DEPTH_MIN                CR_LERP_ERROR(3)
+#define CR_DEPTH_MAX                (CR_U32_MAX - CR_LERP_ERROR(3))
+
+//------------------------------------------------------------------------
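Note: every *_SIZE / *_SQR macro above is derived from a handful of LOG2 constants, so the geometric invariants the kernels depend on can be pinned down at compile time. A few illustrative static_asserts that hold under the values in this header (our addition, not part of the upload; include path assumed):

#include "Constants.hpp"

// 256 tiles of 8 pixels each span exactly the 2048-pixel max viewport.
static_assert(CR_MAXTILES_SIZE * CR_TILE_SIZE == CR_MAXVIEWPORT_SIZE,
              "max tiles * tile size must cover the max viewport");
// A bin is 16x16 tiles and a tile is 8x8 pixels, matching the 64-bit
// per-tile coverage masks used by the fine rasterizer.
static_assert(CR_BIN_SQR == 256 && CR_TILE_SQR == 64,
              "bin/tile dimensions changed; revisit kernel assumptions");
// CoarseRaster's binary searches over warps require a power of two.
static_assert((CR_COARSE_WARPS & (CR_COARSE_WARPS - 1)) == 0,
              "CR_COARSE_WARPS must be a power of two");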
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CudaRaster.cpp
ADDED
@@ -0,0 +1,79 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "Defs.hpp"
+#include "../CudaRaster.hpp"
+#include "RasterImpl.hpp"
+
+using namespace CR;
+
+//------------------------------------------------------------------------
+// Stub interface implementation.
+//------------------------------------------------------------------------
+
+CudaRaster::CudaRaster()
+{
+    m_impl = new RasterImpl();
+}
+
+CudaRaster::~CudaRaster()
+{
+    delete m_impl;
+}
+
+void CudaRaster::setBufferSize(int width, int height, int numImages)
+{
+    m_impl->setBufferSize(Vec3i(width, height, numImages));
+}
+
+void CudaRaster::setViewport(int width, int height, int offsetX, int offsetY)
+{
+    m_impl->setViewport(Vec2i(width, height), Vec2i(offsetX, offsetY));
+}
+
+void CudaRaster::setRenderModeFlags(U32 flags)
+{
+    m_impl->setRenderModeFlags(flags);
+}
+
+void CudaRaster::deferredClear(U32 clearColor)
+{
+    m_impl->deferredClear(clearColor);
+}
+
+void CudaRaster::setVertexBuffer(void* vertices, int numVertices)
+{
+    m_impl->setVertexBuffer(vertices, numVertices);
+}
+
+void CudaRaster::setIndexBuffer(void* indices, int numTriangles)
+{
+    m_impl->setIndexBuffer(indices, numTriangles);
+}
+
+bool CudaRaster::drawTriangles(const int* ranges, bool peel, cudaStream_t stream)
+{
+    return m_impl->drawTriangles((const Vec2i*)ranges, peel, stream);
+}
+
+void* CudaRaster::getColorBuffer(void)
+{
+    return m_impl->getColorBuffer();
+}
+
+void* CudaRaster::getDepthBuffer(void)
+{
+    return m_impl->getDepthBuffer();
+}
+
+void CudaRaster::swapDepthAndPeel(void)
+{
+    m_impl->swapDepthAndPeel();
+}
+
+//------------------------------------------------------------------------
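Note: the stub above only forwards to RasterImpl, so the public surface is small. A rough sketch of how a caller might drive it; the device-buffer layouts and the retry-on-false convention for drawTriangles are assumptions inferred from the method names, not taken from this diff:

#include <cuda_runtime.h>
#include "CudaRaster.hpp"   // assumed include path

// d_verts / d_indices are device pointers filled elsewhere; their exact
// element layout is defined by RasterImpl, not by this stub.
void renderOnce(void* d_verts, int numVerts, void* d_indices, int numTris)
{
    CR::CudaRaster cr;
    cr.setBufferSize(1024, 1024, 1);            // internal framebuffer, 1 image
    cr.setViewport(1024, 1024, 0, 0);
    cr.deferredClear(0u);                       // clear lazily during fine raster
    cr.setVertexBuffer(d_verts, numVerts);
    cr.setIndexBuffer(d_indices, numTris);
    // A false return presumably signals that internal buffers were grown
    // and the draw should be retried; null ranges assumed to mean "all".
    if (!cr.drawTriangles(nullptr, false, 0))
        cr.drawTriangles(nullptr, false, 0);
    void* d_color = cr.getColorBuffer();        // device pointers, valid until
    void* d_depth = cr.getDepthBuffer();        // cr is destroyed
    (void)d_color; (void)d_depth;
}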
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Defs.hpp
ADDED
@@ -0,0 +1,90 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+#include <cuda_runtime.h>
+#include <cstdint>
+
+namespace CR
+{
+//------------------------------------------------------------------------
+
+#ifndef NULL
+#   define NULL 0
+#endif
+
+#ifdef __CUDACC__
+#   define CR_CUDA 1
+#else
+#   define CR_CUDA 0
+#endif
+
+#if CR_CUDA
+#   define CR_CUDA_FUNC     __device__ __inline__
+#   define CR_CUDA_CONST    __constant__
+#else
+#   define CR_CUDA_FUNC     inline
+#   define CR_CUDA_CONST    static const
+#endif
+
+#define CR_UNREF(X)         ((void)(X))
+#define CR_ARRAY_SIZE(X)    ((int)(sizeof(X) / sizeof((X)[0])))
+
+//------------------------------------------------------------------------
+
+typedef uint8_t     U8;
+typedef uint16_t    U16;
+typedef uint32_t    U32;
+typedef uint64_t    U64;
+typedef int8_t      S8;
+typedef int16_t     S16;
+typedef int32_t     S32;
+typedef int64_t     S64;
+typedef float       F32;
+typedef double      F64;
+typedef void        (*FuncPtr)(void);
+
+//------------------------------------------------------------------------
+
+#define CR_U32_MAX          (0xFFFFFFFFu)
+#define CR_S32_MIN          (~0x7FFFFFFF)
+#define CR_S32_MAX          (0x7FFFFFFF)
+#define CR_U64_MAX          ((U64)(S64)-1)
+#define CR_S64_MIN          ((S64)-1 << 63)
+#define CR_S64_MAX          (~((S64)-1 << 63))
+#define CR_F32_MIN          (1.175494351e-38f)
+#define CR_F32_MAX          (3.402823466e+38f)
+#define CR_F64_MIN          (2.2250738585072014e-308)
+#define CR_F64_MAX          (1.7976931348623158e+308)
+
+//------------------------------------------------------------------------
+// Misc types.
+
+class Vec2i
+{
+public:
+    Vec2i(int x_, int y_) : x(x_), y(y_) {}
+    int x, y;
+};
+
+class Vec3i
+{
+public:
+    Vec3i(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
+    int x, y, z;
+};
+
+//------------------------------------------------------------------------
+// CUDA utilities.
+
+#if CR_CUDA
+#   define globalThreadIdx (threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * (blockIdx.x + gridDim.x * blockIdx.y)))
+#endif
+
+//------------------------------------------------------------------------
+} // namespace CR
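Note: the CR_CUDA / CR_CUDA_FUNC switch above lets a single header-defined helper compile as __device__ __inline__ under nvcc and as a plain inline function in host-only translation units. A tiny illustration with a hypothetical helper (packColor is ours, not from the upload):

#include "Defs.hpp"   // assumed include path

namespace CR
{
// Compiles as a __device__ function in .cu files and as an ordinary
// inline function in .cpp files, with no #ifdef at the call sites.
CR_CUDA_FUNC U32 packColor(U8 r, U8 g, U8 b, U8 a)
{
    return (U32)r | ((U32)g << 8) | ((U32)b << 16) | ((U32)a << 24);
}
} // namespace CR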
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_FineRaster.inl
ADDED
@@ -0,0 +1,385 @@
1 |
+
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
//
|
3 |
+
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
4 |
+
// and proprietary rights in and to this software, related documentation
|
5 |
+
// and any modifications thereto. Any use, reproduction, disclosure or
|
6 |
+
// distribution of this software and related documentation without an express
|
7 |
+
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
8 |
+
|
9 |
+
//------------------------------------------------------------------------
|
10 |
+
// Utility funcs.
|
11 |
+
//------------------------------------------------------------------------
|
12 |
+
|
13 |
+
__device__ __inline__ void initTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth)
|
14 |
+
{
|
15 |
+
tileZMax = CR_DEPTH_MAX;
|
16 |
+
tileZUpd = (::min(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]) < tileZMax);
|
17 |
+
}
|
18 |
+
|
19 |
+
__device__ __inline__ void updateTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth, volatile U32* temp)
|
20 |
+
{
|
21 |
+
// Entry is warp-coherent.
|
22 |
+
if (__any_sync(~0u, tileZUpd))
|
23 |
+
{
|
24 |
+
U32 z = ::max(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]); __syncwarp();
|
25 |
+
temp[threadIdx.x + 16] = z; __syncwarp();
|
26 |
+
z = ::max(z, temp[threadIdx.x + 16 - 1]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
|
27 |
+
z = ::max(z, temp[threadIdx.x + 16 - 2]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
|
28 |
+
z = ::max(z, temp[threadIdx.x + 16 - 4]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
|
29 |
+
z = ::max(z, temp[threadIdx.x + 16 - 8]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
|
30 |
+
z = ::max(z, temp[threadIdx.x + 16 - 16]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp();
|
31 |
+
tileZMax = temp[47];
|
32 |
+
tileZUpd = false;
|
33 |
+
}
|
34 |
+
}
|
35 |
+
|
36 |
+
//------------------------------------------------------------------------
|
37 |
+
|
38 |
+
__device__ __inline__ void getTriangle(const CRParams& p, S32& triIdx, S32& dataIdx, uint4& triHeader, S32& segment)
|
39 |
+
{
|
40 |
+
const CRTriangleHeader* triHeaderPtr = (const CRTriangleHeader*)p.triHeader + blockIdx.z * p.maxSubtris;;
|
41 |
+
const S32* tileSegData = (const S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
|
42 |
+
const S32* tileSegNext = (const S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z;
|
43 |
+
const S32* tileSegCount = (const S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
|
44 |
+
|
45 |
+
if (threadIdx.x >= tileSegCount[segment])
|
46 |
+
{
|
47 |
+
triIdx = -1;
|
48 |
+
dataIdx = -1;
|
49 |
+
}
|
50 |
+
else
|
51 |
+
{
|
52 |
+
int subtriIdx = tileSegData[segment * CR_TILE_SEG_SIZE + threadIdx.x];
|
53 |
+
triIdx = subtriIdx >> 3;
|
54 |
+
dataIdx = triIdx;
|
55 |
+
subtriIdx &= 7;
|
56 |
+
if (subtriIdx != 7)
|
57 |
+
dataIdx = triHeaderPtr[triIdx].misc + subtriIdx;
|
58 |
+
triHeader = *((uint4*)triHeaderPtr + dataIdx);
|
59 |
+
}
|
60 |
+
|
61 |
+
// advance to next segment
|
62 |
+
segment = tileSegNext[segment];
|
63 |
+
}
|
64 |
+
|
65 |
+
//------------------------------------------------------------------------
|
66 |
+
|
67 |
+
__device__ __inline__ bool earlyZCull(uint4 triHeader, U32 tileZMax)
|
68 |
+
{
|
69 |
+
U32 zmin = triHeader.w & 0xFFFFF000;
|
70 |
+
return (zmin > tileZMax);
|
71 |
+
}
|
72 |
+
|
73 |
+
//------------------------------------------------------------------------
|
74 |
+
|
75 |
+
__device__ __inline__ U64 trianglePixelCoverage(const CRParams& p, const uint4& triHeader, int tileX, int tileY, volatile U64* s_cover8x8_lut)
|
76 |
+
{
|
77 |
+
int baseX = (tileX << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.widthPixelsVp - 1) << (CR_SUBPIXEL_LOG2 - 1));
|
78 |
+
int baseY = (tileY << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.heightPixelsVp - 1) << (CR_SUBPIXEL_LOG2 - 1));
|
79 |
+
|
80 |
+
// extract S16 vertex positions while subtracting tile coordinates
|
81 |
+
S32 v0x = sub_s16lo_s16lo(triHeader.x, baseX);
|
82 |
+
S32 v0y = sub_s16hi_s16lo(triHeader.x, baseY);
|
83 |
+
S32 v01x = sub_s16lo_s16lo(triHeader.y, triHeader.x);
|
84 |
+
S32 v01y = sub_s16hi_s16hi(triHeader.y, triHeader.x);
|
85 |
+
S32 v20x = sub_s16lo_s16lo(triHeader.x, triHeader.z);
|
86 |
+
S32 v20y = sub_s16hi_s16hi(triHeader.x, triHeader.z);
|
87 |
+
|
88 |
+
// extract flipbits
|
89 |
+
U32 f01 = (triHeader.w >> 6) & 0x3C;
|
90 |
+
U32 f12 = (triHeader.w >> 2) & 0x3C;
|
91 |
+
U32 f20 = (triHeader.w << 2) & 0x3C;
|
92 |
+
|
93 |
+
// compute per-edge coverage masks
|
94 |
+
U64 c01, c12, c20;
|
95 |
+
c01 = cover8x8_exact_fast(v0x, v0y, v01x, v01y, f01, s_cover8x8_lut);
|
96 |
+
c12 = cover8x8_exact_fast(v0x + v01x, v0y + v01y, -v01x - v20x, -v01y - v20y, f12, s_cover8x8_lut);
|
97 |
+
c20 = cover8x8_exact_fast(v0x, v0y, v20x, v20y, f20, s_cover8x8_lut);
|
98 |
+
|
99 |
+
// combine masks
|
100 |
+
return c01 & c12 & c20;
|
101 |
+
}
|
102 |
+
|
103 |
+
//------------------------------------------------------------------------
|
104 |
+
|
105 |
+
__device__ __inline__ U32 scan32_value(U32 value, volatile U32* temp)
|
106 |
+
{
|
107 |
+
__syncwarp();
|
108 |
+
temp[threadIdx.x + 16] = value; __syncwarp();
|
109 |
+
value += temp[threadIdx.x + 16 - 1]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
|
110 |
+
value += temp[threadIdx.x + 16 - 2]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
|
111 |
+
value += temp[threadIdx.x + 16 - 4]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
|
112 |
+
value += temp[threadIdx.x + 16 - 8]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
|
113 |
+
value += temp[threadIdx.x + 16 - 16]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp();
|
114 |
+
return value;
|
115 |
+
}
|
116 |
+
|
117 |
+
__device__ __inline__ volatile const U32& scan32_total(volatile U32* temp)
|
118 |
+
{
|
119 |
+
return temp[47];
|
120 |
+
}
|
121 |
+
|
122 |
+
//------------------------------------------------------------------------
|
123 |
+
|
124 |
+
__device__ __inline__ S32 findBit(U64 mask, int idx)
|
125 |
+
{
|
126 |
+
U32 x = getLo(mask);
|
127 |
+
int pop = __popc(x);
|
128 |
+
bool p = (pop <= idx);
|
129 |
+
if (p) x = getHi(mask);
|
130 |
+
if (p) idx -= pop;
|
131 |
+
int bit = p ? 32 : 0;
|
132 |
+
|
133 |
+
pop = __popc(x & 0x0000ffffu);
|
134 |
+
p = (pop <= idx);
|
135 |
+
if (p) x >>= 16;
|
136 |
+
if (p) bit += 16;
|
137 |
+
if (p) idx -= pop;
|
138 |
+
|
139 |
+
U32 tmp = x & 0x000000ffu;
|
140 |
+
pop = __popc(tmp);
|
141 |
+
p = (pop <= idx);
|
142 |
+
if (p) tmp = x & 0x0000ff00u;
|
143 |
+
if (p) idx -= pop;
|
144 |
+
|
145 |
+
return findLeadingOne(tmp) + bit - idx;
|
146 |
+
}
|
147 |
+
|
148 |
+
//------------------------------------------------------------------------
|
149 |
+
// Single-sample implementation.
|
150 |
+
//------------------------------------------------------------------------
|
151 |
+
|
152 |
+
__device__ __inline__ void executeROP(U32 color, U32 depth, volatile U32* pColor, volatile U32* pDepth, U32 ropMask)
|
153 |
+
{
|
154 |
+
atomicMin((U32*)pDepth, depth);
|
155 |
+
__syncwarp(ropMask);
|
156 |
+
bool act = (depth == *pDepth);
|
157 |
+
__syncwarp(ropMask);
|
158 |
+
U32 actMask = __ballot_sync(ropMask, act);
|
159 |
+
if (act)
|
160 |
+
{
|
161 |
+
*pDepth = 0;
|
162 |
+
__syncwarp(actMask);
|
163 |
+
atomicMax((U32*)pDepth, threadIdx.x);
|
164 |
+
__syncwarp(actMask);
|
165 |
+
if (*pDepth == threadIdx.x)
|
166 |
+
{
|
167 |
+
*pDepth = depth;
|
168 |
+
*pColor = color;
|
169 |
+
}
|
170 |
+
__syncwarp(actMask);
|
171 |
+
}
|
172 |
+
}
|
173 |
+
|
174 |
+
//------------------------------------------------------------------------
|
175 |
+
|
176 |
+
__device__ __inline__ void fineRasterImpl(const CRParams p)
|
177 |
+
{
|
178 |
+
// for 20 warps:
|
179 |
+
__shared__ volatile U64 s_cover8x8_lut[CR_COVER8X8_LUT_SIZE]; // 6KB
|
180 |
+
__shared__ volatile U32 s_tileColor [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
|
181 |
+
__shared__ volatile U32 s_tileDepth [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
|
182 |
+
__shared__ volatile U32 s_tilePeel [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB
|
183 |
+
__shared__ volatile U32 s_triDataIdx [CR_FINE_MAX_WARPS][64]; // 5KB CRTriangleData index
|
184 |
+
__shared__ volatile U64 s_triangleCov [CR_FINE_MAX_WARPS][64]; // 10KB coverage mask
|
185 |
+
__shared__ volatile U32 s_triangleFrag[CR_FINE_MAX_WARPS][64]; // 5KB fragment index
|
186 |
+
__shared__ volatile U32 s_temp [CR_FINE_MAX_WARPS][80]; // 6.25KB
|
187 |
+
// = 47.25KB total
|
188 |
+
|
189 |
+
CRAtomics& atomics = p.atomics[blockIdx.z];
|
190 |
+
const CRTriangleData* triData = (const CRTriangleData*)p.triData + blockIdx.z * p.maxSubtris;
|
191 |
+
|
192 |
+
const S32* activeTiles = (const S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z;
|
193 |
+
const S32* tileFirstSeg = (const S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
|
194 |
+
|
195 |
+
volatile U32* tileColor = s_tileColor[threadIdx.y];
|
196 |
+
volatile U32* tileDepth = s_tileDepth[threadIdx.y];
|
197 |
+
volatile U32* tilePeel = s_tilePeel[threadIdx.y];
|
198 |
+
volatile U32* triDataIdx = s_triDataIdx[threadIdx.y];
|
199 |
+
volatile U64* triangleCov = s_triangleCov[threadIdx.y];
|
200 |
+
volatile U32* triangleFrag = s_triangleFrag[threadIdx.y];
|
201 |
+
volatile U32* temp = s_temp[threadIdx.y];
|
202 |
+
|
203 |
+
if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs || atomics.numTileSegs > p.maxTileSegs)
|
204 |
+
return;
|
205 |
+
|
206 |
+
temp[threadIdx.x] = 0; // first 16 elements of temp are always zero
|
207 |
+
cover8x8_setupLUT(s_cover8x8_lut);
|
208 |
+
__syncthreads();
|
209 |
+
|
210 |
+
// loop over tiles
|
211 |
+
for (;;)
|
212 |
+
{
|
213 |
+
// pick a tile
|
214 |
+
if (threadIdx.x == 0)
|
215 |
+
temp[16] = atomicAdd(&atomics.fineCounter, 1);
|
216 |
+
__syncwarp();
|
217 |
+
int activeIdx = temp[16];
|
218 |
+
if (activeIdx >= atomics.numActiveTiles)
|
219 |
+
break;
|
220 |
+
|
221 |
+
int tileIdx = activeTiles[activeIdx];
|
222 |
+
S32 segment = tileFirstSeg[tileIdx];
|
223 |
+
int tileY = tileIdx / p.widthTiles;
|
224 |
+
int tileX = tileIdx - tileY * p.widthTiles;
|
225 |
+
int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1));
|
226 |
+
int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2);
|
227 |
+
|
228 |
+
// initialize per-tile state
|
229 |
+
int triRead = 0, triWrite = 0;
|
230 |
+
int fragRead = 0, fragWrite = 0;
|
231 |
+
if (threadIdx.x == 0)
|
232 |
+
triangleFrag[63] = 0; // "previous triangle"
|
233 |
+
|
234 |
+
// deferred clear => clear tile
|
235 |
+
if (p.deferredClear)
|
236 |
+
{
|
237 |
+
tileColor[threadIdx.x] = p.clearColor;
|
238 |
+
tileDepth[threadIdx.x] = p.clearDepth;
|
239 |
+
tileColor[threadIdx.x + 32] = p.clearColor;
|
240 |
+
tileDepth[threadIdx.x + 32] = p.clearDepth;
|
241 |
+
}
|
242 |
+
else // otherwise => read tile from framebuffer
|
243 |
+
{
|
244 |
+
U32* pColor = (U32*)p.colorBuffer + p.strideX * p.strideY * blockIdx.z;
|
245 |
+
U32* pDepth = (U32*)p.depthBuffer + p.strideX * p.strideY * blockIdx.z;
|
246 |
+
tileColor[threadIdx.x] = pColor[px + p.strideX * py];
|
247 |
+
tileDepth[threadIdx.x] = pDepth[px + p.strideX * py];
|
248 |
+
tileColor[threadIdx.x + 32] = pColor[px + p.strideX * (py + 4)];
|
249 |
+
tileDepth[threadIdx.x + 32] = pDepth[px + p.strideX * (py + 4)];
|
250 |
+
}
|
251 |
+
|
252 |
+
// read peeling inputs if enabled
|
253 |
+
if (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling)
|
254 |
+
{
|
255 |
+
U32* pPeel = (U32*)p.peelBuffer + p.strideX * p.strideY * blockIdx.z;
|
256 |
+
tilePeel[threadIdx.x] = pPeel[px + p.strideX * py];
|
257 |
+
tilePeel[threadIdx.x + 32] = pPeel[px + p.strideX * (py + 4)];
|
258 |
+
}
|
259 |
+
|
260 |
+
U32 tileZMax;
|
261 |
+
bool tileZUpd;
|
262 |
+
initTileZMax(tileZMax, tileZUpd, tileDepth);
|
263 |
+
|
264 |
+
// process fragments
|
265 |
+
for(;;)
|
266 |
+
{
|
267 |
+
// need to queue more fragments?
|
268 |
+
if (fragWrite - fragRead < 32 && segment >= 0)
|
269 |
+
{
|
270 |
+
// update tile z - coherent over warp
|
271 |
+
updateTileZMax(tileZMax, tileZUpd, tileDepth, temp);
|
272 |
+
|
273 |
+
// read triangles
|
274 |
+
do
|
275 |
+
{
|
276 |
+
// read triangle index and data, advance to next segment
|
277 |
+
S32 triIdx, dataIdx;
|
278 |
+
uint4 triHeader;
|
279 |
+
getTriangle(p, triIdx, dataIdx, triHeader, segment);
|
280 |
+
|
281 |
+
// early z cull
|
282 |
+
if (triIdx >= 0 && earlyZCull(triHeader, tileZMax))
|
283 |
+
triIdx = -1;
|
284 |
+
|
285 |
+
// determine coverage
|
286 |
+
U64 coverage = trianglePixelCoverage(p, triHeader, tileX, tileY, s_cover8x8_lut);
|
287 |
+
S32 pop = (triIdx == -1) ? 0 : __popcll(coverage);
|
288 |
+
|
289 |
+
// fragment count scan
|
290 |
+
U32 frag = scan32_value(pop, temp);
|
291 |
+
frag += fragWrite; // frag now holds cumulative fragment count
|
292 |
+
fragWrite += scan32_total(temp);
|
293 |
+
|
294 |
+
// queue non-empty triangles
|
295 |
+
U32 goodMask = __ballot_sync(~0u, pop != 0);
|
296 |
+
if (pop != 0)
|
297 |
+
{
|
298 |
+
int idx = (triWrite + __popc(goodMask & getLaneMaskLt())) & 63;
|
299 |
+
triDataIdx [idx] = dataIdx;
|
300 |
+
triangleFrag[idx] = frag;
|
301 |
+
triangleCov [idx] = coverage;
|
302 |
+
}
|
303 |
+
triWrite += __popc(goodMask);
|
304 |
+
}
|
305 |
+
while (fragWrite - fragRead < 32 && segment >= 0);
|
306 |
+
}
|
307 |
+
__syncwarp();
|
308 |
+
|
309 |
+
// end of segment?
|
310 |
+
if (fragRead == fragWrite)
|
311 |
+
break;
|
312 |
+
|
313 |
+
// clear triangle boundaries
|
314 |
+
temp[threadIdx.x + 16] = 0;
|
315 |
+
__syncwarp();
|
316 |
+
|
317 |
+
// tag triangle boundaries
|
318 |
+
if (triRead + threadIdx.x < triWrite)
|
319 |
+
{
|
320 |
+
int idx = triangleFrag[(triRead + threadIdx.x) & 63] - fragRead;
|
321 |
+
if (idx <= 32)
|
322 |
+
temp[idx + 16 - 1] = 1;
|
323 |
+
}
|
324 |
+
__syncwarp();
|
325 |
+
|
326 |
+
int ropLaneIdx = threadIdx.x;
|
327 |
+
U32 boundaryMask = __ballot_sync(~0u, temp[ropLaneIdx + 16]);
|
328 |
+
|
329 |
+
// distribute fragments
|
330 |
+
bool hasFragment = (ropLaneIdx < fragWrite - fragRead);
|
331 |
+
U32 fragmentMask = __ballot_sync(~0u, hasFragment);
|
332 |
+
if (hasFragment)
|
333 |
+
{
|
334 |
+
int triBufIdx = (triRead + __popc(boundaryMask & getLaneMaskLt())) & 63;
|
335 |
+
int fragIdx = add_sub(fragRead, ropLaneIdx, triangleFrag[(triBufIdx - 1) & 63]);
|
336 |
+
U64 coverage = triangleCov[triBufIdx];
|
337 |
+
int pixelInTile = findBit(coverage, fragIdx);
|
338 |
+
int dataIdx = triDataIdx[triBufIdx];
|
339 |
+
|
340 |
+
// determine pixel position
|
341 |
+
U32 pixelX = (tileX << CR_TILE_LOG2) + (pixelInTile & 7);
|
342 |
+
U32 pixelY = (tileY << CR_TILE_LOG2) + (pixelInTile >> 3);
|
343 |
+
|
344 |
+
// depth test
|
345 |
+
U32 depth = 0;
|
346 |
+
uint4 td = *((uint4*)triData + dataIdx * (sizeof(CRTriangleData) >> 4));
|
347 |
+
|
348 |
+
depth = td.x * pixelX + td.y * pixelY + td.z;
|
349 |
+
bool zkill = (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) && (depth <= tilePeel[pixelInTile]);
|
350 |
+
if (!zkill)
|
351 |
+
{
|
352 |
+
U32 oldDepth = tileDepth[pixelInTile];
|
353 |
+
if (depth > oldDepth)
|
354 |
+
zkill = true;
|
355 |
+
else if (oldDepth == tileZMax)
|
356 |
+
tileZUpd = true; // we are replacing previous zmax => need to update
|
357 |
+
}
|
358 |
+
|
359 |
+
U32 ropMask = __ballot_sync(fragmentMask, !zkill);
|
360 |
+
if (!zkill)
|
361 |
+
executeROP(td.w, depth, &tileColor[pixelInTile], &tileDepth[pixelInTile], ropMask);
|
362 |
+
}
|
363 |
+
// no need to sync, as next up is updateTileZMax that does internal warp sync
|

            // update counters
            fragRead = ::min(fragRead + 32, fragWrite);
            triRead += __popc(boundaryMask);
        }

        // Write tile back to the framebuffer.
        if (true)
        {
            int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1));
            int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2);
            U32* pColor = (U32*)p.colorBuffer + p.strideX * p.strideY * blockIdx.z;
            U32* pDepth = (U32*)p.depthBuffer + p.strideX * p.strideY * blockIdx.z;
            pColor[px + p.strideX * py] = tileColor[threadIdx.x];
            pDepth[px + p.strideX * py] = tileDepth[threadIdx.x];
            pColor[px + p.strideX * (py + 4)] = tileColor[threadIdx.x + 32];
            pDepth[px + p.strideX * (py + 4)] = tileDepth[threadIdx.x + 32];
        }
    }
}

//------------------------------------------------------------------------
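In the fragment-distribution step above, each triangle's 8x8-tile coverage is a 64-bit mask and findBit(coverage, fragIdx) returns the bit index of the (fragIdx+1)-th covered pixel; that index then splits into tile-local coordinates as x = bit & 7, y = bit >> 3. A minimal standalone sketch of the mapping follows; findNthSetBit is an illustrative stand-in, not the library's findBit implementation.

#include <cstdint>
#include <cstdio>

// Illustrative stand-in for findBit(): bit index of the (n+1)-th set bit, or -1.
static int findNthSetBit(uint64_t mask, int n)
{
    for (int bit = 0; bit < 64; bit++)
        if (((mask >> bit) & 1) != 0 && n-- == 0)
            return bit;
    return -1;
}

int main(void)
{
    uint64_t coverage = 0xC3ull; // covered pixels: (0,0) (1,0) (6,0) (7,0)
    for (int fragIdx = 0; ; fragIdx++)
    {
        int pixelInTile = findNthSetBit(coverage, fragIdx);
        if (pixelInTile < 0)
            break; // fragIdx reached popcount(coverage)
        printf("frag %d -> pixel (%d, %d)\n", fragIdx, pixelInTile & 7, pixelInTile >> 3);
    }
    return 0;
}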
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_PrivateDefs.hpp
ADDED
@@ -0,0 +1,153 @@
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#pragma once
#include "Defs.hpp"
#include "Constants.hpp"

namespace CR
{
//------------------------------------------------------------------------
// Projected triangle.
//------------------------------------------------------------------------

struct CRTriangleHeader
{
    S16 v0x;    // Subpixels relative to viewport center. Valid if triSubtris = 1.
    S16 v0y;
    S16 v1x;
    S16 v1y;
    S16 v2x;
    S16 v2y;

    U32 misc;   // triSubtris = 1: (zmin:20, f01:4, f12:4, f20:4), triSubtris >= 2: (subtriBase)
};
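// [Editor's note: illustrative helpers, not part of the original file. Assuming
// the misc fields pack low-to-high in the order listed above, the single-subtri
// payload decodes as follows; the bit order is an assumption, not confirmed by
// this header.]
inline U32 triMiscZMin(U32 misc) { return misc & 0xFFFFFu; }     // zmin: low 20 bits
inline U32 triMiscF01 (U32 misc) { return (misc >> 20) & 0xFu; } // edge flags, 4 bits each
inline U32 triMiscF12 (U32 misc) { return (misc >> 24) & 0xFu; }
inline U32 triMiscF20 (U32 misc) { return (misc >> 28) & 0xFu; }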

//------------------------------------------------------------------------

struct CRTriangleData
{
    U32 zx;     // zx * sampleX + zy * sampleY + zb = lerp(CR_DEPTH_MIN, CR_DEPTH_MAX, (clipZ / clipW + 1) / 2)
    U32 zy;
    U32 zb;
    U32 id;     // Triangle id.
};
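// [Editor's note: illustrative, not part of the original file. Per the comment
// on zx, depth is an affine plane in pixel coordinates stored as fixed-point
// U32 coefficients; evaluating it per sample is two multiply-adds, exactly as
// the fine rasterizer does with td.x/td.y/td.z. Unsigned wraparound here is
// well-defined and intentional.]
inline U32 evalDepthPlane(const CRTriangleData& t, U32 sampleX, U32 sampleY)
{
    return t.zx * sampleX + t.zy * sampleY + t.zb;
}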

//------------------------------------------------------------------------
// Device-side structures.
//------------------------------------------------------------------------

struct CRAtomics
{
    // Setup.
    S32 numSubtris;         // = numTris

    // Bin.
    S32 binCounter;         // = 0
    S32 numBinSegs;         // = 0

    // Coarse.
    S32 coarseCounter;      // = 0
    S32 numTileSegs;        // = 0
    S32 numActiveTiles;     // = 0

    // Fine.
    S32 fineCounter;        // = 0
};

//------------------------------------------------------------------------

struct CRImageParams
{
    S32 triOffset;          // First triangle index to draw.
    S32 triCount;           // Number of triangles to draw.
    S32 binBatchSize;       // Number of triangles per batch.
};

//------------------------------------------------------------------------

struct CRParams
{
    // Common.

    CRAtomics* atomics;     // Work counters. Per-image.
    S32 numImages;          // Batch size.
    S32 totalCount;         // In range mode, total number of triangles to render.
    S32 instanceMode;       // 0 = range mode, 1 = instance mode.

    S32 numVertices;        // Number of vertices in input buffer, not counting multiples in instance mode.
    S32 numTriangles;       // Number of triangles in input buffer.
    void* vertexBuffer;     // numVertices * float4(x, y, z, w)
    void* indexBuffer;      // numTriangles * int3(vi0, vi1, vi2)

    S32 widthPixels;        // Render buffer size in pixels. Must be multiple of tile size (8x8).
    S32 heightPixels;
    S32 widthPixelsVp;      // Viewport size in pixels.
    S32 heightPixelsVp;
    S32 widthBins;          // widthPixels / CR_BIN_SIZE
    S32 heightBins;         // heightPixels / CR_BIN_SIZE
    S32 numBins;            // widthBins * heightBins

    F32 xs;                 // Vertex position adjustments for tiled rendering.
    F32 ys;
    F32 xo;
    F32 yo;

    S32 widthTiles;         // widthPixels / CR_TILE_SIZE
    S32 heightTiles;        // heightPixels / CR_TILE_SIZE
    S32 numTiles;           // widthTiles * heightTiles

    U32 renderModeFlags;
    S32 deferredClear;      // 1 = Clear framebuffer before rendering triangles.
    U32 clearColor;
    U32 clearDepth;

    // These are uniform across batch.

    S32 maxSubtris;
    S32 maxBinSegs;
    S32 maxTileSegs;

    // Setup output / bin input.

    void* triSubtris;       // maxSubtris * U8
    void* triHeader;        // maxSubtris * CRTriangleHeader
    void* triData;          // maxSubtris * CRTriangleData

    // Bin output / coarse input.

    void* binSegData;       // maxBinSegs * CR_BIN_SEG_SIZE * S32
    void* binSegNext;       // maxBinSegs * S32
    void* binSegCount;      // maxBinSegs * S32
    void* binFirstSeg;      // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 segIdx), -1 = none
    void* binTotal;         // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 numTris)

    // Coarse output / fine input.

    void* tileSegData;      // maxTileSegs * CR_TILE_SEG_SIZE * S32
    void* tileSegNext;      // maxTileSegs * S32
    void* tileSegCount;     // maxTileSegs * S32
    void* activeTiles;      // CR_MAXTILES_SQR * (S32 tileIdx)
    void* tileFirstSeg;     // CR_MAXTILES_SQR * (S32 segIdx), -1 = none

    // Surface buffers. Outer tile offset is baked into pointers.

    void* colorBuffer;      // sizePixels.x * sizePixels.y * numImages * U32
    void* depthBuffer;      // sizePixels.x * sizePixels.y * numImages * U32
    void* peelBuffer;       // sizePixels.x * sizePixels.y * numImages * U32, only if peeling enabled.
    S32 strideX;            // Horizontal buffer stride in pixels.
    S32 strideY;            // Vertical buffer stride in pixels.

    // Per-image parameters for first images are embedded here to avoid extra memcpy for small batches.

    CRImageParams imageParamsFirst[CR_EMBED_IMAGE_PARAMS];
    const CRImageParams* imageParamsExtra; // After CR_EMBED_IMAGE_PARAMS.
};

//------------------------------------------------------------------------
}
extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_RasterImpl.cpp
ADDED
@@ -0,0 +1,370 @@
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#include "../../framework.h"
#include "PrivateDefs.hpp"
#include "Constants.hpp"
#include "RasterImpl.hpp"
#include <cuda_runtime.h>

using namespace CR;
using std::min;
using std::max;

//------------------------------------------------------------------------
// Kernel prototypes and variables.

void triangleSetupKernel (const CRParams p);
void binRasterKernel     (const CRParams p);
void coarseRasterKernel  (const CRParams p);
void fineRasterKernel    (const CRParams p);

//------------------------------------------------------------------------

RasterImpl::RasterImpl(void)
:   m_renderModeFlags       (0),
    m_deferredClear         (false),
    m_clearColor            (0),
    m_vertexPtr             (NULL),
    m_indexPtr              (NULL),
    m_numVertices           (0),
    m_numTriangles          (0),
    m_bufferSizesReported   (0),

    m_numImages             (0),
    m_bufferSizePixels      (0, 0),
    m_bufferSizeVp          (0, 0),
    m_sizePixels            (0, 0),
    m_sizeVp                (0, 0),
    m_offsetPixels          (0, 0),
    m_sizeBins              (0, 0),
    m_numBins               (0),
    m_sizeTiles             (0, 0),
    m_numTiles              (0),

    m_numSMs                (1),
    m_numCoarseBlocksPerSM  (1),
    m_numFineBlocksPerSM    (1),
    m_numFineWarpsPerBlock  (1),

    m_maxSubtris            (1),
    m_maxBinSegs            (1),
    m_maxTileSegs           (1)
{
    // Query relevant device attributes.

    int currentDevice = 0;
    NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&currentDevice));
    NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&m_numSMs, cudaDevAttrMultiProcessorCount, currentDevice));
    cudaFuncAttributes attr;
    NVDR_CHECK_CUDA_ERROR(cudaFuncGetAttributes(&attr, (void*)fineRasterKernel));
    m_numFineWarpsPerBlock = min(attr.maxThreadsPerBlock / 32, CR_FINE_MAX_WARPS);
    NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numCoarseBlocksPerSM, (void*)coarseRasterKernel, 32 * CR_COARSE_WARPS, 0));
    NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numFineBlocksPerSM, (void*)fineRasterKernel, 32 * m_numFineWarpsPerBlock, 0));

    // Setup functions.

    NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)triangleSetupKernel, cudaFuncCachePreferShared));
    NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)binRasterKernel,     cudaFuncCachePreferShared));
    NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)coarseRasterKernel,  cudaFuncCachePreferShared));
    NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)fineRasterKernel,    cudaFuncCachePreferShared));
}
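// [Editor's note, not in the original file] The occupancy queries above size the
// coarse and fine stages as persistent kernels: launchStages() below launches
// exactly enough blocks to fill the device (m_numSMs * blocksPerSM) rather than
// one block per work item, and each block keeps pulling work through the atomic
// counters in CRAtomics until the queues run dry.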

//------------------------------------------------------------------------

RasterImpl::~RasterImpl(void)
{
    // Empty.
}

//------------------------------------------------------------------------

void RasterImpl::setBufferSize(Vec3i size)
{
    // Internal buffer width and height must be divisible by tile size.
    int w = (size.x + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
    int h = (size.y + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);

    m_bufferSizePixels = Vec2i(w, h);
    m_bufferSizeVp = Vec2i(size.x, size.y);
    m_numImages = size.z;

    m_colorBuffer.reset(w * h * size.z * sizeof(U32));
    m_depthBuffer.reset(w * h * size.z * sizeof(U32));
}
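// [Editor's note, not in the original file] The masking trick above rounds up to
// the next multiple of CR_TILE_SIZE and relies on the tile size being a power of
// two: (-N) in two's complement is all ones above bit log2(N), so adding N - 1
// and masking clears the low bits. A quick worked example with N = 8:
//
//     (13 + 7) & -8  ==  20 & ~7  ==  16
//     (16 + 7) & -8  ==  23 & ~7  ==  16   // already aligned, unchanged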

//------------------------------------------------------------------------

void RasterImpl::setViewport(Vec2i size, Vec2i offset)
{
    // Offset must be divisible by tile size.
    NVDR_CHECK((offset.x & (CR_TILE_SIZE - 1)) == 0 && (offset.y & (CR_TILE_SIZE - 1)) == 0, "invalid viewport offset");

    // Round internal viewport size to multiples of tile size.
    int w = (size.x + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);
    int h = (size.y + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE);

    m_sizePixels = Vec2i(w, h);
    m_offsetPixels = offset;
    m_sizeVp = Vec2i(size.x, size.y);
    m_sizeTiles.x = m_sizePixels.x >> CR_TILE_LOG2;
    m_sizeTiles.y = m_sizePixels.y >> CR_TILE_LOG2;
    m_numTiles = m_sizeTiles.x * m_sizeTiles.y;
    m_sizeBins.x = (m_sizeTiles.x + CR_BIN_SIZE - 1) >> CR_BIN_LOG2;
    m_sizeBins.y = (m_sizeTiles.y + CR_BIN_SIZE - 1) >> CR_BIN_LOG2;
    m_numBins = m_sizeBins.x * m_sizeBins.y;
}

void RasterImpl::swapDepthAndPeel(void)
{
    m_peelBuffer.reset(m_depthBuffer.getSize()); // Ensure equal size and valid pointer.

    void* tmp = m_depthBuffer.getPtr();
    m_depthBuffer.setPtr(m_peelBuffer.getPtr());
    m_peelBuffer.setPtr(tmp);
}
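Taken together, swapDepthAndPeel() and the peel flag of drawTriangles() below suggest the depth-peeling driver loop sketched here. This is a hypothetical usage sketch inferred from this file alone (the real driver lives in the public CudaRaster wrapper, not shown): the first pass renders normally, then each further pass promotes the previous pass's depths to the peel reference and re-rasterizes to extract the next depth layer.

// Hypothetical sketch; 'r' is a configured RasterImpl, names as in this file.
bool renderDepthLayers(RasterImpl& r, const Vec2i* ranges, int numLayers, cudaStream_t stream)
{
    if (!r.drawTriangles(ranges, /*peel=*/false, stream))
        return false;                       // full pipeline; nearest surface per pixel
    for (int layer = 1; layer < numLayers; layer++)
    {
        // ... read back or consume color/depth of the previous layer here ...
        r.swapDepthAndPeel();               // previous depths become the peel threshold
        r.drawTriangles(ranges, /*peel=*/true, stream); // fine stage only; next layer
    }
    return true;
}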

//------------------------------------------------------------------------

bool RasterImpl::drawTriangles(const Vec2i* ranges, bool peel, cudaStream_t stream)
{
    bool instanceMode = (!ranges);

    int maxSubtrisSlack  = 4096;    // x 81B = 324KB
    int maxBinSegsSlack  = 256;     // x 2137B = 534KB
    int maxTileSegsSlack = 4096;    // x 136B = 544KB

    // Resize atomics as needed.
    m_crAtomics    .grow(m_numImages * sizeof(CRAtomics));
    m_crAtomicsHost.grow(m_numImages * sizeof(CRAtomics));

    // Size of these buffers doesn't depend on input.
    m_binFirstSeg  .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
    m_binTotal     .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32));
    m_activeTiles  .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));
    m_tileFirstSeg .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32));

    // Construct per-image parameters and determine worst-case buffer sizes.
    m_crImageParamsHost.grow(m_numImages * sizeof(CRImageParams));
    CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr();
    for (int i = 0; i < m_numImages; i++)
    {
        CRImageParams& ip = imageParams[i];

        int roundSize  = CR_BIN_WARPS * 32;
        int minBatches = CR_BIN_STREAMS_SIZE * 2;
        int maxRounds  = 32;

        ip.triOffset = instanceMode ? 0 : ranges[i].x;
        ip.triCount  = instanceMode ? m_numTriangles : ranges[i].y;
        ip.binBatchSize = min(max(ip.triCount / (roundSize * minBatches), 1), maxRounds) * roundSize;

        m_maxSubtris  = max(m_maxSubtris,  min(ip.triCount + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE));
        m_maxBinSegs  = max(m_maxBinSegs,  max(m_numBins * CR_BIN_STREAMS_SIZE, (ip.triCount - 1) / CR_BIN_SEG_SIZE + 1) + maxBinSegsSlack);
        m_maxTileSegs = max(m_maxTileSegs, max(m_numTiles, (ip.triCount - 1) / CR_TILE_SEG_SIZE + 1) + maxTileSegsSlack);
    }

    // Retry until successful.

    for (;;)
    {
        // Allocate buffers.
        m_triSubtris.reset(m_numImages * m_maxSubtris * sizeof(U8));
        m_triHeader .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleHeader));
        m_triData   .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleData));

        m_binSegData .reset(m_numImages * m_maxBinSegs * CR_BIN_SEG_SIZE * sizeof(S32));
        m_binSegNext .reset(m_numImages * m_maxBinSegs * sizeof(S32));
        m_binSegCount.reset(m_numImages * m_maxBinSegs * sizeof(S32));

        m_tileSegData .reset(m_numImages * m_maxTileSegs * CR_TILE_SEG_SIZE * sizeof(S32));
        m_tileSegNext .reset(m_numImages * m_maxTileSegs * sizeof(S32));
        m_tileSegCount.reset(m_numImages * m_maxTileSegs * sizeof(S32));

        // Report if buffers grow from last time.
        size_t sizesTotal = getTotalBufferSizes();
        if (sizesTotal > m_bufferSizesReported)
        {
            size_t sizesMB = ((sizesTotal - 1) >> 20) + 1; // Round up.
            sizesMB = ((sizesMB + 9) / 10) * 10; // 10MB granularity enough in this day and age.
            LOG(INFO) << "Internal buffers grown to " << sizesMB << " MB";
            m_bufferSizesReported = sizesMB << 20;
        }

        // Launch stages. Blocks until everything is done.
        launchStages(instanceMode, peel, stream);

        // Peeling iteration cannot fail, so no point checking things further.
        if (peel)
            break;

        // Atomics after coarse stage are now available.
        CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr();

        // Success?
        bool failed = false;
        for (int i = 0; i < m_numImages; i++)
        {
            const CRAtomics& a = atomics[i];
            failed = failed || (a.numSubtris > m_maxSubtris) || (a.numBinSegs > m_maxBinSegs) || (a.numTileSegs > m_maxTileSegs);
        }
        if (!failed)
            break; // Success!

        // If we were already at maximum capacity, no can do.
        if (m_maxSubtris == CR_MAXSUBTRIS_SIZE)
            return false;

        // Enlarge buffers and try again.
        for (int i = 0; i < m_numImages; i++)
        {
            const CRAtomics& a = atomics[i];
            m_maxSubtris  = max(m_maxSubtris,  min(a.numSubtris + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE));
            m_maxBinSegs  = max(m_maxBinSegs,  a.numBinSegs  + maxBinSegsSlack);
            m_maxTileSegs = max(m_maxTileSegs, a.numTileSegs + maxTileSegsSlack);
        }
    }

    m_deferredClear = false;
    return true; // Success.
}
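// [Editor's note, not in the original file] drawTriangles() is an optimistic
// allocate-launch-verify loop: buffer capacities start as a guess, the device
// stages bump-allocate through the CRAtomics counters and simply stop emitting
// output once a buffer fills, and the host detects the overflow afterwards by
// comparing the counter totals against the capacities. It then grows the
// buffers and relaunches, giving up only at the hard CR_MAXSUBTRIS_SIZE cap.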

//------------------------------------------------------------------------

size_t RasterImpl::getTotalBufferSizes(void) const
{
    return
        m_colorBuffer.getSize() + m_depthBuffer.getSize() + // Don't include atomics and image params.
        m_triSubtris.getSize() + m_triHeader.getSize() + m_triData.getSize() +
        m_binFirstSeg.getSize() + m_binTotal.getSize() + m_binSegData.getSize() + m_binSegNext.getSize() + m_binSegCount.getSize() +
        m_activeTiles.getSize() + m_tileFirstSeg.getSize() + m_tileSegData.getSize() + m_tileSegNext.getSize() + m_tileSegCount.getSize();
}

//------------------------------------------------------------------------

void RasterImpl::launchStages(bool instanceMode, bool peel, cudaStream_t stream)
{
    CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr();

    // Unless peeling, initialize atomics to mostly zero.
    CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr();
    if (!peel)
    {
        memset(atomics, 0, m_numImages * sizeof(CRAtomics));
        for (int i = 0; i < m_numImages; i++)
            atomics[i].numSubtris = imageParams[i].triCount;
    }

    // Copy to device. If peeling, this is the state after coarse raster launch on first iteration.
    NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomics.getPtr(), atomics, m_numImages * sizeof(CRAtomics), cudaMemcpyHostToDevice, stream));

    // Copy per-image parameters if there are more than fit in the launch parameter block and we haven't done it already.
    if (!peel && m_numImages > CR_EMBED_IMAGE_PARAMS)
    {
        int numImageParamsExtra = m_numImages - CR_EMBED_IMAGE_PARAMS;
        m_crImageParamsExtra.grow(numImageParamsExtra * sizeof(CRImageParams));
        NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crImageParamsExtra.getPtr(), imageParams + CR_EMBED_IMAGE_PARAMS, numImageParamsExtra * sizeof(CRImageParams), cudaMemcpyHostToDevice, stream));
    }

    // Set global parameters.
    CRParams p;
    {
        p.atomics        = (CRAtomics*)m_crAtomics.getPtr();
        p.numImages      = m_numImages;
        p.totalCount     = 0; // Only relevant in range mode.
        p.instanceMode   = instanceMode ? 1 : 0;

        p.numVertices    = m_numVertices;
        p.numTriangles   = m_numTriangles;
        p.vertexBuffer   = m_vertexPtr;
        p.indexBuffer    = m_indexPtr;

        p.widthPixels    = m_sizePixels.x;
        p.heightPixels   = m_sizePixels.y;
        p.widthPixelsVp  = m_sizeVp.x;
        p.heightPixelsVp = m_sizeVp.y;
        p.widthBins      = m_sizeBins.x;
        p.heightBins     = m_sizeBins.y;
        p.numBins        = m_numBins;

        p.xs             = (float)m_bufferSizeVp.x / (float)m_sizeVp.x;
        p.ys             = (float)m_bufferSizeVp.y / (float)m_sizeVp.y;
        p.xo             = (float)(m_bufferSizeVp.x - m_sizeVp.x - 2 * m_offsetPixels.x) / (float)m_sizeVp.x;
        p.yo             = (float)(m_bufferSizeVp.y - m_sizeVp.y - 2 * m_offsetPixels.y) / (float)m_sizeVp.y;

        p.widthTiles     = m_sizeTiles.x;
        p.heightTiles    = m_sizeTiles.y;
        p.numTiles       = m_numTiles;

        p.renderModeFlags = m_renderModeFlags;
        p.deferredClear  = m_deferredClear ? 1 : 0;
        p.clearColor     = m_clearColor;
        p.clearDepth     = CR_DEPTH_MAX;

        p.maxSubtris     = m_maxSubtris;
        p.maxBinSegs     = m_maxBinSegs;
        p.maxTileSegs    = m_maxTileSegs;

        p.triSubtris     = m_triSubtris.getPtr();
        p.triHeader      = m_triHeader.getPtr();
        p.triData        = m_triData.getPtr();
        p.binSegData     = m_binSegData.getPtr();
        p.binSegNext     = m_binSegNext.getPtr();
        p.binSegCount    = m_binSegCount.getPtr();
        p.binFirstSeg    = m_binFirstSeg.getPtr();
        p.binTotal       = m_binTotal.getPtr();
        p.tileSegData    = m_tileSegData.getPtr();
        p.tileSegNext    = m_tileSegNext.getPtr();
        p.tileSegCount   = m_tileSegCount.getPtr();
        p.activeTiles    = m_activeTiles.getPtr();
        p.tileFirstSeg   = m_tileFirstSeg.getPtr();

        // Strides must be set before computing the viewport byte offset below.
        p.strideX        = m_bufferSizePixels.x;
        p.strideY        = m_bufferSizePixels.y;

        size_t byteOffset = ((size_t)m_offsetPixels.x + (size_t)m_offsetPixels.y * (size_t)p.strideX) * sizeof(U32);
        p.colorBuffer    = m_colorBuffer.getPtr(byteOffset);
        p.depthBuffer    = m_depthBuffer.getPtr(byteOffset);
        p.peelBuffer     = (m_renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) ? m_peelBuffer.getPtr(byteOffset) : 0;

        memcpy(&p.imageParamsFirst, imageParams, min(m_numImages, CR_EMBED_IMAGE_PARAMS) * sizeof(CRImageParams));
        p.imageParamsExtra = (CRImageParams*)m_crImageParamsExtra.getPtr();
    }
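    // [Editor's note, not in the original source] xs/ys/xo/yo above implement the
    // "vertex position adjustments for tiled rendering" from CRParams: clip-space
    // positions are remapped (x' = x * xs + w * xo, likewise for y) so that the
    // viewport rectangle of size m_sizeVp at m_offsetPixels inside the full
    // m_bufferSizeVp image lands on the internal render buffer. With a zero offset
    // and equal sizes this reduces to the identity (xs = ys = 1, xo = yo = 0).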

    // Setup block sizes.

    dim3 brBlock(32, CR_BIN_WARPS);
    dim3 crBlock(32, CR_COARSE_WARPS);
    dim3 frBlock(32, m_numFineWarpsPerBlock);
    void* args[] = {&p};

    // Launch stages from setup to coarse and copy atomics to host only if this is not a single-tile peeling iteration.
    if (!peel)
    {
        if (instanceMode)
        {
            int setupBlocks = (m_numTriangles - 1) / (32 * CR_SETUP_WARPS) + 1;
            NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, m_numImages), dim3(32, CR_SETUP_WARPS), args, 0, stream));
        }
        else
        {
            for (int i = 0; i < m_numImages; i++)
                p.totalCount += imageParams[i].triCount;
            int setupBlocks = (p.totalCount - 1) / (32 * CR_SETUP_WARPS) + 1;
            NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, 1), dim3(32, CR_SETUP_WARPS), args, 0, stream));
        }
        NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)binRasterKernel, dim3(CR_BIN_STREAMS_SIZE, 1, m_numImages), brBlock, args, 0, stream));
        NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)coarseRasterKernel, dim3(m_numSMs * m_numCoarseBlocksPerSM, 1, m_numImages), crBlock, args, 0, stream));
        NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomicsHost.getPtr(), m_crAtomics.getPtr(), sizeof(CRAtomics) * m_numImages, cudaMemcpyDeviceToHost, stream));
    }

    // Fine rasterizer is launched always.
    NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)fineRasterKernel, dim3(m_numSMs * m_numFineBlocksPerSM, 1, m_numImages), frBlock, args, 0, stream));
    NVDR_CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
}

//------------------------------------------------------------------------