geyik1 commited on
Commit
01b6284
·
verified ·
1 Parent(s): 226248d

Delete ginipick:SORA-3D

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. ginipick:SORA-3D/3d.mp4 +0 -3
  2. ginipick:SORA-3D/README.md +0 -11
  3. ginipick:SORA-3D/app.py +0 -2
  4. ginipick:SORA-3D/assets/.DS_Store +0 -0
  5. ginipick:SORA-3D/assets/example_image/.DS_Store +0 -0
  6. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T120910.945.webp +0 -0
  7. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133209.680.webp +0 -0
  8. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133232.481.webp +0 -0
  9. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133327.828.webp +0 -0
  10. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133551.674.webp +0 -0
  11. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133554.085.webp +0 -0
  12. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133942.986.webp +0 -0
  13. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133945.143.webp +0 -0
  14. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134251.217.webp +0 -0
  15. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134253.975.webp +0 -0
  16. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134602.793.webp +0 -0
  17. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134606.919.webp +0 -0
  18. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T050638.566.webp +0 -0
  19. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T102148.803.webp +0 -0
  20. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T124050.873.webp +0 -0
  21. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125348.492.webp +0 -0
  22. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125709.810.webp +0 -0
  23. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125745.419.webp +0 -0
  24. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T131128.626.webp +0 -0
  25. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T174905.915.webp +0 -0
  26. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184202.582.webp +0 -0
  27. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184251.254.webp +0 -3
  28. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184336.200.webp +0 -0
  29. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184407.431.webp +0 -0
  30. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184511.907.webp +0 -3
  31. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184535.205.webp +0 -0
  32. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184804.224.webp +0 -0
  33. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T033838.708.webp +0 -0
  34. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T034054.527.webp +0 -0
  35. ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T034505.337.webp +0 -0
  36. ginipick:SORA-3D/extensions/.DS_Store +0 -0
  37. ginipick:SORA-3D/extensions/extensions_nvdiffrast_LICENSE.txt +0 -97
  38. ginipick:SORA-3D/extensions/extensions_nvdiffrast_README.md +0 -42
  39. ginipick:SORA-3D/extensions/extensions_nvdiffrast_run_sample.sh +0 -52
  40. ginipick:SORA-3D/extensions/extensions_nvdiffrast_setup copy.py +0 -51
  41. ginipick:SORA-3D/extensions/extensions_nvdiffrast_setup.py +0 -82
  42. ginipick:SORA-3D/extensions/nvdiffrast/.DS_Store +0 -0
  43. ginipick:SORA-3D/extensions/nvdiffrast/common/.DS_Store +0 -0
  44. ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/.DS_Store +0 -0
  45. ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/extensions_nvdiffrast_nvdiffrast_common_cudaraster_CudaRaster.hpp +0 -63
  46. ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/.DS_Store +0 -0
  47. ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_BinRaster.inl +0 -423
  48. ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.cpp +0 -94
  49. ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.hpp +0 -55
  50. ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CoarseRaster.inl +0 -730
ginipick:SORA-3D/3d.mp4 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c3282465210bac76f44b605956139679ed774c8bad9be686707d1b770961371
3
- size 21309978
 
 
 
 
ginipick:SORA-3D/README.md DELETED
@@ -1,11 +0,0 @@
1
- ---
2
- title: SORA 3D
3
- emoji: 🏢🏆
4
- colorFrom: indigo
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 4.44.1
8
- app_file: app.py
9
- pinned: false
10
- short_description: Create top-quality 3D(.GLB) models from text or images
11
- ---
 
 
 
 
 
 
 
 
 
 
 
 
ginipick:SORA-3D/app.py DELETED
@@ -1,2 +0,0 @@
1
- import os
2
- exec(os.environ.get('APP'))
 
 
 
ginipick:SORA-3D/assets/.DS_Store DELETED
Binary file (6.15 kB)
 
ginipick:SORA-3D/assets/example_image/.DS_Store DELETED
Binary file (12.3 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T120910.945.webp DELETED
Binary file (93 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133209.680.webp DELETED
Binary file (48.5 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133232.481.webp DELETED
Binary file (63.9 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133327.828.webp DELETED
Binary file (20.7 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133551.674.webp DELETED
Binary file (42.9 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133554.085.webp DELETED
Binary file (32.6 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133942.986.webp DELETED
Binary file (41.9 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133945.143.webp DELETED
Binary file (42.4 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134251.217.webp DELETED
Binary file (14 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134253.975.webp DELETED
Binary file (51 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134602.793.webp DELETED
Binary file (33.3 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134606.919.webp DELETED
Binary file (56.1 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T050638.566.webp DELETED
Binary file (58.6 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T102148.803.webp DELETED
Binary file (11 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T124050.873.webp DELETED
Binary file (58.5 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125348.492.webp DELETED
Binary file (47.6 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125709.810.webp DELETED
Binary file (24.7 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125745.419.webp DELETED
Binary file (38.7 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T131128.626.webp DELETED
Binary file (49.6 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T174905.915.webp DELETED
Binary file (46.2 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184202.582.webp DELETED
Binary file (63.1 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184251.254.webp DELETED

Git LFS Details

  • SHA256: 04a741b7588b46f6f885987fa3330d51f671d7f372eedf3cc007e69fd1a2e3e9
  • Pointer size: 131 Bytes
  • Size of remote file: 113 kB
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184336.200.webp DELETED
Binary file (33.4 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184407.431.webp DELETED
Binary file (95 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184511.907.webp DELETED

Git LFS Details

  • SHA256: f5cbfa61ca24164cafbd695aa6f12b617196a64f913e5c9964fad60a74dedda6
  • Pointer size: 131 Bytes
  • Size of remote file: 101 kB
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184535.205.webp DELETED
Binary file (57.5 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184804.224.webp DELETED
Binary file (88.8 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T033838.708.webp DELETED
Binary file (19 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T034054.527.webp DELETED
Binary file (12.6 kB)
 
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T034505.337.webp DELETED
Binary file (8.18 kB)
 
ginipick:SORA-3D/extensions/.DS_Store DELETED
Binary file (6.15 kB)
 
ginipick:SORA-3D/extensions/extensions_nvdiffrast_LICENSE.txt DELETED
@@ -1,97 +0,0 @@
1
- Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
2
-
3
-
4
- Nvidia Source Code License (1-Way Commercial)
5
-
6
- =======================================================================
7
-
8
- 1. Definitions
9
-
10
- "Licensor" means any person or entity that distributes its Work.
11
-
12
- "Software" means the original work of authorship made available under
13
- this License.
14
-
15
- "Work" means the Software and any additions to or derivative works of
16
- the Software that are made available under this License.
17
-
18
- The terms "reproduce," "reproduction," "derivative works," and
19
- "distribution" have the meaning as provided under U.S. copyright law;
20
- provided, however, that for the purposes of this License, derivative
21
- works shall not include works that remain separable from, or merely
22
- link (or bind by name) to the interfaces of, the Work.
23
-
24
- Works, including the Software, are "made available" under this License
25
- by including in or with the Work either (a) a copyright notice
26
- referencing the applicability of this License to the Work, or (b) a
27
- copy of this License.
28
-
29
- 2. License Grants
30
-
31
- 2.1 Copyright Grant. Subject to the terms and conditions of this
32
- License, each Licensor grants to you a perpetual, worldwide,
33
- non-exclusive, royalty-free, copyright license to reproduce,
34
- prepare derivative works of, publicly display, publicly perform,
35
- sublicense and distribute its Work and any resulting derivative
36
- works in any form.
37
-
38
- 3. Limitations
39
-
40
- 3.1 Redistribution. You may reproduce or distribute the Work only
41
- if (a) you do so under this License, (b) you include a complete
42
- copy of this License with your distribution, and (c) you retain
43
- without modification any copyright, patent, trademark, or
44
- attribution notices that are present in the Work.
45
-
46
- 3.2 Derivative Works. You may specify that additional or different
47
- terms apply to the use, reproduction, and distribution of your
48
- derivative works of the Work ("Your Terms") only if (a) Your Terms
49
- provide that the use limitation in Section 3.3 applies to your
50
- derivative works, and (b) you identify the specific derivative
51
- works that are subject to Your Terms. Notwithstanding Your Terms,
52
- this License (including the redistribution requirements in Section
53
- 3.1) will continue to apply to the Work itself.
54
-
55
- 3.3 Use Limitation. The Work and any derivative works thereof only
56
- may be used or intended for use non-commercially. The Work or
57
- derivative works thereof may be used or intended for use by Nvidia
58
- or its affiliates commercially or non-commercially. As used herein,
59
- "non-commercially" means for research or evaluation purposes only
60
- and not for any direct or indirect monetary gain.
61
-
62
- 3.4 Patent Claims. If you bring or threaten to bring a patent claim
63
- against any Licensor (including any claim, cross-claim or
64
- counterclaim in a lawsuit) to enforce any patents that you allege
65
- are infringed by any Work, then your rights under this License from
66
- such Licensor (including the grant in Section 2.1) will terminate
67
- immediately.
68
-
69
- 3.5 Trademarks. This License does not grant any rights to use any
70
- Licensor's or its affiliates' names, logos, or trademarks, except
71
- as necessary to reproduce the notices described in this License.
72
-
73
- 3.6 Termination. If you violate any term of this License, then your
74
- rights under this License (including the grant in Section 2.1) will
75
- terminate immediately.
76
-
77
- 4. Disclaimer of Warranty.
78
-
79
- THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
80
- KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
81
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
82
- NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
83
- THIS LICENSE.
84
-
85
- 5. Limitation of Liability.
86
-
87
- EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
88
- THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
89
- SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
90
- INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
91
- OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
92
- (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
93
- LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
94
- COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
95
- THE POSSIBILITY OF SUCH DAMAGES.
96
-
97
- =======================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ginipick:SORA-3D/extensions/extensions_nvdiffrast_README.md DELETED
@@ -1,42 +0,0 @@
1
- ## Nvdiffrast – Modular Primitives for High-Performance Differentiable Rendering
2
-
3
- ![Teaser image](./docs/img/teaser.png)
4
-
5
- **Modular Primitives for High-Performance Differentiable Rendering**<br>
6
- Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila<br>
7
- [http://arxiv.org/abs/2011.03277](http://arxiv.org/abs/2011.03277)
8
-
9
- Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering.
10
- Please refer to &#x261E;&#x261E; [nvdiffrast documentation](https://nvlabs.github.io/nvdiffrast) &#x261C;&#x261C; for more information.
11
-
12
- ## Licenses
13
-
14
- Copyright &copy; 2020&ndash;2024, NVIDIA Corporation. All rights reserved.
15
-
16
- This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt).
17
-
18
- For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/)
19
-
20
- We do not currently accept outside code contributions in the form of pull requests.
21
-
22
- Environment map stored as part of `samples/data/envphong.npz` is derived from a Wave Engine
23
- [sample material](https://github.com/WaveEngine/Samples-2.5/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap)
24
- originally shared under
25
- [MIT License](https://github.com/WaveEngine/Samples-2.5/blob/master/LICENSE.md).
26
- Mesh and texture stored as part of `samples/data/earth.npz` are derived from
27
- [3D Earth Photorealistic 2K](https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125)
28
- model originally made available under
29
- [TurboSquid 3D Model License](https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license).
30
-
31
- ## Citation
32
-
33
- ```
34
- @article{Laine2020diffrast,
35
- title = {Modular Primitives for High-Performance Differentiable Rendering},
36
- author = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila},
37
- journal = {ACM Transactions on Graphics},
38
- year = {2020},
39
- volume = {39},
40
- number = {6}
41
- }
42
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ginipick:SORA-3D/extensions/extensions_nvdiffrast_run_sample.sh DELETED
@@ -1,52 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
4
- #
5
- # NVIDIA CORPORATION and its licensors retain all intellectual property
6
- # and proprietary rights in and to this software, related documentation
7
- # and any modifications thereto. Any use, reproduction, disclosure or
8
- # distribution of this software and related documentation without an express
9
- # license agreement from NVIDIA CORPORATION is strictly prohibited.
10
-
11
- function print_help {
12
- echo "Usage: `basename $0` [--build-container] <python_file>"
13
- echo ""
14
- echo "Option --build-container will build the Docker container based on"
15
- echo "docker/Dockerfile and tag the image with gltorch:latest."
16
- echo ""
17
- echo "Example: `basename $0` samples/torch/envphong.py"
18
- }
19
-
20
- build_container=0
21
- sample=""
22
- while [[ "$#" -gt 0 ]]; do
23
- case $1 in
24
- --build-container) build_container=1;;
25
- -h|--help) print_help; exit 0 ;;
26
- --*) echo "Unknown parameter passed: $1"; exit 1 ;;
27
- *) sample="$1"; shift; break;
28
- esac
29
- shift
30
- done
31
-
32
- rest=$@
33
-
34
- # Build the docker container
35
- if [ "$build_container" = "1" ]; then
36
- docker build --tag gltorch:latest -f docker/Dockerfile .
37
- fi
38
-
39
- if [ ! -f "$sample" ]; then
40
- echo
41
- echo "No python sample given or file '$sample' not found. Exiting."
42
- exit 1
43
- fi
44
-
45
- image="gltorch:latest"
46
-
47
- echo "Using container image: $image"
48
- echo "Running command: $sample $rest"
49
-
50
- # Run a sample with docker
51
- docker run --rm -it --gpus all --user $(id -u):$(id -g) \
52
- -v `pwd`:/app --workdir /app -e TORCH_EXTENSIONS_DIR=/app/tmp $image python3 $sample $rest
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ginipick:SORA-3D/extensions/extensions_nvdiffrast_setup copy.py DELETED
@@ -1,51 +0,0 @@
1
- # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
- #
3
- # NVIDIA CORPORATION and its licensors retain all intellectual property
4
- # and proprietary rights in and to this software, related documentation
5
- # and any modifications thereto. Any use, reproduction, disclosure or
6
- # distribution of this software and related documentation without an express
7
- # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- import nvdiffrast
10
- import setuptools
11
- import os
12
-
13
- with open("README.md", "r") as fh:
14
- long_description = fh.read()
15
-
16
- setuptools.setup(
17
- name="nvdiffrast",
18
- version=nvdiffrast.__version__,
19
- author="Samuli Laine",
20
- author_email="slaine@nvidia.com",
21
- description="nvdiffrast - modular primitives for high-performance differentiable rendering",
22
- long_description=long_description,
23
- long_description_content_type="text/markdown",
24
- url="https://github.com/NVlabs/nvdiffrast",
25
- packages=setuptools.find_packages(),
26
- package_data={
27
- 'nvdiffrast': [
28
- 'common/*.h',
29
- 'common/*.inl',
30
- 'common/*.cu',
31
- 'common/*.cpp',
32
- 'common/cudaraster/*.hpp',
33
- 'common/cudaraster/impl/*.cpp',
34
- 'common/cudaraster/impl/*.hpp',
35
- 'common/cudaraster/impl/*.inl',
36
- 'common/cudaraster/impl/*.cu',
37
- 'lib/*.h',
38
- 'torch/*.h',
39
- 'torch/*.inl',
40
- 'torch/*.cpp',
41
- 'tensorflow/*.cu',
42
- ] + (['lib/*.lib'] if os.name == 'nt' else [])
43
- },
44
- include_package_data=True,
45
- install_requires=['numpy'], # note: can't require torch here as it will install torch even for a TensorFlow container
46
- classifiers=[
47
- "Programming Language :: Python :: 3",
48
- "Operating System :: OS Independent",
49
- ],
50
- python_requires='>=3.6',
51
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ginipick:SORA-3D/extensions/extensions_nvdiffrast_setup.py DELETED
@@ -1,82 +0,0 @@
1
- # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
- #
3
- # NVIDIA CORPORATION and its licensors retain all intellectual property
4
- # and proprietary rights in and to this software, related documentation
5
- # and any modifications thereto. Any use, reproduction, disclosure or
6
- # distribution of this software and related documentation without an express
7
- # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- import nvdiffrast
10
- import setuptools
11
- import os
12
- from torch.utils.cpp_extension import CUDAExtension, BuildExtension
13
-
14
-
15
- with open("README.md", "r") as fh:
16
- long_description = fh.read()
17
-
18
- setuptools.setup(
19
- name="nvdiffrast",
20
- version=nvdiffrast.__version__,
21
- author="Samuli Laine",
22
- author_email="slaine@nvidia.com",
23
- description="nvdiffrast - modular primitives for high-performance differentiable rendering",
24
- long_description=long_description,
25
- long_description_content_type="text/markdown",
26
- url="https://github.com/NVlabs/nvdiffrast",
27
- packages=setuptools.find_packages(),
28
- # package_data={
29
- # 'nvdiffrast': [
30
- # 'common/*.h',
31
- # 'common/*.inl',
32
- # 'common/*.cu',
33
- # 'common/*.cpp',
34
- # 'common/cudaraster/*.hpp',
35
- # 'common/cudaraster/impl/*.cpp',
36
- # 'common/cudaraster/impl/*.hpp',
37
- # 'common/cudaraster/impl/*.inl',
38
- # 'common/cudaraster/impl/*.cu',
39
- # 'lib/*.h',
40
- # 'torch/*.h',
41
- # 'torch/*.inl',
42
- # 'torch/*.cpp',
43
- # 'tensorflow/*.cu',
44
- # ] + (['lib/*.lib'] if os.name == 'nt' else [])
45
- # },
46
- # include_package_data=True,
47
- ext_modules=[
48
- CUDAExtension(
49
- name="nvdiffrast.torch._C",
50
- sources=[
51
- 'nvdiffrast/common/cudaraster/impl/Buffer.cpp',
52
- 'nvdiffrast/common/cudaraster/impl/CudaRaster.cpp',
53
- 'nvdiffrast/common/cudaraster/impl/RasterImpl_.cu',
54
- 'nvdiffrast/common/cudaraster/impl/RasterImpl.cpp',
55
- 'nvdiffrast/common/common.cpp',
56
- 'nvdiffrast/common/rasterize.cu',
57
- 'nvdiffrast/common/interpolate.cu',
58
- 'nvdiffrast/common/texture_.cu',
59
- 'nvdiffrast/common/texture.cpp',
60
- 'nvdiffrast/common/antialias.cu',
61
- 'nvdiffrast/torch/torch_bindings.cpp',
62
- 'nvdiffrast/torch/torch_rasterize.cpp',
63
- 'nvdiffrast/torch/torch_interpolate.cpp',
64
- 'nvdiffrast/torch/torch_texture.cpp',
65
- 'nvdiffrast/torch/torch_antialias.cpp',
66
- ],
67
- extra_compile_args={
68
- 'cxx': ['-DNVDR_TORCH'],
69
- 'nvcc': ['-DNVDR_TORCH', '-lineinfo'],
70
- },
71
- )
72
- ],
73
- cmdclass={
74
- 'build_ext': BuildExtension
75
- },
76
- install_requires=['numpy'], # note: can't require torch here as it will install torch even for a TensorFlow container
77
- classifiers=[
78
- "Programming Language :: Python :: 3",
79
- "Operating System :: OS Independent",
80
- ],
81
- python_requires='>=3.6',
82
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ginipick:SORA-3D/extensions/nvdiffrast/.DS_Store DELETED
Binary file (8.2 kB)
 
ginipick:SORA-3D/extensions/nvdiffrast/common/.DS_Store DELETED
Binary file (10.2 kB)
 
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/.DS_Store DELETED
Binary file (6.15 kB)
 
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/extensions_nvdiffrast_nvdiffrast_common_cudaraster_CudaRaster.hpp DELETED
@@ -1,63 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #pragma once
10
-
11
- //------------------------------------------------------------------------
12
- // This is a slimmed-down and modernized version of the original
13
- // CudaRaster codebase that accompanied the HPG 2011 paper
14
- // "High-Performance Software Rasterization on GPUs" by Laine and Karras.
15
- // Modifications have been made to accommodate post-Volta execution model
16
- // with warp divergence. Support for shading, blending, quad rendering,
17
- // and supersampling have been removed as unnecessary for nvdiffrast.
18
- //------------------------------------------------------------------------
19
-
20
- namespace CR
21
- {
22
-
23
- class RasterImpl;
24
-
25
- //------------------------------------------------------------------------
26
- // Interface class to isolate user from implementation details.
27
- //------------------------------------------------------------------------
28
-
29
- class CudaRaster
30
- {
31
- public:
32
- enum
33
- {
34
- RenderModeFlag_EnableBackfaceCulling = 1 << 0, // Enable backface culling.
35
- RenderModeFlag_EnableDepthPeeling = 1 << 1, // Enable depth peeling. Must have a peel buffer set.
36
- };
37
-
38
- public:
39
- CudaRaster (void);
40
- ~CudaRaster (void);
41
-
42
- void setBufferSize (int width, int height, int numImages); // Width and height are internally rounded up to multiples of tile size (8x8) for buffer sizes.
43
- void setViewport (int width, int height, int offsetX, int offsetY); // Tiled rendering viewport setup.
44
- void setRenderModeFlags (unsigned int renderModeFlags); // Affects all subsequent calls to drawTriangles(). Defaults to zero.
45
- void deferredClear (unsigned int clearColor); // Clears color and depth buffers during next call to drawTriangles().
46
- void setVertexBuffer (void* vertices, int numVertices); // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w).
47
- void setIndexBuffer (void* indices, int numTriangles); // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
48
- bool drawTriangles (const int* ranges, bool peel, cudaStream_t stream); // Ranges (offsets and counts) as #triangles entries, not as bytes. If NULL, draw all triangles. Returns false in case of internal overflow.
49
- void* getColorBuffer (void); // GPU pointer managed by CudaRaster.
50
- void* getDepthBuffer (void); // GPU pointer managed by CudaRaster.
51
- void swapDepthAndPeel (void); // Swap depth and peeling buffers.
52
-
53
- private:
54
- CudaRaster (const CudaRaster&); // forbidden
55
- CudaRaster& operator= (const CudaRaster&); // forbidden
56
-
57
- private:
58
- RasterImpl* m_impl; // Opaque pointer to implementation.
59
- };
60
-
61
- //------------------------------------------------------------------------
62
- } // namespace CR
63
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/.DS_Store DELETED
Binary file (10.2 kB)
 
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_BinRaster.inl DELETED
@@ -1,423 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- //------------------------------------------------------------------------
10
-
11
- __device__ __inline__ void binRasterImpl(const CRParams p)
12
- {
13
- __shared__ volatile U32 s_broadcast [CR_BIN_WARPS + 16];
14
- __shared__ volatile S32 s_outOfs [CR_MAXBINS_SQR];
15
- __shared__ volatile S32 s_outTotal [CR_MAXBINS_SQR];
16
- __shared__ volatile S32 s_overIndex [CR_MAXBINS_SQR];
17
- __shared__ volatile S32 s_outMask [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
18
- __shared__ volatile S32 s_outCount [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
19
- __shared__ volatile S32 s_triBuf [CR_BIN_WARPS*32*4]; // triangle ring buffer
20
- __shared__ volatile U32 s_batchPos;
21
- __shared__ volatile U32 s_bufCount;
22
- __shared__ volatile U32 s_overTotal;
23
- __shared__ volatile U32 s_allocBase;
24
-
25
- const CRImageParams& ip = getImageParams(p, blockIdx.z);
26
- CRAtomics& atomics = p.atomics[blockIdx.z];
27
- const U8* triSubtris = (const U8*)p.triSubtris + p.maxSubtris * blockIdx.z;
28
- const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
29
-
30
- S32* binFirstSeg = (S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
31
- S32* binTotal = (S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
32
- S32* binSegData = (S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
33
- S32* binSegNext = (S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
34
- S32* binSegCount = (S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
35
-
36
- if (atomics.numSubtris > p.maxSubtris)
37
- return;
38
-
39
- // per-thread state
40
- int thrInBlock = threadIdx.x + threadIdx.y * 32;
41
- int batchPos = 0;
42
-
43
- // first 16 elements of s_broadcast are always zero
44
- if (thrInBlock < 16)
45
- s_broadcast[thrInBlock] = 0;
46
-
47
- // initialize output linked lists and offsets
48
- if (thrInBlock < p.numBins)
49
- {
50
- binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = -1;
51
- s_outOfs[thrInBlock] = -CR_BIN_SEG_SIZE;
52
- s_outTotal[thrInBlock] = 0;
53
- }
54
-
55
- // repeat until done
56
- for(;;)
57
- {
58
- // get batch
59
- if (thrInBlock == 0)
60
- s_batchPos = atomicAdd(&atomics.binCounter, ip.binBatchSize);
61
- __syncthreads();
62
- batchPos = s_batchPos;
63
-
64
- // all batches done?
65
- if (batchPos >= ip.triCount)
66
- break;
67
-
68
- // per-thread state
69
- int bufIndex = 0;
70
- int bufCount = 0;
71
- int batchEnd = min(batchPos + ip.binBatchSize, ip.triCount);
72
-
73
- // loop over batch as long as we have triangles in it
74
- do
75
- {
76
- // read more triangles
77
- while (bufCount < CR_BIN_WARPS*32 && batchPos < batchEnd)
78
- {
79
- // get subtriangle count
80
-
81
- int triIdx = batchPos + thrInBlock;
82
- int num = 0;
83
- if (triIdx < batchEnd)
84
- num = triSubtris[triIdx];
85
-
86
- // cumulative sum of subtriangles within each warp
87
- U32 myIdx = __popc(__ballot_sync(~0u, num & 1) & getLaneMaskLt());
88
- if (__any_sync(~0u, num > 1))
89
- {
90
- myIdx += __popc(__ballot_sync(~0u, num & 2) & getLaneMaskLt()) * 2;
91
- myIdx += __popc(__ballot_sync(~0u, num & 4) & getLaneMaskLt()) * 4;
92
- }
93
- if (threadIdx.x == 31) // Do not assume that last thread in warp wins the write.
94
- s_broadcast[threadIdx.y + 16] = myIdx + num;
95
- __syncthreads();
96
-
97
- // cumulative sum of per-warp subtriangle counts
98
- // Note: cannot have more than 32 warps or this needs to sync between each step.
99
- bool act = (thrInBlock < CR_BIN_WARPS);
100
- U32 actMask = __ballot_sync(~0u, act);
101
- if (threadIdx.y == 0 && act)
102
- {
103
- volatile U32* ptr = &s_broadcast[thrInBlock + 16];
104
- U32 val = *ptr;
105
- #if (CR_BIN_WARPS > 1)
106
- val += ptr[-1]; __syncwarp(actMask);
107
- *ptr = val; __syncwarp(actMask);
108
- #endif
109
- #if (CR_BIN_WARPS > 2)
110
- val += ptr[-2]; __syncwarp(actMask);
111
- *ptr = val; __syncwarp(actMask);
112
- #endif
113
- #if (CR_BIN_WARPS > 4)
114
- val += ptr[-4]; __syncwarp(actMask);
115
- *ptr = val; __syncwarp(actMask);
116
- #endif
117
- #if (CR_BIN_WARPS > 8)
118
- val += ptr[-8]; __syncwarp(actMask);
119
- *ptr = val; __syncwarp(actMask);
120
- #endif
121
- #if (CR_BIN_WARPS > 16)
122
- val += ptr[-16]; __syncwarp(actMask);
123
- *ptr = val; __syncwarp(actMask);
124
- #endif
125
-
126
- // initially assume that we consume everything
127
- // only last active thread does the writes
128
- if (threadIdx.x == CR_BIN_WARPS - 1)
129
- {
130
- s_batchPos = batchPos + CR_BIN_WARPS * 32;
131
- s_bufCount = bufCount + val;
132
- }
133
- }
134
- __syncthreads();
135
-
136
- // skip if no subtriangles
137
- if (num)
138
- {
139
- // calculate write position for first subtriangle
140
- U32 pos = bufCount + myIdx + s_broadcast[threadIdx.y + 16 - 1];
141
-
142
- // only write if entire triangle fits
143
- if (pos + num <= CR_ARRAY_SIZE(s_triBuf))
144
- {
145
- pos += bufIndex; // adjust for current start position
146
- pos &= CR_ARRAY_SIZE(s_triBuf)-1;
147
- if (num == 1)
148
- s_triBuf[pos] = triIdx * 8 + 7; // single triangle
149
- else
150
- {
151
- for (int i=0; i < num; i++)
152
- {
153
- s_triBuf[pos] = triIdx * 8 + i;
154
- pos++;
155
- pos &= CR_ARRAY_SIZE(s_triBuf)-1;
156
- }
157
- }
158
- } else if (pos <= CR_ARRAY_SIZE(s_triBuf))
159
- {
160
- // this triangle is the first that failed, overwrite total count and triangle count
161
- s_batchPos = batchPos + thrInBlock;
162
- s_bufCount = pos;
163
- }
164
- }
165
-
166
- // update triangle counts
167
- __syncthreads();
168
- batchPos = s_batchPos;
169
- bufCount = s_bufCount;
170
- }
171
-
172
- // make every warp clear its output buffers
173
- for (int i=threadIdx.x; i < p.numBins; i += 32)
174
- s_outMask[threadIdx.y][i] = 0;
175
- __syncwarp();
176
-
177
- // choose our triangle
178
- uint4 triData = make_uint4(0, 0, 0, 0);
179
- if (thrInBlock < bufCount)
180
- {
181
- U32 triPos = bufIndex + thrInBlock;
182
- triPos &= CR_ARRAY_SIZE(s_triBuf)-1;
183
-
184
- // find triangle
185
- int triIdx = s_triBuf[triPos];
186
- int dataIdx = triIdx >> 3;
187
- int subtriIdx = triIdx & 7;
188
- if (subtriIdx != 7)
189
- dataIdx = triHeader[dataIdx].misc + subtriIdx;
190
-
191
- // read triangle
192
-
193
- triData = *(((const uint4*)triHeader) + dataIdx);
194
- }
195
-
196
- // setup bounding box and edge functions, and rasterize
197
- S32 lox, loy, hix, hiy;
198
- bool hasTri = (thrInBlock < bufCount);
199
- U32 hasTriMask = __ballot_sync(~0u, hasTri);
200
- if (hasTri)
201
- {
202
- S32 v0x = add_s16lo_s16lo(triData.x, p.widthPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
203
- S32 v0y = add_s16hi_s16lo(triData.x, p.heightPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
204
- S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
205
- S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
206
- S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
207
- S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
208
- int binLog = CR_BIN_LOG2 + CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
209
- lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
210
- loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
211
- hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
212
- hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
213
-
214
- U32 bit = 1 << threadIdx.x;
215
- #if __CUDA_ARCH__ >= 700
216
- bool multi = (hix != lox || hiy != loy);
217
- if (!__any_sync(hasTriMask, multi))
218
- {
219
- int binIdx = lox + p.widthBins * loy;
220
- U32 mask = __match_any_sync(hasTriMask, binIdx);
221
- s_outMask[threadIdx.y][binIdx] = mask;
222
- __syncwarp(hasTriMask);
223
- } else
224
- #endif
225
- {
226
- bool complex = (hix > lox+1 || hiy > loy+1);
227
- if (!__any_sync(hasTriMask, complex))
228
- {
229
- int binIdx = lox + p.widthBins * loy;
230
- atomicOr((U32*)&s_outMask[threadIdx.y][binIdx], bit);
231
- if (hix > lox) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + 1], bit);
232
- if (hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins], bit);
233
- if (hix > lox && hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins + 1], bit);
234
- } else
235
- {
236
- S32 d12x = d02x - d01x, d12y = d02y - d01y;
237
- v0x -= lox << binLog, v0y -= loy << binLog;
238
-
239
- S32 t01 = v0x * d01y - v0y * d01x;
240
- S32 t02 = v0y * d02x - v0x * d02y;
241
- S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
242
- S32 b01 = add_sub(t01 >> binLog, max(d01x, 0), min(d01y, 0));
243
- S32 b02 = add_sub(t02 >> binLog, max(d02y, 0), min(d02x, 0));
244
- S32 b12 = add_sub(t12 >> binLog, max(d12x, 0), min(d12y, 0));
245
-
246
- int width = hix - lox + 1;
247
- d01x += width * d01y;
248
- d02x += width * d02y;
249
- d12x += width * d12y;
250
-
251
- U8* currPtr = (U8*)&s_outMask[threadIdx.y][lox + loy * p.widthBins];
252
- U8* skipPtr = (U8*)&s_outMask[threadIdx.y][(hix + 1) + loy * p.widthBins];
253
- U8* endPtr = (U8*)&s_outMask[threadIdx.y][lox + (hiy + 1) * p.widthBins];
254
- int stride = p.widthBins * 4;
255
- int ptrYInc = stride - width * 4;
256
-
257
- do
258
- {
259
- if (b01 >= 0 && b02 >= 0 && b12 >= 0)
260
- atomicOr((U32*)currPtr, bit);
261
- currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
262
- if (currPtr == skipPtr)
263
- currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += stride;
264
- }
265
- while (currPtr != endPtr);
266
- }
267
- }
268
- }
269
-
270
- // count per-bin contributions
271
- if (thrInBlock == 0)
272
- s_overTotal = 0; // overflow counter
273
-
274
- // ensure that out masks are done
275
- __syncthreads();
276
-
277
- int overIndex = -1;
278
- bool act = (thrInBlock < p.numBins);
279
- U32 actMask = __ballot_sync(~0u, act);
280
- if (act)
281
- {
282
- U8* srcPtr = (U8*)&s_outMask[0][thrInBlock];
283
- U8* dstPtr = (U8*)&s_outCount[0][thrInBlock];
284
- int total = 0;
285
- for (int i = 0; i < CR_BIN_WARPS; i++)
286
- {
287
- total += __popc(*(U32*)srcPtr);
288
- *(U32*)dstPtr = total;
289
- srcPtr += (CR_MAXBINS_SQR + 1) * 4;
290
- dstPtr += (CR_MAXBINS_SQR + 1) * 4;
291
- }
292
-
293
- // overflow => request a new segment
294
- int ofs = s_outOfs[thrInBlock];
295
- bool ovr = (((ofs - 1) >> CR_BIN_SEG_LOG2) != (((ofs - 1) + total) >> CR_BIN_SEG_LOG2));
296
- U32 ovrMask = __ballot_sync(actMask, ovr);
297
- if (ovr)
298
- {
299
- overIndex = __popc(ovrMask & getLaneMaskLt());
300
- if (overIndex == 0)
301
- s_broadcast[threadIdx.y + 16] = atomicAdd((U32*)&s_overTotal, __popc(ovrMask));
302
- __syncwarp(ovrMask);
303
- overIndex += s_broadcast[threadIdx.y + 16];
304
- s_overIndex[thrInBlock] = overIndex;
305
- }
306
- }
307
-
308
- // sync after overTotal is ready
309
- __syncthreads();
310
-
311
- // at least one segment overflowed => allocate segments
312
- U32 overTotal = s_overTotal;
313
- U32 allocBase = 0;
314
- if (overTotal > 0)
315
- {
316
- // allocate memory
317
- if (thrInBlock == 0)
318
- {
319
- U32 allocBase = atomicAdd(&atomics.numBinSegs, overTotal);
320
- s_allocBase = (allocBase + overTotal <= p.maxBinSegs) ? allocBase : 0;
321
- }
322
- __syncthreads();
323
- allocBase = s_allocBase;
324
-
325
- // did my bin overflow?
326
- if (overIndex != -1)
327
- {
328
- // calculate new segment index
329
- int segIdx = allocBase + overIndex;
330
-
331
- // add to linked list
332
- if (s_outOfs[thrInBlock] < 0)
333
- binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = segIdx;
334
- else
335
- binSegNext[(s_outOfs[thrInBlock] - 1) >> CR_BIN_SEG_LOG2] = segIdx;
336
-
337
- // defaults
338
- binSegNext [segIdx] = -1;
339
- binSegCount[segIdx] = CR_BIN_SEG_SIZE;
340
- }
341
- }
342
-
343
- // concurrent emission -- each warp handles its own triangle
344
- if (thrInBlock < bufCount)
345
- {
346
- int triPos = (bufIndex + thrInBlock) & (CR_ARRAY_SIZE(s_triBuf) - 1);
347
- int currBin = lox + loy * p.widthBins;
348
- int skipBin = (hix + 1) + loy * p.widthBins;
349
- int endBin = lox + (hiy + 1) * p.widthBins;
350
- int binYInc = p.widthBins - (hix - lox + 1);
351
-
352
- // loop over triangle's bins
353
- do
354
- {
355
- U32 outMask = s_outMask[threadIdx.y][currBin];
356
- if (outMask & (1<<threadIdx.x))
357
- {
358
- int idx = __popc(outMask & getLaneMaskLt());
359
- if (threadIdx.y > 0)
360
- idx += s_outCount[threadIdx.y-1][currBin];
361
-
362
- int base = s_outOfs[currBin];
363
- int free = (-base) & (CR_BIN_SEG_SIZE - 1);
364
- if (idx >= free)
365
- idx += ((allocBase + s_overIndex[currBin]) << CR_BIN_SEG_LOG2) - free;
366
- else
367
- idx += base;
368
-
369
- binSegData[idx] = s_triBuf[triPos];
370
- }
371
-
372
- currBin++;
373
- if (currBin == skipBin)
374
- currBin += binYInc, skipBin += p.widthBins;
375
- }
376
- while (currBin != endBin);
377
- }
378
-
379
- // wait all triangles to finish, then replace overflown segment offsets
380
- __syncthreads();
381
- if (thrInBlock < p.numBins)
382
- {
383
- U32 total = s_outCount[CR_BIN_WARPS - 1][thrInBlock];
384
- U32 oldOfs = s_outOfs[thrInBlock];
385
- if (overIndex == -1)
386
- s_outOfs[thrInBlock] = oldOfs + total;
387
- else
388
- {
389
- int addr = oldOfs + total;
390
- addr = ((addr - 1) & (CR_BIN_SEG_SIZE - 1)) + 1;
391
- addr += (allocBase + overIndex) << CR_BIN_SEG_LOG2;
392
- s_outOfs[thrInBlock] = addr;
393
- }
394
- s_outTotal[thrInBlock] += total;
395
- }
396
-
397
- // these triangles are now done
398
- int count = ::min(bufCount, CR_BIN_WARPS * 32);
399
- bufCount -= count;
400
- bufIndex += count;
401
- bufIndex &= CR_ARRAY_SIZE(s_triBuf)-1;
402
- }
403
- while (bufCount > 0 || batchPos < batchEnd);
404
-
405
- // flush all bins
406
- if (thrInBlock < p.numBins)
407
- {
408
- int ofs = s_outOfs[thrInBlock];
409
- if (ofs & (CR_BIN_SEG_SIZE-1))
410
- {
411
- int seg = ofs >> CR_BIN_SEG_LOG2;
412
- binSegCount[seg] = ofs & (CR_BIN_SEG_SIZE-1);
413
- s_outOfs[thrInBlock] = (ofs + CR_BIN_SEG_SIZE - 1) & -CR_BIN_SEG_SIZE;
414
- }
415
- }
416
- }
417
-
418
- // output totals
419
- if (thrInBlock < p.numBins)
420
- binTotal[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = s_outTotal[thrInBlock];
421
- }
422
-
423
- //------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.cpp DELETED
@@ -1,94 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #include "../../framework.h"
10
- #include "Buffer.hpp"
11
-
12
- using namespace CR;
13
-
14
- //------------------------------------------------------------------------
15
- // GPU buffer.
16
- //------------------------------------------------------------------------
17
-
18
- Buffer::Buffer(void)
19
- : m_gpuPtr(NULL),
20
- m_bytes (0)
21
- {
22
- // empty
23
- }
24
-
25
- Buffer::~Buffer(void)
26
- {
27
- if (m_gpuPtr)
28
- cudaFree(m_gpuPtr); // Don't throw an exception.
29
- }
30
-
31
- void Buffer::reset(size_t bytes)
32
- {
33
- if (bytes == m_bytes)
34
- return;
35
-
36
- if (m_gpuPtr)
37
- {
38
- NVDR_CHECK_CUDA_ERROR(cudaFree(m_gpuPtr));
39
- m_gpuPtr = NULL;
40
- }
41
-
42
- if (bytes > 0)
43
- NVDR_CHECK_CUDA_ERROR(cudaMalloc(&m_gpuPtr, bytes));
44
-
45
- m_bytes = bytes;
46
- }
47
-
48
- void Buffer::grow(size_t bytes)
49
- {
50
- if (bytes > m_bytes)
51
- reset(bytes);
52
- }
53
-
54
- //------------------------------------------------------------------------
55
- // Host buffer with page-locked memory.
56
- //------------------------------------------------------------------------
57
-
58
- HostBuffer::HostBuffer(void)
59
- : m_hostPtr(NULL),
60
- m_bytes (0)
61
- {
62
- // empty
63
- }
64
-
65
- HostBuffer::~HostBuffer(void)
66
- {
67
- if (m_hostPtr)
68
- cudaFreeHost(m_hostPtr); // Don't throw an exception.
69
- }
70
-
71
- void HostBuffer::reset(size_t bytes)
72
- {
73
- if (bytes == m_bytes)
74
- return;
75
-
76
- if (m_hostPtr)
77
- {
78
- NVDR_CHECK_CUDA_ERROR(cudaFreeHost(m_hostPtr));
79
- m_hostPtr = NULL;
80
- }
81
-
82
- if (bytes > 0)
83
- NVDR_CHECK_CUDA_ERROR(cudaMallocHost(&m_hostPtr, bytes));
84
-
85
- m_bytes = bytes;
86
- }
87
-
88
- void HostBuffer::grow(size_t bytes)
89
- {
90
- if (bytes > m_bytes)
91
- reset(bytes);
92
- }
93
-
94
- //------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.hpp DELETED
@@ -1,55 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- #pragma once
10
- #include "Defs.hpp"
11
-
12
- namespace CR
13
- {
14
- //------------------------------------------------------------------------
15
-
16
- class Buffer
17
- {
18
- public:
19
- Buffer (void);
20
- ~Buffer (void);
21
-
22
- void reset (size_t bytes);
23
- void grow (size_t bytes);
24
- void* getPtr (size_t offset = 0) { return (void*)(((uintptr_t)m_gpuPtr) + offset); }
25
- size_t getSize (void) const { return m_bytes; }
26
-
27
- void setPtr (void* ptr) { m_gpuPtr = ptr; }
28
-
29
- private:
30
- void* m_gpuPtr;
31
- size_t m_bytes;
32
- };
33
-
34
- //------------------------------------------------------------------------
35
-
36
- class HostBuffer
37
- {
38
- public:
39
- HostBuffer (void);
40
- ~HostBuffer (void);
41
-
42
- void reset (size_t bytes);
43
- void grow (size_t bytes);
44
- void* getPtr (void) { return m_hostPtr; }
45
- size_t getSize (void) const { return m_bytes; }
46
-
47
- void setPtr (void* ptr) { m_hostPtr = ptr; }
48
-
49
- private:
50
- void* m_hostPtr;
51
- size_t m_bytes;
52
- };
53
-
54
- //------------------------------------------------------------------------
55
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CoarseRaster.inl DELETED
@@ -1,730 +0,0 @@
1
- // Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
2
- //
3
- // NVIDIA CORPORATION and its licensors retain all intellectual property
4
- // and proprietary rights in and to this software, related documentation
5
- // and any modifications thereto. Any use, reproduction, disclosure or
6
- // distribution of this software and related documentation without an express
7
- // license agreement from NVIDIA CORPORATION is strictly prohibited.
8
-
9
- //------------------------------------------------------------------------
10
-
11
- __device__ __inline__ int globalTileIdx(int tileInBin, int widthTiles)
12
- {
13
- int tileX = tileInBin & (CR_BIN_SIZE - 1);
14
- int tileY = tileInBin >> CR_BIN_LOG2;
15
- return tileX + tileY * widthTiles;
16
- }
17
-
18
- //------------------------------------------------------------------------
19
-
20
- __device__ __inline__ void coarseRasterImpl(const CRParams p)
21
- {
22
- // Common.
23
-
24
- __shared__ volatile U32 s_workCounter;
25
- __shared__ volatile U32 s_scanTemp [CR_COARSE_WARPS][48]; // 3KB
26
-
27
- // Input.
28
-
29
- __shared__ volatile U32 s_binOrder [CR_MAXBINS_SQR]; // 1KB
30
- __shared__ volatile S32 s_binStreamCurrSeg [CR_BIN_STREAMS_SIZE]; // 0KB
31
- __shared__ volatile S32 s_binStreamFirstTri [CR_BIN_STREAMS_SIZE]; // 0KB
32
- __shared__ volatile S32 s_triQueue [CR_COARSE_QUEUE_SIZE]; // 4KB
33
- __shared__ volatile S32 s_triQueueWritePos;
34
- __shared__ volatile U32 s_binStreamSelectedOfs;
35
- __shared__ volatile U32 s_binStreamSelectedSize;
36
-
37
- // Output.
38
-
39
- __shared__ volatile U32 s_warpEmitMask [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions
40
- __shared__ volatile U32 s_warpEmitPrefixSum [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions
41
- __shared__ volatile U32 s_tileEmitPrefixSum [CR_BIN_SQR + 1]; // 1KB, zero at the beginning
42
- __shared__ volatile U32 s_tileAllocPrefixSum[CR_BIN_SQR + 1]; // 1KB, zero at the beginning
43
- __shared__ volatile S32 s_tileStreamCurrOfs [CR_BIN_SQR]; // 1KB
44
- __shared__ volatile U32 s_firstAllocSeg;
45
- __shared__ volatile U32 s_firstActiveIdx;
46
-
47
- // Pointers and constants.
48
-
49
- CRAtomics& atomics = p.atomics[blockIdx.z];
50
- const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
51
- const S32* binFirstSeg = (const S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
52
- const S32* binTotal = (const S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
53
- const S32* binSegData = (const S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
54
- const S32* binSegNext = (const S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
55
- const S32* binSegCount = (const S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
56
- S32* activeTiles = (S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z;
57
- S32* tileFirstSeg = (S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
58
- S32* tileSegData = (S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
59
- S32* tileSegNext = (S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z;
60
- S32* tileSegCount = (S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
61
-
62
- int tileLog = CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
63
- int thrInBlock = threadIdx.x + threadIdx.y * 32;
64
- int emitShift = CR_BIN_LOG2 * 2 + 5; // We scan ((numEmits << emitShift) | numAllocs) over tiles.
65
-
66
- if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs)
67
- return;
68
-
69
- // Initialize sharedmem arrays.
70
-
71
- if (thrInBlock == 0)
72
- {
73
- s_tileEmitPrefixSum[0] = 0;
74
- s_tileAllocPrefixSum[0] = 0;
75
- }
76
- s_scanTemp[threadIdx.y][threadIdx.x] = 0;
77
-
78
- // Sort bins in descending order of triangle count.
79
-
80
- for (int binIdx = thrInBlock; binIdx < p.numBins; binIdx += CR_COARSE_WARPS * 32)
81
- {
82
- int count = 0;
83
- for (int i = 0; i < CR_BIN_STREAMS_SIZE; i++)
84
- count += binTotal[(binIdx << CR_BIN_STREAMS_LOG2) + i];
85
- s_binOrder[binIdx] = (~count << (CR_MAXBINS_LOG2 * 2)) | binIdx;
86
- }
87
-
88
- __syncthreads();
89
- sortShared(s_binOrder, p.numBins);
90
-
91
- // Process each bin by one block.
92
-
93
- for (;;)
94
- {
95
- // Pick a bin for the block.
96
-
97
- if (thrInBlock == 0)
98
- s_workCounter = atomicAdd(&atomics.coarseCounter, 1);
99
- __syncthreads();
100
-
101
- int workCounter = s_workCounter;
102
- if (workCounter >= p.numBins)
103
- break;
104
-
105
- U32 binOrder = s_binOrder[workCounter];
106
- bool binEmpty = ((~binOrder >> (CR_MAXBINS_LOG2 * 2)) == 0);
107
- if (binEmpty && !p.deferredClear)
108
- break;
109
-
110
- int binIdx = binOrder & (CR_MAXBINS_SQR - 1);
111
-
112
- // Initialize input/output streams.
113
-
114
- int triQueueWritePos = 0;
115
- int triQueueReadPos = 0;
116
-
117
- if (thrInBlock < CR_BIN_STREAMS_SIZE)
118
- {
119
- int segIdx = binFirstSeg[(binIdx << CR_BIN_STREAMS_LOG2) + thrInBlock];
120
- s_binStreamCurrSeg[thrInBlock] = segIdx;
121
- s_binStreamFirstTri[thrInBlock] = (segIdx == -1) ? ~0u : binSegData[segIdx << CR_BIN_SEG_LOG2];
122
- }
123
-
124
- for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
125
- s_tileStreamCurrOfs[tileInBin] = -CR_TILE_SEG_SIZE;
126
-
127
- // Initialize per-bin state.
128
-
129
- int binY = idiv_fast(binIdx, p.widthBins);
130
- int binX = binIdx - binY * p.widthBins;
131
- int originX = (binX << (CR_BIN_LOG2 + tileLog)) - (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
132
- int originY = (binY << (CR_BIN_LOG2 + tileLog)) - (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
133
- int maxTileXInBin = ::min(p.widthTiles - (binX << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
134
- int maxTileYInBin = ::min(p.heightTiles - (binY << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
135
- int binTileIdx = (binX + binY * p.widthTiles) << CR_BIN_LOG2;
136
-
137
- // Entire block: Merge input streams and process triangles.
138
-
139
- if (!binEmpty)
140
- do
141
- {
142
- //------------------------------------------------------------------------
143
- // Merge.
144
- //------------------------------------------------------------------------
145
-
146
- // Entire block: Not enough triangles => merge and queue segments.
147
- // NOTE: The bin exit criterion assumes that we queue more triangles than we actually need.
148
-
149
- while (triQueueWritePos - triQueueReadPos <= CR_COARSE_WARPS * 32)
150
- {
151
- // First warp: Choose the segment with the lowest initial triangle index.
152
-
153
- bool hasStream = (thrInBlock < CR_BIN_STREAMS_SIZE);
154
- U32 hasStreamMask = __ballot_sync(~0u, hasStream);
155
- if (hasStream)
156
- {
157
- // Find the stream with the lowest triangle index.
158
-
159
- U32 firstTri = s_binStreamFirstTri[thrInBlock];
160
- U32 t = firstTri;
161
- volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
162
-
163
- #if (CR_BIN_STREAMS_SIZE > 1)
164
- v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-1]); __syncwarp(hasStreamMask);
165
- #endif
166
- #if (CR_BIN_STREAMS_SIZE > 2)
167
- v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-2]); __syncwarp(hasStreamMask);
168
- #endif
169
- #if (CR_BIN_STREAMS_SIZE > 4)
170
- v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-4]); __syncwarp(hasStreamMask);
171
- #endif
172
- #if (CR_BIN_STREAMS_SIZE > 8)
173
- v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-8]); __syncwarp(hasStreamMask);
174
- #endif
175
- #if (CR_BIN_STREAMS_SIZE > 16)
176
- v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-16]); __syncwarp(hasStreamMask);
177
- #endif
178
- v[0] = t; __syncwarp(hasStreamMask);
179
-
180
- // Consume and broadcast.
181
-
182
- bool first = (s_scanTemp[0][CR_BIN_STREAMS_SIZE - 1 + 16] == firstTri);
183
- U32 firstMask = __ballot_sync(hasStreamMask, first);
184
- if (first && (firstMask >> threadIdx.x) == 1u)
185
- {
186
- int segIdx = s_binStreamCurrSeg[thrInBlock];
187
- s_binStreamSelectedOfs = segIdx << CR_BIN_SEG_LOG2;
188
- if (segIdx != -1)
189
- {
190
- int segSize = binSegCount[segIdx];
191
- int segNext = binSegNext[segIdx];
192
- s_binStreamSelectedSize = segSize;
193
- s_triQueueWritePos = triQueueWritePos + segSize;
194
- s_binStreamCurrSeg[thrInBlock] = segNext;
195
- s_binStreamFirstTri[thrInBlock] = (segNext == -1) ? ~0u : binSegData[segNext << CR_BIN_SEG_LOG2];
196
- }
197
- }
198
- }
199
-
200
- // No more segments => break.
201
-
202
- __syncthreads();
203
- triQueueWritePos = s_triQueueWritePos;
204
- int segOfs = s_binStreamSelectedOfs;
205
- if (segOfs < 0)
206
- break;
207
-
208
- int segSize = s_binStreamSelectedSize;
209
- __syncthreads();
210
-
211
- // Fetch triangles into the queue.
212
-
213
- for (int idxInSeg = CR_COARSE_WARPS * 32 - 1 - thrInBlock; idxInSeg < segSize; idxInSeg += CR_COARSE_WARPS * 32)
214
- {
215
- S32 triIdx = binSegData[segOfs + idxInSeg];
216
- s_triQueue[(triQueueWritePos - segSize + idxInSeg) & (CR_COARSE_QUEUE_SIZE - 1)] = triIdx;
217
- }
218
- }
219
-
220
- // All threads: Clear emit masks.
221
-
222
- for (int maskIdx = thrInBlock; maskIdx < CR_COARSE_WARPS * CR_BIN_SQR; maskIdx += CR_COARSE_WARPS * 32)
223
- s_warpEmitMask[maskIdx >> (CR_BIN_LOG2 * 2)][maskIdx & (CR_BIN_SQR - 1)] = 0;
224
-
225
- __syncthreads();
226
-
227
- //------------------------------------------------------------------------
228
- // Raster.
229
- //------------------------------------------------------------------------
230
-
231
- // Triangle per thread: Read from the queue.
232
-
233
- int triIdx = -1;
234
- if (triQueueReadPos + thrInBlock < triQueueWritePos)
235
- triIdx = s_triQueue[(triQueueReadPos + thrInBlock) & (CR_COARSE_QUEUE_SIZE - 1)];
236
-
237
- uint4 triData = make_uint4(0, 0, 0, 0);
238
- if (triIdx != -1)
239
- {
240
- int dataIdx = triIdx >> 3;
241
- int subtriIdx = triIdx & 7;
242
- if (subtriIdx != 7)
243
- dataIdx = triHeader[dataIdx].misc + subtriIdx;
244
- triData = *((uint4*)triHeader + dataIdx);
245
- }
246
-
247
- // 32 triangles per warp: Record emits (= tile intersections).
248
-
249
- if (__any_sync(~0u, triIdx != -1))
250
- {
251
- S32 v0x = sub_s16lo_s16lo(triData.x, originX);
252
- S32 v0y = sub_s16hi_s16lo(triData.x, originY);
253
- S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
254
- S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
255
- S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
256
- S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
257
-
258
- // Compute tile-based AABB.
259
-
260
- int lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
261
- int loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
262
- int hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
263
- int hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
264
- int sizex = add_sub(hix, 1, lox);
265
- int sizey = add_sub(hiy, 1, loy);
266
- int area = sizex * sizey;
267
-
268
- // Miscellaneous init.
269
-
270
- U8* currPtr = (U8*)&s_warpEmitMask[threadIdx.y][lox + (loy << CR_BIN_LOG2)];
271
- int ptrYInc = CR_BIN_SIZE * 4 - (sizex << 2);
272
- U32 maskBit = 1 << threadIdx.x;
273
-
274
- // Case A: All AABBs are small => record the full AABB using atomics.
275
-
276
- if (__all_sync(~0u, sizex <= 2 && sizey <= 2))
277
- {
278
- if (triIdx != -1)
279
- {
280
- atomicOr((U32*)currPtr, maskBit);
281
- if (sizex == 2) atomicOr((U32*)(currPtr + 4), maskBit);
282
- if (sizey == 2) atomicOr((U32*)(currPtr + CR_BIN_SIZE * 4), maskBit);
283
- if (sizex == 2 && sizey == 2) atomicOr((U32*)(currPtr + 4 + CR_BIN_SIZE * 4), maskBit);
284
- }
285
- }
286
- else
287
- {
288
- // Compute warp-AABB (scan-32).
289
-
290
- U32 aabbMask = add_sub(2 << hix, 0x20000 << hiy, 1 << lox) - (0x10000 << loy);
291
- if (triIdx == -1)
292
- aabbMask = 0;
293
-
294
- volatile U32* v = &s_scanTemp[threadIdx.y][threadIdx.x + 16];
295
- v[0] = aabbMask; __syncwarp(); aabbMask |= v[-1]; __syncwarp();
296
- v[0] = aabbMask; __syncwarp(); aabbMask |= v[-2]; __syncwarp();
297
- v[0] = aabbMask; __syncwarp(); aabbMask |= v[-4]; __syncwarp();
298
- v[0] = aabbMask; __syncwarp(); aabbMask |= v[-8]; __syncwarp();
299
- v[0] = aabbMask; __syncwarp(); aabbMask |= v[-16]; __syncwarp();
300
- v[0] = aabbMask; __syncwarp(); aabbMask = s_scanTemp[threadIdx.y][47];
301
-
302
- U32 maskX = aabbMask & 0xFFFF;
303
- U32 maskY = aabbMask >> 16;
304
- int wlox = findLeadingOne(maskX ^ (maskX - 1));
305
- int wloy = findLeadingOne(maskY ^ (maskY - 1));
306
- int whix = findLeadingOne(maskX);
307
- int whiy = findLeadingOne(maskY);
308
- int warea = (add_sub(whix, 1, wlox)) * (add_sub(whiy, 1, wloy));
309
-
310
- // Initialize edge functions.
311
-
312
- S32 d12x = d02x - d01x;
313
- S32 d12y = d02y - d01y;
314
- v0x -= lox << tileLog;
315
- v0y -= loy << tileLog;
316
-
317
- S32 t01 = v0x * d01y - v0y * d01x;
318
- S32 t02 = v0y * d02x - v0x * d02y;
319
- S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
320
- S32 b01 = add_sub(t01 >> tileLog, ::max(d01x, 0), ::min(d01y, 0));
321
- S32 b02 = add_sub(t02 >> tileLog, ::max(d02y, 0), ::min(d02x, 0));
322
- S32 b12 = add_sub(t12 >> tileLog, ::max(d12x, 0), ::min(d12y, 0));
323
-
324
- d01x += sizex * d01y;
325
- d02x += sizex * d02y;
326
- d12x += sizex * d12y;
327
-
328
- // Case B: Warp-AABB is not much larger than largest AABB => Check tiles in warp-AABB, record using ballots.
329
- if (__any_sync(~0u, warea * 4 <= area * 8))
330
- {
331
- // Not sure if this is any faster than Case C after all the post-Volta ballot mask tracking.
332
- bool act = (triIdx != -1);
333
- U32 actMask = __ballot_sync(~0u, act);
334
- if (act)
335
- {
336
- for (int y = wloy; y <= whiy; y++)
337
- {
338
- bool yIn = (y >= loy && y <= hiy);
339
- U32 yMask = __ballot_sync(actMask, yIn);
340
- if (yIn)
341
- {
342
- for (int x = wlox; x <= whix; x++)
343
- {
344
- bool xyIn = (x >= lox && x <= hix);
345
- U32 xyMask = __ballot_sync(yMask, xyIn);
346
- if (xyIn)
347
- {
348
- U32 res = __ballot_sync(xyMask, b01 >= 0 && b02 >= 0 && b12 >= 0);
349
- if (threadIdx.x == 31 - __clz(xyMask))
350
- *(U32*)currPtr = res;
351
- currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
352
- }
353
- }
354
- currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x;
355
- }
356
- }
357
- }
358
- }
359
-
360
- // Case C: General case => Check tiles in AABB, record using atomics.
361
-
362
- else
363
- {
364
- if (triIdx != -1)
365
- {
366
- U8* skipPtr = currPtr + (sizex << 2);
367
- U8* endPtr = currPtr + (sizey << (CR_BIN_LOG2 + 2));
368
- do
369
- {
370
- if (b01 >= 0 && b02 >= 0 && b12 >= 0)
371
- atomicOr((U32*)currPtr, maskBit);
372
- currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
373
- if (currPtr == skipPtr)
374
- currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += CR_BIN_SIZE * 4;
375
- }
376
- while (currPtr != endPtr);
377
- }
378
- }
379
- }
380
- }
381
-
382
- __syncthreads();
383
-
384
- //------------------------------------------------------------------------
385
- // Count.
386
- //------------------------------------------------------------------------
387
-
388
- // Tile per thread: Initialize prefix sums.
389
-
390
- for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
391
- {
392
- int tileInBin = tileInBin_base + thrInBlock;
393
- bool act = (tileInBin < CR_BIN_SQR);
394
- U32 actMask = __ballot_sync(~0u, act);
395
- if (act)
396
- {
397
- // Compute prefix sum of emits over warps.
398
-
399
- U8* srcPtr = (U8*)&s_warpEmitMask[0][tileInBin];
400
- U8* dstPtr = (U8*)&s_warpEmitPrefixSum[0][tileInBin];
401
- int tileEmits = 0;
402
- for (int i = 0; i < CR_COARSE_WARPS; i++)
403
- {
404
- tileEmits += __popc(*(U32*)srcPtr);
405
- *(U32*)dstPtr = tileEmits;
406
- srcPtr += (CR_BIN_SQR + 1) * 4;
407
- dstPtr += (CR_BIN_SQR + 1) * 4;
408
- }
409
-
410
- // Determine the number of segments to allocate.
411
-
412
- int spaceLeft = -s_tileStreamCurrOfs[tileInBin] & (CR_TILE_SEG_SIZE - 1);
413
- int tileAllocs = (tileEmits - spaceLeft + CR_TILE_SEG_SIZE - 1) >> CR_TILE_SEG_LOG2;
414
- volatile U32* v = &s_tileEmitPrefixSum[tileInBin + 1];
415
-
416
- // All counters within the warp are small => compute prefix sum using ballot.
417
-
418
- if (!__any_sync(actMask, tileEmits >= 2))
419
- {
420
- U32 m = getLaneMaskLe();
421
- *v = (__popc(__ballot_sync(actMask, tileEmits & 1) & m) << emitShift) | __popc(__ballot_sync(actMask, tileAllocs & 1) & m);
422
- }
423
-
424
- // Otherwise => scan-32 within the warp.
425
-
426
- else
427
- {
428
- U32 sum = (tileEmits << emitShift) | tileAllocs;
429
- *v = sum; __syncwarp(actMask); if (threadIdx.x >= 1) sum += v[-1]; __syncwarp(actMask);
430
- *v = sum; __syncwarp(actMask); if (threadIdx.x >= 2) sum += v[-2]; __syncwarp(actMask);
431
- *v = sum; __syncwarp(actMask); if (threadIdx.x >= 4) sum += v[-4]; __syncwarp(actMask);
432
- *v = sum; __syncwarp(actMask); if (threadIdx.x >= 8) sum += v[-8]; __syncwarp(actMask);
433
- *v = sum; __syncwarp(actMask); if (threadIdx.x >= 16) sum += v[-16]; __syncwarp(actMask);
434
- *v = sum; __syncwarp(actMask);
435
- }
436
- }
437
- }
438
-
439
- // First warp: Scan-8.
440
-
441
- __syncthreads();
442
-
443
- bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
444
- U32 scan8Mask = __ballot_sync(~0u, scan8);
445
- if (scan8)
446
- {
447
- int sum = s_tileEmitPrefixSum[(thrInBlock << 5) + 32];
448
- volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
449
- v[0] = sum; __syncwarp(scan8Mask);
450
- #if (CR_BIN_SQR > 1 * 32)
451
- sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
452
- #endif
453
- #if (CR_BIN_SQR > 2 * 32)
454
- sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
455
- #endif
456
- #if (CR_BIN_SQR > 4 * 32)
457
- sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
458
- #endif
459
- }
460
-
461
- __syncthreads();
462
-
463
- // Tile per thread: Finalize prefix sums.
464
- // Single thread: Allocate segments.
465
-
466
- for (int tileInBin = thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
467
- {
468
- int sum = s_tileEmitPrefixSum[tileInBin + 1] + s_scanTemp[0][(tileInBin >> 5) + 15];
469
- int numEmits = sum >> emitShift;
470
- int numAllocs = sum & ((1 << emitShift) - 1);
471
- s_tileEmitPrefixSum[tileInBin + 1] = numEmits;
472
- s_tileAllocPrefixSum[tileInBin + 1] = numAllocs;
473
-
474
- if (tileInBin == CR_BIN_SQR - 1 && numAllocs != 0)
475
- {
476
- int t = atomicAdd(&atomics.numTileSegs, numAllocs);
477
- s_firstAllocSeg = (t + numAllocs <= p.maxTileSegs) ? t : 0;
478
- }
479
- }
480
-
481
- __syncthreads();
482
- int firstAllocSeg = s_firstAllocSeg;
483
- int totalEmits = s_tileEmitPrefixSum[CR_BIN_SQR];
484
- int totalAllocs = s_tileAllocPrefixSum[CR_BIN_SQR];
485
-
486
- //------------------------------------------------------------------------
487
- // Emit.
488
- //------------------------------------------------------------------------
489
-
490
- // Emit per thread: Write triangle index to globalmem.
491
-
492
- for (int emitInBin = thrInBlock; emitInBin < totalEmits; emitInBin += CR_COARSE_WARPS * 32)
493
- {
494
- // Find tile in bin.
495
-
496
- U8* tileBase = (U8*)&s_tileEmitPrefixSum[0];
497
- U8* tilePtr = tileBase;
498
- U8* ptr;
499
-
500
- #if (CR_BIN_SQR > 128)
501
- ptr = tilePtr + 0x80 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
502
- #endif
503
- #if (CR_BIN_SQR > 64)
504
- ptr = tilePtr + 0x40 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
505
- #endif
506
- #if (CR_BIN_SQR > 32)
507
- ptr = tilePtr + 0x20 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
508
- #endif
509
- #if (CR_BIN_SQR > 16)
510
- ptr = tilePtr + 0x10 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
511
- #endif
512
- #if (CR_BIN_SQR > 8)
513
- ptr = tilePtr + 0x08 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
514
- #endif
515
- #if (CR_BIN_SQR > 4)
516
- ptr = tilePtr + 0x04 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
517
- #endif
518
- #if (CR_BIN_SQR > 2)
519
- ptr = tilePtr + 0x02 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
520
- #endif
521
- #if (CR_BIN_SQR > 1)
522
- ptr = tilePtr + 0x01 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
523
- #endif
524
-
525
- int tileInBin = (tilePtr - tileBase) >> 2;
526
- int emitInTile = emitInBin - *(U32*)tilePtr;
527
-
528
- // Find warp in tile.
529
-
530
- int warpStep = (CR_BIN_SQR + 1) * 4;
531
- U8* warpBase = (U8*)&s_warpEmitPrefixSum[0][tileInBin] - warpStep;
532
- U8* warpPtr = warpBase;
533
-
534
- #if (CR_COARSE_WARPS > 8)
535
- ptr = warpPtr + 0x08 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
536
- #endif
537
- #if (CR_COARSE_WARPS > 4)
538
- ptr = warpPtr + 0x04 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
539
- #endif
540
- #if (CR_COARSE_WARPS > 2)
541
- ptr = warpPtr + 0x02 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
542
- #endif
543
- #if (CR_COARSE_WARPS > 1)
544
- ptr = warpPtr + 0x01 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
545
- #endif
546
-
547
- int warpInTile = (warpPtr - warpBase) >> (CR_BIN_LOG2 * 2 + 2);
548
- U32 emitMask = *(U32*)(warpPtr + warpStep + ((U8*)s_warpEmitMask - (U8*)s_warpEmitPrefixSum));
549
- int emitInWarp = emitInTile - *(U32*)(warpPtr + warpStep) + __popc(emitMask);
550
-
551
- // Find thread in warp.
552
-
553
- int threadInWarp = 0;
554
- int pop = __popc(emitMask & 0xFFFF);
555
- bool pred = (emitInWarp >= pop);
556
- if (pred) emitInWarp -= pop;
557
- if (pred) emitMask >>= 0x10;
558
- if (pred) threadInWarp += 0x10;
559
-
560
- pop = __popc(emitMask & 0xFF);
561
- pred = (emitInWarp >= pop);
562
- if (pred) emitInWarp -= pop;
563
- if (pred) emitMask >>= 0x08;
564
- if (pred) threadInWarp += 0x08;
565
-
566
- pop = __popc(emitMask & 0xF);
567
- pred = (emitInWarp >= pop);
568
- if (pred) emitInWarp -= pop;
569
- if (pred) emitMask >>= 0x04;
570
- if (pred) threadInWarp += 0x04;
571
-
572
- pop = __popc(emitMask & 0x3);
573
- pred = (emitInWarp >= pop);
574
- if (pred) emitInWarp -= pop;
575
- if (pred) emitMask >>= 0x02;
576
- if (pred) threadInWarp += 0x02;
577
-
578
- if (emitInWarp >= (emitMask & 1))
579
- threadInWarp++;
580
-
581
- // Figure out where to write.
582
-
583
- int currOfs = s_tileStreamCurrOfs[tileInBin];
584
- int spaceLeft = -currOfs & (CR_TILE_SEG_SIZE - 1);
585
- int outOfs = emitInTile;
586
-
587
- if (outOfs < spaceLeft)
588
- outOfs += currOfs;
589
- else
590
- {
591
- int allocLo = firstAllocSeg + s_tileAllocPrefixSum[tileInBin];
592
- outOfs += (allocLo << CR_TILE_SEG_LOG2) - spaceLeft;
593
- }
594
-
595
- // Write.
596
-
597
- int queueIdx = warpInTile * 32 + threadInWarp;
598
- int triIdx = s_triQueue[(triQueueReadPos + queueIdx) & (CR_COARSE_QUEUE_SIZE - 1)];
599
-
600
- tileSegData[outOfs] = triIdx;
601
- }
602
-
603
- //------------------------------------------------------------------------
604
- // Patch.
605
- //------------------------------------------------------------------------
606
-
607
- // Allocated segment per thread: Initialize next-pointer and count.
608
-
609
- for (int i = CR_COARSE_WARPS * 32 - 1 - thrInBlock; i < totalAllocs; i += CR_COARSE_WARPS * 32)
610
- {
611
- int segIdx = firstAllocSeg + i;
612
- tileSegNext[segIdx] = segIdx + 1;
613
- tileSegCount[segIdx] = CR_TILE_SEG_SIZE;
614
- }
615
-
616
- // Tile per thread: Fix previous segment's next-pointer and update s_tileStreamCurrOfs.
617
-
618
- __syncthreads();
619
- for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
620
- {
621
- int oldOfs = s_tileStreamCurrOfs[tileInBin];
622
- int newOfs = oldOfs + s_warpEmitPrefixSum[CR_COARSE_WARPS - 1][tileInBin];
623
- int allocLo = s_tileAllocPrefixSum[tileInBin];
624
- int allocHi = s_tileAllocPrefixSum[tileInBin + 1];
625
-
626
- if (allocLo != allocHi)
627
- {
628
- S32* nextPtr = &tileSegNext[(oldOfs - 1) >> CR_TILE_SEG_LOG2];
629
- if (oldOfs < 0)
630
- nextPtr = &tileFirstSeg[binTileIdx + globalTileIdx(tileInBin, p.widthTiles)];
631
- *nextPtr = firstAllocSeg + allocLo;
632
-
633
- newOfs--;
634
- newOfs &= CR_TILE_SEG_SIZE - 1;
635
- newOfs |= (firstAllocSeg + allocHi - 1) << CR_TILE_SEG_LOG2;
636
- newOfs++;
637
- }
638
- s_tileStreamCurrOfs[tileInBin] = newOfs;
639
- }
640
-
641
- // Advance queue read pointer.
642
- // Queue became empty => bin done.
643
-
644
- triQueueReadPos += CR_COARSE_WARPS * 32;
645
- }
646
- while (triQueueReadPos < triQueueWritePos);
647
-
648
- // Tile per thread: Fix next-pointer and count of the last segment.
649
- // 32 tiles per warp: Count active tiles.
650
-
651
- __syncthreads();
652
-
653
- for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
654
- {
655
- int tileInBin = tileInBin_base + thrInBlock;
656
- bool act = (tileInBin < CR_BIN_SQR);
657
- U32 actMask = __ballot_sync(~0u, act);
658
- if (act)
659
- {
660
- int tileX = tileInBin & (CR_BIN_SIZE - 1);
661
- int tileY = tileInBin >> CR_BIN_LOG2;
662
- bool force = (p.deferredClear & tileX <= maxTileXInBin & tileY <= maxTileYInBin);
663
-
664
- int ofs = s_tileStreamCurrOfs[tileInBin];
665
- int segIdx = (ofs - 1) >> CR_TILE_SEG_LOG2;
666
- int segCount = ofs & (CR_TILE_SEG_SIZE - 1);
667
-
668
- if (ofs >= 0)
669
- tileSegNext[segIdx] = -1;
670
- else if (force)
671
- {
672
- s_tileStreamCurrOfs[tileInBin] = 0;
673
- tileFirstSeg[binTileIdx + tileX + tileY * p.widthTiles] = -1;
674
- }
675
-
676
- if (segCount != 0)
677
- tileSegCount[segIdx] = segCount;
678
-
679
- U32 res = __ballot_sync(actMask, ofs >= 0 | force);
680
- if (threadIdx.x == 0)
681
- s_scanTemp[0][(tileInBin >> 5) + 16] = __popc(res);
682
- }
683
- }
684
-
685
- // First warp: Scan-8.
686
- // One thread: Allocate space for active tiles.
687
-
688
- __syncthreads();
689
-
690
- bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
691
- U32 scan8Mask = __ballot_sync(~0u, scan8);
692
- if (scan8)
693
- {
694
- volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
695
- U32 sum = v[0];
696
- #if (CR_BIN_SQR > 1 * 32)
697
- sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
698
- #endif
699
- #if (CR_BIN_SQR > 2 * 32)
700
- sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
701
- #endif
702
- #if (CR_BIN_SQR > 4 * 32)
703
- sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
704
- #endif
705
-
706
- if (thrInBlock == CR_BIN_SQR / 32 - 1)
707
- s_firstActiveIdx = atomicAdd(&atomics.numActiveTiles, sum);
708
- }
709
-
710
- // Tile per thread: Output active tiles.
711
-
712
- __syncthreads();
713
-
714
- for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
715
- {
716
- int tileInBin = tileInBin_base + thrInBlock;
717
- bool act = (tileInBin < CR_BIN_SQR) && (s_tileStreamCurrOfs[tileInBin] >= 0);
718
- U32 actMask = __ballot_sync(~0u, act);
719
- if (act)
720
- {
721
- int activeIdx = s_firstActiveIdx;
722
- activeIdx += s_scanTemp[0][(tileInBin >> 5) + 15];
723
- activeIdx += __popc(actMask & getLaneMaskLt());
724
- activeTiles[activeIdx] = binTileIdx + globalTileIdx(tileInBin, p.widthTiles);
725
- }
726
- }
727
- }
728
- }
729
-
730
- //------------------------------------------------------------------------