Delete ginipick:SORA-3D
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- ginipick:SORA-3D/3d.mp4 +0 -3
- ginipick:SORA-3D/README.md +0 -11
- ginipick:SORA-3D/app.py +0 -2
- ginipick:SORA-3D/assets/.DS_Store +0 -0
- ginipick:SORA-3D/assets/example_image/.DS_Store +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T120910.945.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133209.680.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133232.481.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133327.828.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133551.674.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133554.085.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133942.986.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133945.143.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134251.217.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134253.975.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134602.793.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134606.919.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T050638.566.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T102148.803.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T124050.873.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125348.492.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125709.810.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125745.419.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T131128.626.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T174905.915.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184202.582.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184251.254.webp +0 -3
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184336.200.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184407.431.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184511.907.webp +0 -3
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184535.205.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184804.224.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T033838.708.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T034054.527.webp +0 -0
- ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T034505.337.webp +0 -0
- ginipick:SORA-3D/extensions/.DS_Store +0 -0
- ginipick:SORA-3D/extensions/extensions_nvdiffrast_LICENSE.txt +0 -97
- ginipick:SORA-3D/extensions/extensions_nvdiffrast_README.md +0 -42
- ginipick:SORA-3D/extensions/extensions_nvdiffrast_run_sample.sh +0 -52
- ginipick:SORA-3D/extensions/extensions_nvdiffrast_setup copy.py +0 -51
- ginipick:SORA-3D/extensions/extensions_nvdiffrast_setup.py +0 -82
- ginipick:SORA-3D/extensions/nvdiffrast/.DS_Store +0 -0
- ginipick:SORA-3D/extensions/nvdiffrast/common/.DS_Store +0 -0
- ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/.DS_Store +0 -0
- ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/extensions_nvdiffrast_nvdiffrast_common_cudaraster_CudaRaster.hpp +0 -63
- ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/.DS_Store +0 -0
- ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_BinRaster.inl +0 -423
- ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.cpp +0 -94
- ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.hpp +0 -55
- ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CoarseRaster.inl +0 -730
ginipick:SORA-3D/3d.mp4
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:6c3282465210bac76f44b605956139679ed774c8bad9be686707d1b770961371
|
3 |
-
size 21309978
|
|
|
|
|
|
|
|
ginipick:SORA-3D/README.md
DELETED
@@ -1,11 +0,0 @@
|
|
1 |
-
---
|
2 |
-
title: SORA 3D
|
3 |
-
emoji: 🏢🏆
|
4 |
-
colorFrom: indigo
|
5 |
-
colorTo: blue
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 4.44.1
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
short_description: Create top-quality 3D(.GLB) models from text or images
|
11 |
-
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ginipick:SORA-3D/app.py
DELETED
@@ -1,2 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
exec(os.environ.get('APP'))
|
|
|
|
|
|
ginipick:SORA-3D/assets/.DS_Store
DELETED
Binary file (6.15 kB)
|
|
ginipick:SORA-3D/assets/example_image/.DS_Store
DELETED
Binary file (12.3 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T120910.945.webp
DELETED
Binary file (93 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133209.680.webp
DELETED
Binary file (48.5 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133232.481.webp
DELETED
Binary file (63.9 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133327.828.webp
DELETED
Binary file (20.7 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133551.674.webp
DELETED
Binary file (42.9 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133554.085.webp
DELETED
Binary file (32.6 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133942.986.webp
DELETED
Binary file (41.9 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133945.143.webp
DELETED
Binary file (42.4 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134251.217.webp
DELETED
Binary file (14 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134253.975.webp
DELETED
Binary file (51 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134602.793.webp
DELETED
Binary file (33.3 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134606.919.webp
DELETED
Binary file (56.1 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T050638.566.webp
DELETED
Binary file (58.6 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T102148.803.webp
DELETED
Binary file (11 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T124050.873.webp
DELETED
Binary file (58.5 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125348.492.webp
DELETED
Binary file (47.6 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125709.810.webp
DELETED
Binary file (24.7 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125745.419.webp
DELETED
Binary file (38.7 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T131128.626.webp
DELETED
Binary file (49.6 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T174905.915.webp
DELETED
Binary file (46.2 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184202.582.webp
DELETED
Binary file (63.1 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184251.254.webp
DELETED
Git LFS Details
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184336.200.webp
DELETED
Binary file (33.4 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184407.431.webp
DELETED
Binary file (95 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184511.907.webp
DELETED
Git LFS Details
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184535.205.webp
DELETED
Binary file (57.5 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184804.224.webp
DELETED
Binary file (88.8 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T033838.708.webp
DELETED
Binary file (19 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T034054.527.webp
DELETED
Binary file (12.6 kB)
|
|
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T034505.337.webp
DELETED
Binary file (8.18 kB)
|
|
ginipick:SORA-3D/extensions/.DS_Store
DELETED
Binary file (6.15 kB)
|
|
ginipick:SORA-3D/extensions/extensions_nvdiffrast_LICENSE.txt
DELETED
@@ -1,97 +0,0 @@
|
|
1 |
-
Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
|
2 |
-
|
3 |
-
|
4 |
-
Nvidia Source Code License (1-Way Commercial)
|
5 |
-
|
6 |
-
=======================================================================
|
7 |
-
|
8 |
-
1. Definitions
|
9 |
-
|
10 |
-
"Licensor" means any person or entity that distributes its Work.
|
11 |
-
|
12 |
-
"Software" means the original work of authorship made available under
|
13 |
-
this License.
|
14 |
-
|
15 |
-
"Work" means the Software and any additions to or derivative works of
|
16 |
-
the Software that are made available under this License.
|
17 |
-
|
18 |
-
The terms "reproduce," "reproduction," "derivative works," and
|
19 |
-
"distribution" have the meaning as provided under U.S. copyright law;
|
20 |
-
provided, however, that for the purposes of this License, derivative
|
21 |
-
works shall not include works that remain separable from, or merely
|
22 |
-
link (or bind by name) to the interfaces of, the Work.
|
23 |
-
|
24 |
-
Works, including the Software, are "made available" under this License
|
25 |
-
by including in or with the Work either (a) a copyright notice
|
26 |
-
referencing the applicability of this License to the Work, or (b) a
|
27 |
-
copy of this License.
|
28 |
-
|
29 |
-
2. License Grants
|
30 |
-
|
31 |
-
2.1 Copyright Grant. Subject to the terms and conditions of this
|
32 |
-
License, each Licensor grants to you a perpetual, worldwide,
|
33 |
-
non-exclusive, royalty-free, copyright license to reproduce,
|
34 |
-
prepare derivative works of, publicly display, publicly perform,
|
35 |
-
sublicense and distribute its Work and any resulting derivative
|
36 |
-
works in any form.
|
37 |
-
|
38 |
-
3. Limitations
|
39 |
-
|
40 |
-
3.1 Redistribution. You may reproduce or distribute the Work only
|
41 |
-
if (a) you do so under this License, (b) you include a complete
|
42 |
-
copy of this License with your distribution, and (c) you retain
|
43 |
-
without modification any copyright, patent, trademark, or
|
44 |
-
attribution notices that are present in the Work.
|
45 |
-
|
46 |
-
3.2 Derivative Works. You may specify that additional or different
|
47 |
-
terms apply to the use, reproduction, and distribution of your
|
48 |
-
derivative works of the Work ("Your Terms") only if (a) Your Terms
|
49 |
-
provide that the use limitation in Section 3.3 applies to your
|
50 |
-
derivative works, and (b) you identify the specific derivative
|
51 |
-
works that are subject to Your Terms. Notwithstanding Your Terms,
|
52 |
-
this License (including the redistribution requirements in Section
|
53 |
-
3.1) will continue to apply to the Work itself.
|
54 |
-
|
55 |
-
3.3 Use Limitation. The Work and any derivative works thereof only
|
56 |
-
may be used or intended for use non-commercially. The Work or
|
57 |
-
derivative works thereof may be used or intended for use by Nvidia
|
58 |
-
or its affiliates commercially or non-commercially. As used herein,
|
59 |
-
"non-commercially" means for research or evaluation purposes only
|
60 |
-
and not for any direct or indirect monetary gain.
|
61 |
-
|
62 |
-
3.4 Patent Claims. If you bring or threaten to bring a patent claim
|
63 |
-
against any Licensor (including any claim, cross-claim or
|
64 |
-
counterclaim in a lawsuit) to enforce any patents that you allege
|
65 |
-
are infringed by any Work, then your rights under this License from
|
66 |
-
such Licensor (including the grant in Section 2.1) will terminate
|
67 |
-
immediately.
|
68 |
-
|
69 |
-
3.5 Trademarks. This License does not grant any rights to use any
|
70 |
-
Licensor's or its affiliates' names, logos, or trademarks, except
|
71 |
-
as necessary to reproduce the notices described in this License.
|
72 |
-
|
73 |
-
3.6 Termination. If you violate any term of this License, then your
|
74 |
-
rights under this License (including the grant in Section 2.1) will
|
75 |
-
terminate immediately.
|
76 |
-
|
77 |
-
4. Disclaimer of Warranty.
|
78 |
-
|
79 |
-
THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
80 |
-
KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
|
81 |
-
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
|
82 |
-
NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
|
83 |
-
THIS LICENSE.
|
84 |
-
|
85 |
-
5. Limitation of Liability.
|
86 |
-
|
87 |
-
EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
|
88 |
-
THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
|
89 |
-
SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
|
90 |
-
INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
|
91 |
-
OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
|
92 |
-
(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
|
93 |
-
LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
|
94 |
-
COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
|
95 |
-
THE POSSIBILITY OF SUCH DAMAGES.
|
96 |
-
|
97 |
-
=======================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ginipick:SORA-3D/extensions/extensions_nvdiffrast_README.md
DELETED
@@ -1,42 +0,0 @@
|
|
1 |
-
## Nvdiffrast – Modular Primitives for High-Performance Differentiable Rendering
|
2 |
-
|
3 |
-

|
4 |
-
|
5 |
-
**Modular Primitives for High-Performance Differentiable Rendering**<br>
|
6 |
-
Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila<br>
|
7 |
-
[http://arxiv.org/abs/2011.03277](http://arxiv.org/abs/2011.03277)
|
8 |
-
|
9 |
-
Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering.
|
10 |
-
Please refer to ☞☞ [nvdiffrast documentation](https://nvlabs.github.io/nvdiffrast) ☜☜ for more information.
|
11 |
-
|
12 |
-
## Licenses
|
13 |
-
|
14 |
-
Copyright © 2020–2024, NVIDIA Corporation. All rights reserved.
|
15 |
-
|
16 |
-
This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt).
|
17 |
-
|
18 |
-
For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/)
|
19 |
-
|
20 |
-
We do not currently accept outside code contributions in the form of pull requests.
|
21 |
-
|
22 |
-
Environment map stored as part of `samples/data/envphong.npz` is derived from a Wave Engine
|
23 |
-
[sample material](https://github.com/WaveEngine/Samples-2.5/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap)
|
24 |
-
originally shared under
|
25 |
-
[MIT License](https://github.com/WaveEngine/Samples-2.5/blob/master/LICENSE.md).
|
26 |
-
Mesh and texture stored as part of `samples/data/earth.npz` are derived from
|
27 |
-
[3D Earth Photorealistic 2K](https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125)
|
28 |
-
model originally made available under
|
29 |
-
[TurboSquid 3D Model License](https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license).
|
30 |
-
|
31 |
-
## Citation
|
32 |
-
|
33 |
-
```
|
34 |
-
@article{Laine2020diffrast,
|
35 |
-
title = {Modular Primitives for High-Performance Differentiable Rendering},
|
36 |
-
author = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila},
|
37 |
-
journal = {ACM Transactions on Graphics},
|
38 |
-
year = {2020},
|
39 |
-
volume = {39},
|
40 |
-
number = {6}
|
41 |
-
}
|
42 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ginipick:SORA-3D/extensions/extensions_nvdiffrast_run_sample.sh
DELETED
@@ -1,52 +0,0 @@
|
|
1 |
-
#!/bin/bash
|
2 |
-
|
3 |
-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
4 |
-
#
|
5 |
-
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
6 |
-
# and proprietary rights in and to this software, related documentation
|
7 |
-
# and any modifications thereto. Any use, reproduction, disclosure or
|
8 |
-
# distribution of this software and related documentation without an express
|
9 |
-
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
10 |
-
|
11 |
-
function print_help {
|
12 |
-
echo "Usage: `basename $0` [--build-container] <python_file>"
|
13 |
-
echo ""
|
14 |
-
echo "Option --build-container will build the Docker container based on"
|
15 |
-
echo "docker/Dockerfile and tag the image with gltorch:latest."
|
16 |
-
echo ""
|
17 |
-
echo "Example: `basename $0` samples/torch/envphong.py"
|
18 |
-
}
|
19 |
-
|
20 |
-
build_container=0
|
21 |
-
sample=""
|
22 |
-
while [[ "$#" -gt 0 ]]; do
|
23 |
-
case $1 in
|
24 |
-
--build-container) build_container=1;;
|
25 |
-
-h|--help) print_help; exit 0 ;;
|
26 |
-
--*) echo "Unknown parameter passed: $1"; exit 1 ;;
|
27 |
-
*) sample="$1"; shift; break;
|
28 |
-
esac
|
29 |
-
shift
|
30 |
-
done
|
31 |
-
|
32 |
-
rest=$@
|
33 |
-
|
34 |
-
# Build the docker container
|
35 |
-
if [ "$build_container" = "1" ]; then
|
36 |
-
docker build --tag gltorch:latest -f docker/Dockerfile .
|
37 |
-
fi
|
38 |
-
|
39 |
-
if [ ! -f "$sample" ]; then
|
40 |
-
echo
|
41 |
-
echo "No python sample given or file '$sample' not found. Exiting."
|
42 |
-
exit 1
|
43 |
-
fi
|
44 |
-
|
45 |
-
image="gltorch:latest"
|
46 |
-
|
47 |
-
echo "Using container image: $image"
|
48 |
-
echo "Running command: $sample $rest"
|
49 |
-
|
50 |
-
# Run a sample with docker
|
51 |
-
docker run --rm -it --gpus all --user $(id -u):$(id -g) \
|
52 |
-
-v `pwd`:/app --workdir /app -e TORCH_EXTENSIONS_DIR=/app/tmp $image python3 $sample $rest
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ginipick:SORA-3D/extensions/extensions_nvdiffrast_setup copy.py
DELETED
@@ -1,51 +0,0 @@
|
|
1 |
-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
2 |
-
#
|
3 |
-
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
4 |
-
# and proprietary rights in and to this software, related documentation
|
5 |
-
# and any modifications thereto. Any use, reproduction, disclosure or
|
6 |
-
# distribution of this software and related documentation without an express
|
7 |
-
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
8 |
-
|
9 |
-
import nvdiffrast
|
10 |
-
import setuptools
|
11 |
-
import os
|
12 |
-
|
13 |
-
with open("README.md", "r") as fh:
|
14 |
-
long_description = fh.read()
|
15 |
-
|
16 |
-
setuptools.setup(
|
17 |
-
name="nvdiffrast",
|
18 |
-
version=nvdiffrast.__version__,
|
19 |
-
author="Samuli Laine",
|
20 |
-
author_email="slaine@nvidia.com",
|
21 |
-
description="nvdiffrast - modular primitives for high-performance differentiable rendering",
|
22 |
-
long_description=long_description,
|
23 |
-
long_description_content_type="text/markdown",
|
24 |
-
url="https://github.com/NVlabs/nvdiffrast",
|
25 |
-
packages=setuptools.find_packages(),
|
26 |
-
package_data={
|
27 |
-
'nvdiffrast': [
|
28 |
-
'common/*.h',
|
29 |
-
'common/*.inl',
|
30 |
-
'common/*.cu',
|
31 |
-
'common/*.cpp',
|
32 |
-
'common/cudaraster/*.hpp',
|
33 |
-
'common/cudaraster/impl/*.cpp',
|
34 |
-
'common/cudaraster/impl/*.hpp',
|
35 |
-
'common/cudaraster/impl/*.inl',
|
36 |
-
'common/cudaraster/impl/*.cu',
|
37 |
-
'lib/*.h',
|
38 |
-
'torch/*.h',
|
39 |
-
'torch/*.inl',
|
40 |
-
'torch/*.cpp',
|
41 |
-
'tensorflow/*.cu',
|
42 |
-
] + (['lib/*.lib'] if os.name == 'nt' else [])
|
43 |
-
},
|
44 |
-
include_package_data=True,
|
45 |
-
install_requires=['numpy'], # note: can't require torch here as it will install torch even for a TensorFlow container
|
46 |
-
classifiers=[
|
47 |
-
"Programming Language :: Python :: 3",
|
48 |
-
"Operating System :: OS Independent",
|
49 |
-
],
|
50 |
-
python_requires='>=3.6',
|
51 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ginipick:SORA-3D/extensions/extensions_nvdiffrast_setup.py
DELETED
@@ -1,82 +0,0 @@
|
|
1 |
-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
2 |
-
#
|
3 |
-
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
4 |
-
# and proprietary rights in and to this software, related documentation
|
5 |
-
# and any modifications thereto. Any use, reproduction, disclosure or
|
6 |
-
# distribution of this software and related documentation without an express
|
7 |
-
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
8 |
-
|
9 |
-
import nvdiffrast
|
10 |
-
import setuptools
|
11 |
-
import os
|
12 |
-
from torch.utils.cpp_extension import CUDAExtension, BuildExtension
|
13 |
-
|
14 |
-
|
15 |
-
with open("README.md", "r") as fh:
|
16 |
-
long_description = fh.read()
|
17 |
-
|
18 |
-
setuptools.setup(
|
19 |
-
name="nvdiffrast",
|
20 |
-
version=nvdiffrast.__version__,
|
21 |
-
author="Samuli Laine",
|
22 |
-
author_email="slaine@nvidia.com",
|
23 |
-
description="nvdiffrast - modular primitives for high-performance differentiable rendering",
|
24 |
-
long_description=long_description,
|
25 |
-
long_description_content_type="text/markdown",
|
26 |
-
url="https://github.com/NVlabs/nvdiffrast",
|
27 |
-
packages=setuptools.find_packages(),
|
28 |
-
# package_data={
|
29 |
-
# 'nvdiffrast': [
|
30 |
-
# 'common/*.h',
|
31 |
-
# 'common/*.inl',
|
32 |
-
# 'common/*.cu',
|
33 |
-
# 'common/*.cpp',
|
34 |
-
# 'common/cudaraster/*.hpp',
|
35 |
-
# 'common/cudaraster/impl/*.cpp',
|
36 |
-
# 'common/cudaraster/impl/*.hpp',
|
37 |
-
# 'common/cudaraster/impl/*.inl',
|
38 |
-
# 'common/cudaraster/impl/*.cu',
|
39 |
-
# 'lib/*.h',
|
40 |
-
# 'torch/*.h',
|
41 |
-
# 'torch/*.inl',
|
42 |
-
# 'torch/*.cpp',
|
43 |
-
# 'tensorflow/*.cu',
|
44 |
-
# ] + (['lib/*.lib'] if os.name == 'nt' else [])
|
45 |
-
# },
|
46 |
-
# include_package_data=True,
|
47 |
-
ext_modules=[
|
48 |
-
CUDAExtension(
|
49 |
-
name="nvdiffrast.torch._C",
|
50 |
-
sources=[
|
51 |
-
'nvdiffrast/common/cudaraster/impl/Buffer.cpp',
|
52 |
-
'nvdiffrast/common/cudaraster/impl/CudaRaster.cpp',
|
53 |
-
'nvdiffrast/common/cudaraster/impl/RasterImpl_.cu',
|
54 |
-
'nvdiffrast/common/cudaraster/impl/RasterImpl.cpp',
|
55 |
-
'nvdiffrast/common/common.cpp',
|
56 |
-
'nvdiffrast/common/rasterize.cu',
|
57 |
-
'nvdiffrast/common/interpolate.cu',
|
58 |
-
'nvdiffrast/common/texture_.cu',
|
59 |
-
'nvdiffrast/common/texture.cpp',
|
60 |
-
'nvdiffrast/common/antialias.cu',
|
61 |
-
'nvdiffrast/torch/torch_bindings.cpp',
|
62 |
-
'nvdiffrast/torch/torch_rasterize.cpp',
|
63 |
-
'nvdiffrast/torch/torch_interpolate.cpp',
|
64 |
-
'nvdiffrast/torch/torch_texture.cpp',
|
65 |
-
'nvdiffrast/torch/torch_antialias.cpp',
|
66 |
-
],
|
67 |
-
extra_compile_args={
|
68 |
-
'cxx': ['-DNVDR_TORCH'],
|
69 |
-
'nvcc': ['-DNVDR_TORCH', '-lineinfo'],
|
70 |
-
},
|
71 |
-
)
|
72 |
-
],
|
73 |
-
cmdclass={
|
74 |
-
'build_ext': BuildExtension
|
75 |
-
},
|
76 |
-
install_requires=['numpy'], # note: can't require torch here as it will install torch even for a TensorFlow container
|
77 |
-
classifiers=[
|
78 |
-
"Programming Language :: Python :: 3",
|
79 |
-
"Operating System :: OS Independent",
|
80 |
-
],
|
81 |
-
python_requires='>=3.6',
|
82 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ginipick:SORA-3D/extensions/nvdiffrast/.DS_Store
DELETED
Binary file (8.2 kB)
|
|
ginipick:SORA-3D/extensions/nvdiffrast/common/.DS_Store
DELETED
Binary file (10.2 kB)
|
|
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/.DS_Store
DELETED
Binary file (6.15 kB)
|
|
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/extensions_nvdiffrast_nvdiffrast_common_cudaraster_CudaRaster.hpp
DELETED
@@ -1,63 +0,0 @@
|
|
1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
-
//
|
3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
4 |
-
// and proprietary rights in and to this software, related documentation
|
5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
6 |
-
// distribution of this software and related documentation without an express
|
7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
8 |
-
|
9 |
-
#pragma once
|
10 |
-
|
11 |
-
//------------------------------------------------------------------------
|
12 |
-
// This is a slimmed-down and modernized version of the original
|
13 |
-
// CudaRaster codebase that accompanied the HPG 2011 paper
|
14 |
-
// "High-Performance Software Rasterization on GPUs" by Laine and Karras.
|
15 |
-
// Modifications have been made to accommodate post-Volta execution model
|
16 |
-
// with warp divergence. Support for shading, blending, quad rendering,
|
17 |
-
// and supersampling have been removed as unnecessary for nvdiffrast.
|
18 |
-
//------------------------------------------------------------------------
|
19 |
-
|
20 |
-
namespace CR
|
21 |
-
{
|
22 |
-
|
23 |
-
class RasterImpl;
|
24 |
-
|
25 |
-
//------------------------------------------------------------------------
|
26 |
-
// Interface class to isolate user from implementation details.
|
27 |
-
//------------------------------------------------------------------------
|
28 |
-
|
29 |
-
class CudaRaster
|
30 |
-
{
|
31 |
-
public:
|
32 |
-
enum
|
33 |
-
{
|
34 |
-
RenderModeFlag_EnableBackfaceCulling = 1 << 0, // Enable backface culling.
|
35 |
-
RenderModeFlag_EnableDepthPeeling = 1 << 1, // Enable depth peeling. Must have a peel buffer set.
|
36 |
-
};
|
37 |
-
|
38 |
-
public:
|
39 |
-
CudaRaster (void);
|
40 |
-
~CudaRaster (void);
|
41 |
-
|
42 |
-
void setBufferSize (int width, int height, int numImages); // Width and height are internally rounded up to multiples of tile size (8x8) for buffer sizes.
|
43 |
-
void setViewport (int width, int height, int offsetX, int offsetY); // Tiled rendering viewport setup.
|
44 |
-
void setRenderModeFlags (unsigned int renderModeFlags); // Affects all subsequent calls to drawTriangles(). Defaults to zero.
|
45 |
-
void deferredClear (unsigned int clearColor); // Clears color and depth buffers during next call to drawTriangles().
|
46 |
-
void setVertexBuffer (void* vertices, int numVertices); // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w).
|
47 |
-
void setIndexBuffer (void* indices, int numTriangles); // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
|
48 |
-
bool drawTriangles (const int* ranges, bool peel, cudaStream_t stream); // Ranges (offsets and counts) as #triangles entries, not as bytes. If NULL, draw all triangles. Returns false in case of internal overflow.
|
49 |
-
void* getColorBuffer (void); // GPU pointer managed by CudaRaster.
|
50 |
-
void* getDepthBuffer (void); // GPU pointer managed by CudaRaster.
|
51 |
-
void swapDepthAndPeel (void); // Swap depth and peeling buffers.
|
52 |
-
|
53 |
-
private:
|
54 |
-
CudaRaster (const CudaRaster&); // forbidden
|
55 |
-
CudaRaster& operator= (const CudaRaster&); // forbidden
|
56 |
-
|
57 |
-
private:
|
58 |
-
RasterImpl* m_impl; // Opaque pointer to implementation.
|
59 |
-
};
|
60 |
-
|
61 |
-
//------------------------------------------------------------------------
|
62 |
-
} // namespace CR
|
63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/.DS_Store
DELETED
Binary file (10.2 kB)
|
|
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_BinRaster.inl
DELETED
@@ -1,423 +0,0 @@
|
|
1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
-
//
|
3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
4 |
-
// and proprietary rights in and to this software, related documentation
|
5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
6 |
-
// distribution of this software and related documentation without an express
|
7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
8 |
-
|
9 |
-
//------------------------------------------------------------------------
|
10 |
-
|
11 |
-
__device__ __inline__ void binRasterImpl(const CRParams p)
|
12 |
-
{
|
13 |
-
__shared__ volatile U32 s_broadcast [CR_BIN_WARPS + 16];
|
14 |
-
__shared__ volatile S32 s_outOfs [CR_MAXBINS_SQR];
|
15 |
-
__shared__ volatile S32 s_outTotal [CR_MAXBINS_SQR];
|
16 |
-
__shared__ volatile S32 s_overIndex [CR_MAXBINS_SQR];
|
17 |
-
__shared__ volatile S32 s_outMask [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
|
18 |
-
__shared__ volatile S32 s_outCount [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions
|
19 |
-
__shared__ volatile S32 s_triBuf [CR_BIN_WARPS*32*4]; // triangle ring buffer
|
20 |
-
__shared__ volatile U32 s_batchPos;
|
21 |
-
__shared__ volatile U32 s_bufCount;
|
22 |
-
__shared__ volatile U32 s_overTotal;
|
23 |
-
__shared__ volatile U32 s_allocBase;
|
24 |
-
|
25 |
-
const CRImageParams& ip = getImageParams(p, blockIdx.z);
|
26 |
-
CRAtomics& atomics = p.atomics[blockIdx.z];
|
27 |
-
const U8* triSubtris = (const U8*)p.triSubtris + p.maxSubtris * blockIdx.z;
|
28 |
-
const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
|
29 |
-
|
30 |
-
S32* binFirstSeg = (S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
|
31 |
-
S32* binTotal = (S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
|
32 |
-
S32* binSegData = (S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
|
33 |
-
S32* binSegNext = (S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
|
34 |
-
S32* binSegCount = (S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
|
35 |
-
|
36 |
-
if (atomics.numSubtris > p.maxSubtris)
|
37 |
-
return;
|
38 |
-
|
39 |
-
// per-thread state
|
40 |
-
int thrInBlock = threadIdx.x + threadIdx.y * 32;
|
41 |
-
int batchPos = 0;
|
42 |
-
|
43 |
-
// first 16 elements of s_broadcast are always zero
|
44 |
-
if (thrInBlock < 16)
|
45 |
-
s_broadcast[thrInBlock] = 0;
|
46 |
-
|
47 |
-
// initialize output linked lists and offsets
|
48 |
-
if (thrInBlock < p.numBins)
|
49 |
-
{
|
50 |
-
binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = -1;
|
51 |
-
s_outOfs[thrInBlock] = -CR_BIN_SEG_SIZE;
|
52 |
-
s_outTotal[thrInBlock] = 0;
|
53 |
-
}
|
54 |
-
|
55 |
-
// repeat until done
|
56 |
-
for(;;)
|
57 |
-
{
|
58 |
-
// get batch
|
59 |
-
if (thrInBlock == 0)
|
60 |
-
s_batchPos = atomicAdd(&atomics.binCounter, ip.binBatchSize);
|
61 |
-
__syncthreads();
|
62 |
-
batchPos = s_batchPos;
|
63 |
-
|
64 |
-
// all batches done?
|
65 |
-
if (batchPos >= ip.triCount)
|
66 |
-
break;
|
67 |
-
|
68 |
-
// per-thread state
|
69 |
-
int bufIndex = 0;
|
70 |
-
int bufCount = 0;
|
71 |
-
int batchEnd = min(batchPos + ip.binBatchSize, ip.triCount);
|
72 |
-
|
73 |
-
// loop over batch as long as we have triangles in it
|
74 |
-
do
|
75 |
-
{
|
76 |
-
// read more triangles
|
77 |
-
while (bufCount < CR_BIN_WARPS*32 && batchPos < batchEnd)
|
78 |
-
{
|
79 |
-
// get subtriangle count
|
80 |
-
|
81 |
-
int triIdx = batchPos + thrInBlock;
|
82 |
-
int num = 0;
|
83 |
-
if (triIdx < batchEnd)
|
84 |
-
num = triSubtris[triIdx];
|
85 |
-
|
86 |
-
// cumulative sum of subtriangles within each warp
|
87 |
-
U32 myIdx = __popc(__ballot_sync(~0u, num & 1) & getLaneMaskLt());
|
88 |
-
if (__any_sync(~0u, num > 1))
|
89 |
-
{
|
90 |
-
myIdx += __popc(__ballot_sync(~0u, num & 2) & getLaneMaskLt()) * 2;
|
91 |
-
myIdx += __popc(__ballot_sync(~0u, num & 4) & getLaneMaskLt()) * 4;
|
92 |
-
}
|
93 |
-
if (threadIdx.x == 31) // Do not assume that last thread in warp wins the write.
|
94 |
-
s_broadcast[threadIdx.y + 16] = myIdx + num;
|
95 |
-
__syncthreads();
|
96 |
-
|
97 |
-
// cumulative sum of per-warp subtriangle counts
|
98 |
-
// Note: cannot have more than 32 warps or this needs to sync between each step.
|
99 |
-
bool act = (thrInBlock < CR_BIN_WARPS);
|
100 |
-
U32 actMask = __ballot_sync(~0u, act);
|
101 |
-
if (threadIdx.y == 0 && act)
|
102 |
-
{
|
103 |
-
volatile U32* ptr = &s_broadcast[thrInBlock + 16];
|
104 |
-
U32 val = *ptr;
|
105 |
-
#if (CR_BIN_WARPS > 1)
|
106 |
-
val += ptr[-1]; __syncwarp(actMask);
|
107 |
-
*ptr = val; __syncwarp(actMask);
|
108 |
-
#endif
|
109 |
-
#if (CR_BIN_WARPS > 2)
|
110 |
-
val += ptr[-2]; __syncwarp(actMask);
|
111 |
-
*ptr = val; __syncwarp(actMask);
|
112 |
-
#endif
|
113 |
-
#if (CR_BIN_WARPS > 4)
|
114 |
-
val += ptr[-4]; __syncwarp(actMask);
|
115 |
-
*ptr = val; __syncwarp(actMask);
|
116 |
-
#endif
|
117 |
-
#if (CR_BIN_WARPS > 8)
|
118 |
-
val += ptr[-8]; __syncwarp(actMask);
|
119 |
-
*ptr = val; __syncwarp(actMask);
|
120 |
-
#endif
|
121 |
-
#if (CR_BIN_WARPS > 16)
|
122 |
-
val += ptr[-16]; __syncwarp(actMask);
|
123 |
-
*ptr = val; __syncwarp(actMask);
|
124 |
-
#endif
|
125 |
-
|
126 |
-
// initially assume that we consume everything
|
127 |
-
// only last active thread does the writes
|
128 |
-
if (threadIdx.x == CR_BIN_WARPS - 1)
|
129 |
-
{
|
130 |
-
s_batchPos = batchPos + CR_BIN_WARPS * 32;
|
131 |
-
s_bufCount = bufCount + val;
|
132 |
-
}
|
133 |
-
}
|
134 |
-
__syncthreads();
|
135 |
-
|
136 |
-
// skip if no subtriangles
|
137 |
-
if (num)
|
138 |
-
{
|
139 |
-
// calculate write position for first subtriangle
|
140 |
-
U32 pos = bufCount + myIdx + s_broadcast[threadIdx.y + 16 - 1];
|
141 |
-
|
142 |
-
// only write if entire triangle fits
|
143 |
-
if (pos + num <= CR_ARRAY_SIZE(s_triBuf))
|
144 |
-
{
|
145 |
-
pos += bufIndex; // adjust for current start position
|
146 |
-
pos &= CR_ARRAY_SIZE(s_triBuf)-1;
|
147 |
-
if (num == 1)
|
148 |
-
s_triBuf[pos] = triIdx * 8 + 7; // single triangle
|
149 |
-
else
|
150 |
-
{
|
151 |
-
for (int i=0; i < num; i++)
|
152 |
-
{
|
153 |
-
s_triBuf[pos] = triIdx * 8 + i;
|
154 |
-
pos++;
|
155 |
-
pos &= CR_ARRAY_SIZE(s_triBuf)-1;
|
156 |
-
}
|
157 |
-
}
|
158 |
-
} else if (pos <= CR_ARRAY_SIZE(s_triBuf))
|
159 |
-
{
|
160 |
-
// this triangle is the first that failed, overwrite total count and triangle count
|
161 |
-
s_batchPos = batchPos + thrInBlock;
|
162 |
-
s_bufCount = pos;
|
163 |
-
}
|
164 |
-
}
|
165 |
-
|
166 |
-
// update triangle counts
|
167 |
-
__syncthreads();
|
168 |
-
batchPos = s_batchPos;
|
169 |
-
bufCount = s_bufCount;
|
170 |
-
}
|
171 |
-
|
172 |
-
// make every warp clear its output buffers
|
173 |
-
for (int i=threadIdx.x; i < p.numBins; i += 32)
|
174 |
-
s_outMask[threadIdx.y][i] = 0;
|
175 |
-
__syncwarp();
|
176 |
-
|
177 |
-
// choose our triangle
|
178 |
-
uint4 triData = make_uint4(0, 0, 0, 0);
|
179 |
-
if (thrInBlock < bufCount)
|
180 |
-
{
|
181 |
-
U32 triPos = bufIndex + thrInBlock;
|
182 |
-
triPos &= CR_ARRAY_SIZE(s_triBuf)-1;
|
183 |
-
|
184 |
-
// find triangle
|
185 |
-
int triIdx = s_triBuf[triPos];
|
186 |
-
int dataIdx = triIdx >> 3;
|
187 |
-
int subtriIdx = triIdx & 7;
|
188 |
-
if (subtriIdx != 7)
|
189 |
-
dataIdx = triHeader[dataIdx].misc + subtriIdx;
|
190 |
-
|
191 |
-
// read triangle
|
192 |
-
|
193 |
-
triData = *(((const uint4*)triHeader) + dataIdx);
|
194 |
-
}
|
195 |
-
|
196 |
-
// setup bounding box and edge functions, and rasterize
|
197 |
-
S32 lox, loy, hix, hiy;
|
198 |
-
bool hasTri = (thrInBlock < bufCount);
|
199 |
-
U32 hasTriMask = __ballot_sync(~0u, hasTri);
|
200 |
-
if (hasTri)
|
201 |
-
{
|
202 |
-
S32 v0x = add_s16lo_s16lo(triData.x, p.widthPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
|
203 |
-
S32 v0y = add_s16hi_s16lo(triData.x, p.heightPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
|
204 |
-
S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
|
205 |
-
S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
|
206 |
-
S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
|
207 |
-
S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
|
208 |
-
int binLog = CR_BIN_LOG2 + CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
|
209 |
-
lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
|
210 |
-
loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
|
211 |
-
hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1);
|
212 |
-
hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
|
213 |
-
|
214 |
-
U32 bit = 1 << threadIdx.x;
|
215 |
-
#if __CUDA_ARCH__ >= 700
|
216 |
-
bool multi = (hix != lox || hiy != loy);
|
217 |
-
if (!__any_sync(hasTriMask, multi))
|
218 |
-
{
|
219 |
-
int binIdx = lox + p.widthBins * loy;
|
220 |
-
U32 mask = __match_any_sync(hasTriMask, binIdx);
|
221 |
-
s_outMask[threadIdx.y][binIdx] = mask;
|
222 |
-
__syncwarp(hasTriMask);
|
223 |
-
} else
|
224 |
-
#endif
|
225 |
-
{
|
226 |
-
bool complex = (hix > lox+1 || hiy > loy+1);
|
227 |
-
if (!__any_sync(hasTriMask, complex))
|
228 |
-
{
|
229 |
-
int binIdx = lox + p.widthBins * loy;
|
230 |
-
atomicOr((U32*)&s_outMask[threadIdx.y][binIdx], bit);
|
231 |
-
if (hix > lox) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + 1], bit);
|
232 |
-
if (hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins], bit);
|
233 |
-
if (hix > lox && hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins + 1], bit);
|
234 |
-
} else
|
235 |
-
{
|
236 |
-
S32 d12x = d02x - d01x, d12y = d02y - d01y;
|
237 |
-
v0x -= lox << binLog, v0y -= loy << binLog;
|
238 |
-
|
239 |
-
S32 t01 = v0x * d01y - v0y * d01x;
|
240 |
-
S32 t02 = v0y * d02x - v0x * d02y;
|
241 |
-
S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
|
242 |
-
S32 b01 = add_sub(t01 >> binLog, max(d01x, 0), min(d01y, 0));
|
243 |
-
S32 b02 = add_sub(t02 >> binLog, max(d02y, 0), min(d02x, 0));
|
244 |
-
S32 b12 = add_sub(t12 >> binLog, max(d12x, 0), min(d12y, 0));
|
245 |
-
|
246 |
-
int width = hix - lox + 1;
|
247 |
-
d01x += width * d01y;
|
248 |
-
d02x += width * d02y;
|
249 |
-
d12x += width * d12y;
|
250 |
-
|
251 |
-
U8* currPtr = (U8*)&s_outMask[threadIdx.y][lox + loy * p.widthBins];
|
252 |
-
U8* skipPtr = (U8*)&s_outMask[threadIdx.y][(hix + 1) + loy * p.widthBins];
|
253 |
-
U8* endPtr = (U8*)&s_outMask[threadIdx.y][lox + (hiy + 1) * p.widthBins];
|
254 |
-
int stride = p.widthBins * 4;
|
255 |
-
int ptrYInc = stride - width * 4;
|
256 |
-
|
257 |
-
do
|
258 |
-
{
|
259 |
-
if (b01 >= 0 && b02 >= 0 && b12 >= 0)
|
260 |
-
atomicOr((U32*)currPtr, bit);
|
261 |
-
currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
|
262 |
-
if (currPtr == skipPtr)
|
263 |
-
currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += stride;
|
264 |
-
}
|
265 |
-
while (currPtr != endPtr);
|
266 |
-
}
|
267 |
-
}
|
268 |
-
}
|
269 |
-
|
270 |
-
// count per-bin contributions
|
271 |
-
if (thrInBlock == 0)
|
272 |
-
s_overTotal = 0; // overflow counter
|
273 |
-
|
274 |
-
// ensure that out masks are done
|
275 |
-
__syncthreads();
|
276 |
-
|
277 |
-
int overIndex = -1;
|
278 |
-
bool act = (thrInBlock < p.numBins);
|
279 |
-
U32 actMask = __ballot_sync(~0u, act);
|
280 |
-
if (act)
|
281 |
-
{
|
282 |
-
U8* srcPtr = (U8*)&s_outMask[0][thrInBlock];
|
283 |
-
U8* dstPtr = (U8*)&s_outCount[0][thrInBlock];
|
284 |
-
int total = 0;
|
285 |
-
for (int i = 0; i < CR_BIN_WARPS; i++)
|
286 |
-
{
|
287 |
-
total += __popc(*(U32*)srcPtr);
|
288 |
-
*(U32*)dstPtr = total;
|
289 |
-
srcPtr += (CR_MAXBINS_SQR + 1) * 4;
|
290 |
-
dstPtr += (CR_MAXBINS_SQR + 1) * 4;
|
291 |
-
}
|
292 |
-
|
293 |
-
// overflow => request a new segment
|
294 |
-
int ofs = s_outOfs[thrInBlock];
|
295 |
-
bool ovr = (((ofs - 1) >> CR_BIN_SEG_LOG2) != (((ofs - 1) + total) >> CR_BIN_SEG_LOG2));
|
296 |
-
U32 ovrMask = __ballot_sync(actMask, ovr);
|
297 |
-
if (ovr)
|
298 |
-
{
|
299 |
-
overIndex = __popc(ovrMask & getLaneMaskLt());
|
300 |
-
if (overIndex == 0)
|
301 |
-
s_broadcast[threadIdx.y + 16] = atomicAdd((U32*)&s_overTotal, __popc(ovrMask));
|
302 |
-
__syncwarp(ovrMask);
|
303 |
-
overIndex += s_broadcast[threadIdx.y + 16];
|
304 |
-
s_overIndex[thrInBlock] = overIndex;
|
305 |
-
}
|
306 |
-
}
|
307 |
-
|
308 |
-
// sync after overTotal is ready
|
309 |
-
__syncthreads();
|
310 |
-
|
311 |
-
// at least one segment overflowed => allocate segments
|
312 |
-
U32 overTotal = s_overTotal;
|
313 |
-
U32 allocBase = 0;
|
314 |
-
if (overTotal > 0)
|
315 |
-
{
|
316 |
-
// allocate memory
|
317 |
-
if (thrInBlock == 0)
|
318 |
-
{
|
319 |
-
U32 allocBase = atomicAdd(&atomics.numBinSegs, overTotal);
|
320 |
-
s_allocBase = (allocBase + overTotal <= p.maxBinSegs) ? allocBase : 0;
|
321 |
-
}
|
322 |
-
__syncthreads();
|
323 |
-
allocBase = s_allocBase;
|
324 |
-
|
325 |
-
// did my bin overflow?
|
326 |
-
if (overIndex != -1)
|
327 |
-
{
|
328 |
-
// calculate new segment index
|
329 |
-
int segIdx = allocBase + overIndex;
|
330 |
-
|
331 |
-
// add to linked list
|
332 |
-
if (s_outOfs[thrInBlock] < 0)
|
333 |
-
binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = segIdx;
|
334 |
-
else
|
335 |
-
binSegNext[(s_outOfs[thrInBlock] - 1) >> CR_BIN_SEG_LOG2] = segIdx;
|
336 |
-
|
337 |
-
// defaults
|
338 |
-
binSegNext [segIdx] = -1;
|
339 |
-
binSegCount[segIdx] = CR_BIN_SEG_SIZE;
|
340 |
-
}
|
341 |
-
}
|
342 |
-
|
343 |
-
// concurrent emission -- each warp handles its own triangle
|
344 |
-
if (thrInBlock < bufCount)
|
345 |
-
{
|
346 |
-
int triPos = (bufIndex + thrInBlock) & (CR_ARRAY_SIZE(s_triBuf) - 1);
|
347 |
-
int currBin = lox + loy * p.widthBins;
|
348 |
-
int skipBin = (hix + 1) + loy * p.widthBins;
|
349 |
-
int endBin = lox + (hiy + 1) * p.widthBins;
|
350 |
-
int binYInc = p.widthBins - (hix - lox + 1);
|
351 |
-
|
352 |
-
// loop over triangle's bins
|
353 |
-
do
|
354 |
-
{
|
355 |
-
U32 outMask = s_outMask[threadIdx.y][currBin];
|
356 |
-
if (outMask & (1<<threadIdx.x))
|
357 |
-
{
|
358 |
-
int idx = __popc(outMask & getLaneMaskLt());
|
359 |
-
if (threadIdx.y > 0)
|
360 |
-
idx += s_outCount[threadIdx.y-1][currBin];
|
361 |
-
|
362 |
-
int base = s_outOfs[currBin];
|
363 |
-
int free = (-base) & (CR_BIN_SEG_SIZE - 1);
|
364 |
-
if (idx >= free)
|
365 |
-
idx += ((allocBase + s_overIndex[currBin]) << CR_BIN_SEG_LOG2) - free;
|
366 |
-
else
|
367 |
-
idx += base;
|
368 |
-
|
369 |
-
binSegData[idx] = s_triBuf[triPos];
|
370 |
-
}
|
371 |
-
|
372 |
-
currBin++;
|
373 |
-
if (currBin == skipBin)
|
374 |
-
currBin += binYInc, skipBin += p.widthBins;
|
375 |
-
}
|
376 |
-
while (currBin != endBin);
|
377 |
-
}
|
378 |
-
|
379 |
-
// wait all triangles to finish, then replace overflown segment offsets
|
380 |
-
__syncthreads();
|
381 |
-
if (thrInBlock < p.numBins)
|
382 |
-
{
|
383 |
-
U32 total = s_outCount[CR_BIN_WARPS - 1][thrInBlock];
|
384 |
-
U32 oldOfs = s_outOfs[thrInBlock];
|
385 |
-
if (overIndex == -1)
|
386 |
-
s_outOfs[thrInBlock] = oldOfs + total;
|
387 |
-
else
|
388 |
-
{
|
389 |
-
int addr = oldOfs + total;
|
390 |
-
addr = ((addr - 1) & (CR_BIN_SEG_SIZE - 1)) + 1;
|
391 |
-
addr += (allocBase + overIndex) << CR_BIN_SEG_LOG2;
|
392 |
-
s_outOfs[thrInBlock] = addr;
|
393 |
-
}
|
394 |
-
s_outTotal[thrInBlock] += total;
|
395 |
-
}
|
396 |
-
|
397 |
-
// these triangles are now done
|
398 |
-
int count = ::min(bufCount, CR_BIN_WARPS * 32);
|
399 |
-
bufCount -= count;
|
400 |
-
bufIndex += count;
|
401 |
-
bufIndex &= CR_ARRAY_SIZE(s_triBuf)-1;
|
402 |
-
}
|
403 |
-
while (bufCount > 0 || batchPos < batchEnd);
|
404 |
-
|
405 |
-
// flush all bins
|
406 |
-
if (thrInBlock < p.numBins)
|
407 |
-
{
|
408 |
-
int ofs = s_outOfs[thrInBlock];
|
409 |
-
if (ofs & (CR_BIN_SEG_SIZE-1))
|
410 |
-
{
|
411 |
-
int seg = ofs >> CR_BIN_SEG_LOG2;
|
412 |
-
binSegCount[seg] = ofs & (CR_BIN_SEG_SIZE-1);
|
413 |
-
s_outOfs[thrInBlock] = (ofs + CR_BIN_SEG_SIZE - 1) & -CR_BIN_SEG_SIZE;
|
414 |
-
}
|
415 |
-
}
|
416 |
-
}
|
417 |
-
|
418 |
-
// output totals
|
419 |
-
if (thrInBlock < p.numBins)
|
420 |
-
binTotal[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = s_outTotal[thrInBlock];
|
421 |
-
}
|
422 |
-
|
423 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.cpp
DELETED
@@ -1,94 +0,0 @@
|
|
1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
-
//
|
3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
4 |
-
// and proprietary rights in and to this software, related documentation
|
5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
6 |
-
// distribution of this software and related documentation without an express
|
7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
8 |
-
|
9 |
-
#include "../../framework.h"
|
10 |
-
#include "Buffer.hpp"
|
11 |
-
|
12 |
-
using namespace CR;
|
13 |
-
|
14 |
-
//------------------------------------------------------------------------
|
15 |
-
// GPU buffer.
|
16 |
-
//------------------------------------------------------------------------
|
17 |
-
|
18 |
-
Buffer::Buffer(void)
|
19 |
-
: m_gpuPtr(NULL),
|
20 |
-
m_bytes (0)
|
21 |
-
{
|
22 |
-
// empty
|
23 |
-
}
|
24 |
-
|
25 |
-
Buffer::~Buffer(void)
|
26 |
-
{
|
27 |
-
if (m_gpuPtr)
|
28 |
-
cudaFree(m_gpuPtr); // Don't throw an exception.
|
29 |
-
}
|
30 |
-
|
31 |
-
void Buffer::reset(size_t bytes)
|
32 |
-
{
|
33 |
-
if (bytes == m_bytes)
|
34 |
-
return;
|
35 |
-
|
36 |
-
if (m_gpuPtr)
|
37 |
-
{
|
38 |
-
NVDR_CHECK_CUDA_ERROR(cudaFree(m_gpuPtr));
|
39 |
-
m_gpuPtr = NULL;
|
40 |
-
}
|
41 |
-
|
42 |
-
if (bytes > 0)
|
43 |
-
NVDR_CHECK_CUDA_ERROR(cudaMalloc(&m_gpuPtr, bytes));
|
44 |
-
|
45 |
-
m_bytes = bytes;
|
46 |
-
}
|
47 |
-
|
48 |
-
void Buffer::grow(size_t bytes)
|
49 |
-
{
|
50 |
-
if (bytes > m_bytes)
|
51 |
-
reset(bytes);
|
52 |
-
}
|
53 |
-
|
54 |
-
//------------------------------------------------------------------------
|
55 |
-
// Host buffer with page-locked memory.
|
56 |
-
//------------------------------------------------------------------------
|
57 |
-
|
58 |
-
HostBuffer::HostBuffer(void)
|
59 |
-
: m_hostPtr(NULL),
|
60 |
-
m_bytes (0)
|
61 |
-
{
|
62 |
-
// empty
|
63 |
-
}
|
64 |
-
|
65 |
-
HostBuffer::~HostBuffer(void)
|
66 |
-
{
|
67 |
-
if (m_hostPtr)
|
68 |
-
cudaFreeHost(m_hostPtr); // Don't throw an exception.
|
69 |
-
}
|
70 |
-
|
71 |
-
void HostBuffer::reset(size_t bytes)
|
72 |
-
{
|
73 |
-
if (bytes == m_bytes)
|
74 |
-
return;
|
75 |
-
|
76 |
-
if (m_hostPtr)
|
77 |
-
{
|
78 |
-
NVDR_CHECK_CUDA_ERROR(cudaFreeHost(m_hostPtr));
|
79 |
-
m_hostPtr = NULL;
|
80 |
-
}
|
81 |
-
|
82 |
-
if (bytes > 0)
|
83 |
-
NVDR_CHECK_CUDA_ERROR(cudaMallocHost(&m_hostPtr, bytes));
|
84 |
-
|
85 |
-
m_bytes = bytes;
|
86 |
-
}
|
87 |
-
|
88 |
-
void HostBuffer::grow(size_t bytes)
|
89 |
-
{
|
90 |
-
if (bytes > m_bytes)
|
91 |
-
reset(bytes);
|
92 |
-
}
|
93 |
-
|
94 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.hpp
DELETED
@@ -1,55 +0,0 @@
|
|
1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
-
//
|
3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
4 |
-
// and proprietary rights in and to this software, related documentation
|
5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
6 |
-
// distribution of this software and related documentation without an express
|
7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
8 |
-
|
9 |
-
#pragma once
|
10 |
-
#include "Defs.hpp"
|
11 |
-
|
12 |
-
namespace CR
|
13 |
-
{
|
14 |
-
//------------------------------------------------------------------------
|
15 |
-
|
16 |
-
class Buffer
|
17 |
-
{
|
18 |
-
public:
|
19 |
-
Buffer (void);
|
20 |
-
~Buffer (void);
|
21 |
-
|
22 |
-
void reset (size_t bytes);
|
23 |
-
void grow (size_t bytes);
|
24 |
-
void* getPtr (size_t offset = 0) { return (void*)(((uintptr_t)m_gpuPtr) + offset); }
|
25 |
-
size_t getSize (void) const { return m_bytes; }
|
26 |
-
|
27 |
-
void setPtr (void* ptr) { m_gpuPtr = ptr; }
|
28 |
-
|
29 |
-
private:
|
30 |
-
void* m_gpuPtr;
|
31 |
-
size_t m_bytes;
|
32 |
-
};
|
33 |
-
|
34 |
-
//------------------------------------------------------------------------
|
35 |
-
|
36 |
-
class HostBuffer
|
37 |
-
{
|
38 |
-
public:
|
39 |
-
HostBuffer (void);
|
40 |
-
~HostBuffer (void);
|
41 |
-
|
42 |
-
void reset (size_t bytes);
|
43 |
-
void grow (size_t bytes);
|
44 |
-
void* getPtr (void) { return m_hostPtr; }
|
45 |
-
size_t getSize (void) const { return m_bytes; }
|
46 |
-
|
47 |
-
void setPtr (void* ptr) { m_hostPtr = ptr; }
|
48 |
-
|
49 |
-
private:
|
50 |
-
void* m_hostPtr;
|
51 |
-
size_t m_bytes;
|
52 |
-
};
|
53 |
-
|
54 |
-
//------------------------------------------------------------------------
|
55 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CoarseRaster.inl
DELETED
@@ -1,730 +0,0 @@
|
|
1 |
-
// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
-
//
|
3 |
-
// NVIDIA CORPORATION and its licensors retain all intellectual property
|
4 |
-
// and proprietary rights in and to this software, related documentation
|
5 |
-
// and any modifications thereto. Any use, reproduction, disclosure or
|
6 |
-
// distribution of this software and related documentation without an express
|
7 |
-
// license agreement from NVIDIA CORPORATION is strictly prohibited.
|
8 |
-
|
9 |
-
//------------------------------------------------------------------------
|
10 |
-
|
11 |
-
__device__ __inline__ int globalTileIdx(int tileInBin, int widthTiles)
|
12 |
-
{
|
13 |
-
int tileX = tileInBin & (CR_BIN_SIZE - 1);
|
14 |
-
int tileY = tileInBin >> CR_BIN_LOG2;
|
15 |
-
return tileX + tileY * widthTiles;
|
16 |
-
}
|
17 |
-
|
18 |
-
//------------------------------------------------------------------------
|
19 |
-
|
20 |
-
__device__ __inline__ void coarseRasterImpl(const CRParams p)
|
21 |
-
{
|
22 |
-
// Common.
|
23 |
-
|
24 |
-
__shared__ volatile U32 s_workCounter;
|
25 |
-
__shared__ volatile U32 s_scanTemp [CR_COARSE_WARPS][48]; // 3KB
|
26 |
-
|
27 |
-
// Input.
|
28 |
-
|
29 |
-
__shared__ volatile U32 s_binOrder [CR_MAXBINS_SQR]; // 1KB
|
30 |
-
__shared__ volatile S32 s_binStreamCurrSeg [CR_BIN_STREAMS_SIZE]; // 0KB
|
31 |
-
__shared__ volatile S32 s_binStreamFirstTri [CR_BIN_STREAMS_SIZE]; // 0KB
|
32 |
-
__shared__ volatile S32 s_triQueue [CR_COARSE_QUEUE_SIZE]; // 4KB
|
33 |
-
__shared__ volatile S32 s_triQueueWritePos;
|
34 |
-
__shared__ volatile U32 s_binStreamSelectedOfs;
|
35 |
-
__shared__ volatile U32 s_binStreamSelectedSize;
|
36 |
-
|
37 |
-
// Output.
|
38 |
-
|
39 |
-
__shared__ volatile U32 s_warpEmitMask [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions
|
40 |
-
__shared__ volatile U32 s_warpEmitPrefixSum [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions
|
41 |
-
__shared__ volatile U32 s_tileEmitPrefixSum [CR_BIN_SQR + 1]; // 1KB, zero at the beginning
|
42 |
-
__shared__ volatile U32 s_tileAllocPrefixSum[CR_BIN_SQR + 1]; // 1KB, zero at the beginning
|
43 |
-
__shared__ volatile S32 s_tileStreamCurrOfs [CR_BIN_SQR]; // 1KB
|
44 |
-
__shared__ volatile U32 s_firstAllocSeg;
|
45 |
-
__shared__ volatile U32 s_firstActiveIdx;
|
46 |
-
|
47 |
-
// Pointers and constants.
|
48 |
-
|
49 |
-
CRAtomics& atomics = p.atomics[blockIdx.z];
|
50 |
-
const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
|
51 |
-
const S32* binFirstSeg = (const S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
|
52 |
-
const S32* binTotal = (const S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
|
53 |
-
const S32* binSegData = (const S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
|
54 |
-
const S32* binSegNext = (const S32*)p.binSegNext + p.maxBinSegs * blockIdx.z;
|
55 |
-
const S32* binSegCount = (const S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
|
56 |
-
S32* activeTiles = (S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z;
|
57 |
-
S32* tileFirstSeg = (S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
|
58 |
-
S32* tileSegData = (S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
|
59 |
-
S32* tileSegNext = (S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z;
|
60 |
-
S32* tileSegCount = (S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
|
61 |
-
|
62 |
-
int tileLog = CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
|
63 |
-
int thrInBlock = threadIdx.x + threadIdx.y * 32;
|
64 |
-
int emitShift = CR_BIN_LOG2 * 2 + 5; // We scan ((numEmits << emitShift) | numAllocs) over tiles.
|
65 |
-
|
66 |
-
if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs)
|
67 |
-
return;
|
68 |
-
|
69 |
-
// Initialize sharedmem arrays.
|
70 |
-
|
71 |
-
if (thrInBlock == 0)
|
72 |
-
{
|
73 |
-
s_tileEmitPrefixSum[0] = 0;
|
74 |
-
s_tileAllocPrefixSum[0] = 0;
|
75 |
-
}
|
76 |
-
s_scanTemp[threadIdx.y][threadIdx.x] = 0;
|
77 |
-
|
78 |
-
// Sort bins in descending order of triangle count.
|
79 |
-
|
80 |
-
for (int binIdx = thrInBlock; binIdx < p.numBins; binIdx += CR_COARSE_WARPS * 32)
|
81 |
-
{
|
82 |
-
int count = 0;
|
83 |
-
for (int i = 0; i < CR_BIN_STREAMS_SIZE; i++)
|
84 |
-
count += binTotal[(binIdx << CR_BIN_STREAMS_LOG2) + i];
|
85 |
-
s_binOrder[binIdx] = (~count << (CR_MAXBINS_LOG2 * 2)) | binIdx;
|
86 |
-
}
|
87 |
-
|
88 |
-
__syncthreads();
|
89 |
-
sortShared(s_binOrder, p.numBins);
|
90 |
-
|
91 |
-
// Process each bin by one block.
|
92 |
-
|
93 |
-
for (;;)
|
94 |
-
{
|
95 |
-
// Pick a bin for the block.
|
96 |
-
|
97 |
-
if (thrInBlock == 0)
|
98 |
-
s_workCounter = atomicAdd(&atomics.coarseCounter, 1);
|
99 |
-
__syncthreads();
|
100 |
-
|
101 |
-
int workCounter = s_workCounter;
|
102 |
-
if (workCounter >= p.numBins)
|
103 |
-
break;
|
104 |
-
|
105 |
-
U32 binOrder = s_binOrder[workCounter];
|
106 |
-
bool binEmpty = ((~binOrder >> (CR_MAXBINS_LOG2 * 2)) == 0);
|
107 |
-
if (binEmpty && !p.deferredClear)
|
108 |
-
break;
|
109 |
-
|
110 |
-
int binIdx = binOrder & (CR_MAXBINS_SQR - 1);
|
111 |
-
|
112 |
-
// Initialize input/output streams.
|
113 |
-
|
114 |
-
int triQueueWritePos = 0;
|
115 |
-
int triQueueReadPos = 0;
|
116 |
-
|
117 |
-
if (thrInBlock < CR_BIN_STREAMS_SIZE)
|
118 |
-
{
|
119 |
-
int segIdx = binFirstSeg[(binIdx << CR_BIN_STREAMS_LOG2) + thrInBlock];
|
120 |
-
s_binStreamCurrSeg[thrInBlock] = segIdx;
|
121 |
-
s_binStreamFirstTri[thrInBlock] = (segIdx == -1) ? ~0u : binSegData[segIdx << CR_BIN_SEG_LOG2];
|
122 |
-
}
|
123 |
-
|
124 |
-
for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
|
125 |
-
s_tileStreamCurrOfs[tileInBin] = -CR_TILE_SEG_SIZE;
|
126 |
-
|
127 |
-
// Initialize per-bin state.
|
128 |
-
|
129 |
-
int binY = idiv_fast(binIdx, p.widthBins);
|
130 |
-
int binX = binIdx - binY * p.widthBins;
|
131 |
-
int originX = (binX << (CR_BIN_LOG2 + tileLog)) - (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
|
132 |
-
int originY = (binY << (CR_BIN_LOG2 + tileLog)) - (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
|
133 |
-
int maxTileXInBin = ::min(p.widthTiles - (binX << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
|
134 |
-
int maxTileYInBin = ::min(p.heightTiles - (binY << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
|
135 |
-
int binTileIdx = (binX + binY * p.widthTiles) << CR_BIN_LOG2;
|
136 |
-
|
137 |
-
// Entire block: Merge input streams and process triangles.
|
138 |
-
|
139 |
-
if (!binEmpty)
|
140 |
-
do
|
141 |
-
{
|
142 |
-
//------------------------------------------------------------------------
|
143 |
-
// Merge.
|
144 |
-
//------------------------------------------------------------------------
|
145 |
-
|
146 |
-
// Entire block: Not enough triangles => merge and queue segments.
|
147 |
-
// NOTE: The bin exit criterion assumes that we queue more triangles than we actually need.
|
148 |
-
|
149 |
-
while (triQueueWritePos - triQueueReadPos <= CR_COARSE_WARPS * 32)
|
150 |
-
{
|
151 |
-
// First warp: Choose the segment with the lowest initial triangle index.
|
152 |
-
|
153 |
-
bool hasStream = (thrInBlock < CR_BIN_STREAMS_SIZE);
|
154 |
-
U32 hasStreamMask = __ballot_sync(~0u, hasStream);
|
155 |
-
if (hasStream)
|
156 |
-
{
|
157 |
-
// Find the stream with the lowest triangle index.
|
158 |
-
|
159 |
-
U32 firstTri = s_binStreamFirstTri[thrInBlock];
|
160 |
-
U32 t = firstTri;
|
161 |
-
volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
|
162 |
-
|
163 |
-
#if (CR_BIN_STREAMS_SIZE > 1)
|
164 |
-
v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-1]); __syncwarp(hasStreamMask);
|
165 |
-
#endif
|
166 |
-
#if (CR_BIN_STREAMS_SIZE > 2)
|
167 |
-
v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-2]); __syncwarp(hasStreamMask);
|
168 |
-
#endif
|
169 |
-
#if (CR_BIN_STREAMS_SIZE > 4)
|
170 |
-
v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-4]); __syncwarp(hasStreamMask);
|
171 |
-
#endif
|
172 |
-
#if (CR_BIN_STREAMS_SIZE > 8)
|
173 |
-
v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-8]); __syncwarp(hasStreamMask);
|
174 |
-
#endif
|
175 |
-
#if (CR_BIN_STREAMS_SIZE > 16)
|
176 |
-
v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-16]); __syncwarp(hasStreamMask);
|
177 |
-
#endif
|
178 |
-
v[0] = t; __syncwarp(hasStreamMask);
|
179 |
-
|
180 |
-
// Consume and broadcast.
|
181 |
-
|
182 |
-
bool first = (s_scanTemp[0][CR_BIN_STREAMS_SIZE - 1 + 16] == firstTri);
|
183 |
-
U32 firstMask = __ballot_sync(hasStreamMask, first);
|
184 |
-
if (first && (firstMask >> threadIdx.x) == 1u)
|
185 |
-
{
|
186 |
-
int segIdx = s_binStreamCurrSeg[thrInBlock];
|
187 |
-
s_binStreamSelectedOfs = segIdx << CR_BIN_SEG_LOG2;
|
188 |
-
if (segIdx != -1)
|
189 |
-
{
|
190 |
-
int segSize = binSegCount[segIdx];
|
191 |
-
int segNext = binSegNext[segIdx];
|
192 |
-
s_binStreamSelectedSize = segSize;
|
193 |
-
s_triQueueWritePos = triQueueWritePos + segSize;
|
194 |
-
s_binStreamCurrSeg[thrInBlock] = segNext;
|
195 |
-
s_binStreamFirstTri[thrInBlock] = (segNext == -1) ? ~0u : binSegData[segNext << CR_BIN_SEG_LOG2];
|
196 |
-
}
|
197 |
-
}
|
198 |
-
}
|
199 |
-
|
200 |
-
// No more segments => break.
|
201 |
-
|
202 |
-
__syncthreads();
|
203 |
-
triQueueWritePos = s_triQueueWritePos;
|
204 |
-
int segOfs = s_binStreamSelectedOfs;
|
205 |
-
if (segOfs < 0)
|
206 |
-
break;
|
207 |
-
|
208 |
-
int segSize = s_binStreamSelectedSize;
|
209 |
-
__syncthreads();
|
210 |
-
|
211 |
-
// Fetch triangles into the queue.
|
212 |
-
|
213 |
-
for (int idxInSeg = CR_COARSE_WARPS * 32 - 1 - thrInBlock; idxInSeg < segSize; idxInSeg += CR_COARSE_WARPS * 32)
|
214 |
-
{
|
215 |
-
S32 triIdx = binSegData[segOfs + idxInSeg];
|
216 |
-
s_triQueue[(triQueueWritePos - segSize + idxInSeg) & (CR_COARSE_QUEUE_SIZE - 1)] = triIdx;
|
217 |
-
}
|
218 |
-
}
|
219 |
-
|
220 |
-
// All threads: Clear emit masks.
|
221 |
-
|
222 |
-
for (int maskIdx = thrInBlock; maskIdx < CR_COARSE_WARPS * CR_BIN_SQR; maskIdx += CR_COARSE_WARPS * 32)
|
223 |
-
s_warpEmitMask[maskIdx >> (CR_BIN_LOG2 * 2)][maskIdx & (CR_BIN_SQR - 1)] = 0;
|
224 |
-
|
225 |
-
__syncthreads();
|
226 |
-
|
227 |
-
//------------------------------------------------------------------------
|
228 |
-
// Raster.
|
229 |
-
//------------------------------------------------------------------------
|
230 |
-
|
231 |
-
// Triangle per thread: Read from the queue.
|
232 |
-
|
233 |
-
int triIdx = -1;
|
234 |
-
if (triQueueReadPos + thrInBlock < triQueueWritePos)
|
235 |
-
triIdx = s_triQueue[(triQueueReadPos + thrInBlock) & (CR_COARSE_QUEUE_SIZE - 1)];
|
236 |
-
|
237 |
-
uint4 triData = make_uint4(0, 0, 0, 0);
|
238 |
-
if (triIdx != -1)
|
239 |
-
{
|
240 |
-
int dataIdx = triIdx >> 3;
|
241 |
-
int subtriIdx = triIdx & 7;
|
242 |
-
if (subtriIdx != 7)
|
243 |
-
dataIdx = triHeader[dataIdx].misc + subtriIdx;
|
244 |
-
triData = *((uint4*)triHeader + dataIdx);
|
245 |
-
}
|
246 |
-
|
247 |
-
// 32 triangles per warp: Record emits (= tile intersections).
|
248 |
-
|
249 |
-
if (__any_sync(~0u, triIdx != -1))
|
250 |
-
{
|
251 |
-
S32 v0x = sub_s16lo_s16lo(triData.x, originX);
|
252 |
-
S32 v0y = sub_s16hi_s16lo(triData.x, originY);
|
253 |
-
S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
|
254 |
-
S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
|
255 |
-
S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
|
256 |
-
S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
|
257 |
-
|
258 |
-
// Compute tile-based AABB.
|
259 |
-
|
260 |
-
int lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
|
261 |
-
int loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
|
262 |
-
int hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
|
263 |
-
int hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
|
264 |
-
int sizex = add_sub(hix, 1, lox);
|
265 |
-
int sizey = add_sub(hiy, 1, loy);
|
266 |
-
int area = sizex * sizey;
|
267 |
-
|
268 |
-
// Miscellaneous init.
|
269 |
-
|
270 |
-
U8* currPtr = (U8*)&s_warpEmitMask[threadIdx.y][lox + (loy << CR_BIN_LOG2)];
|
271 |
-
int ptrYInc = CR_BIN_SIZE * 4 - (sizex << 2);
|
272 |
-
U32 maskBit = 1 << threadIdx.x;
|
273 |
-
|
274 |
-
// Case A: All AABBs are small => record the full AABB using atomics.
|
275 |
-
|
276 |
-
if (__all_sync(~0u, sizex <= 2 && sizey <= 2))
|
277 |
-
{
|
278 |
-
if (triIdx != -1)
|
279 |
-
{
|
280 |
-
atomicOr((U32*)currPtr, maskBit);
|
281 |
-
if (sizex == 2) atomicOr((U32*)(currPtr + 4), maskBit);
|
282 |
-
if (sizey == 2) atomicOr((U32*)(currPtr + CR_BIN_SIZE * 4), maskBit);
|
283 |
-
if (sizex == 2 && sizey == 2) atomicOr((U32*)(currPtr + 4 + CR_BIN_SIZE * 4), maskBit);
|
284 |
-
}
|
285 |
-
}
|
286 |
-
else
|
287 |
-
{
|
288 |
-
// Compute warp-AABB (scan-32).
|
289 |
-
|
290 |
-
U32 aabbMask = add_sub(2 << hix, 0x20000 << hiy, 1 << lox) - (0x10000 << loy);
|
291 |
-
if (triIdx == -1)
|
292 |
-
aabbMask = 0;
|
293 |
-
|
294 |
-
volatile U32* v = &s_scanTemp[threadIdx.y][threadIdx.x + 16];
|
295 |
-
v[0] = aabbMask; __syncwarp(); aabbMask |= v[-1]; __syncwarp();
|
296 |
-
v[0] = aabbMask; __syncwarp(); aabbMask |= v[-2]; __syncwarp();
|
297 |
-
v[0] = aabbMask; __syncwarp(); aabbMask |= v[-4]; __syncwarp();
|
298 |
-
v[0] = aabbMask; __syncwarp(); aabbMask |= v[-8]; __syncwarp();
|
299 |
-
v[0] = aabbMask; __syncwarp(); aabbMask |= v[-16]; __syncwarp();
|
300 |
-
v[0] = aabbMask; __syncwarp(); aabbMask = s_scanTemp[threadIdx.y][47];
|
301 |
-
|
302 |
-
U32 maskX = aabbMask & 0xFFFF;
|
303 |
-
U32 maskY = aabbMask >> 16;
|
304 |
-
int wlox = findLeadingOne(maskX ^ (maskX - 1));
|
305 |
-
int wloy = findLeadingOne(maskY ^ (maskY - 1));
|
306 |
-
int whix = findLeadingOne(maskX);
|
307 |
-
int whiy = findLeadingOne(maskY);
|
308 |
-
int warea = (add_sub(whix, 1, wlox)) * (add_sub(whiy, 1, wloy));
|
309 |
-
|
310 |
-
// Initialize edge functions.
|
311 |
-
|
312 |
-
S32 d12x = d02x - d01x;
|
313 |
-
S32 d12y = d02y - d01y;
|
314 |
-
v0x -= lox << tileLog;
|
315 |
-
v0y -= loy << tileLog;
|
316 |
-
|
317 |
-
S32 t01 = v0x * d01y - v0y * d01x;
|
318 |
-
S32 t02 = v0y * d02x - v0x * d02y;
|
319 |
-
S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
|
320 |
-
S32 b01 = add_sub(t01 >> tileLog, ::max(d01x, 0), ::min(d01y, 0));
|
321 |
-
S32 b02 = add_sub(t02 >> tileLog, ::max(d02y, 0), ::min(d02x, 0));
|
322 |
-
S32 b12 = add_sub(t12 >> tileLog, ::max(d12x, 0), ::min(d12y, 0));
|
323 |
-
|
324 |
-
d01x += sizex * d01y;
|
325 |
-
d02x += sizex * d02y;
|
326 |
-
d12x += sizex * d12y;
|
327 |
-
|
328 |
-
// Case B: Warp-AABB is not much larger than largest AABB => Check tiles in warp-AABB, record using ballots.
|
329 |
-
if (__any_sync(~0u, warea * 4 <= area * 8))
|
330 |
-
{
|
331 |
-
// Not sure if this is any faster than Case C after all the post-Volta ballot mask tracking.
|
332 |
-
bool act = (triIdx != -1);
|
333 |
-
U32 actMask = __ballot_sync(~0u, act);
|
334 |
-
if (act)
|
335 |
-
{
|
336 |
-
for (int y = wloy; y <= whiy; y++)
|
337 |
-
{
|
338 |
-
bool yIn = (y >= loy && y <= hiy);
|
339 |
-
U32 yMask = __ballot_sync(actMask, yIn);
|
340 |
-
if (yIn)
|
341 |
-
{
|
342 |
-
for (int x = wlox; x <= whix; x++)
|
343 |
-
{
|
344 |
-
bool xyIn = (x >= lox && x <= hix);
|
345 |
-
U32 xyMask = __ballot_sync(yMask, xyIn);
|
346 |
-
if (xyIn)
|
347 |
-
{
|
348 |
-
U32 res = __ballot_sync(xyMask, b01 >= 0 && b02 >= 0 && b12 >= 0);
|
349 |
-
if (threadIdx.x == 31 - __clz(xyMask))
|
350 |
-
*(U32*)currPtr = res;
|
351 |
-
currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
|
352 |
-
}
|
353 |
-
}
|
354 |
-
currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x;
|
355 |
-
}
|
356 |
-
}
|
357 |
-
}
|
358 |
-
}
|
359 |
-
|
360 |
-
// Case C: General case => Check tiles in AABB, record using atomics.
|
361 |
-
|
362 |
-
else
|
363 |
-
{
|
364 |
-
if (triIdx != -1)
|
365 |
-
{
|
366 |
-
U8* skipPtr = currPtr + (sizex << 2);
|
367 |
-
U8* endPtr = currPtr + (sizey << (CR_BIN_LOG2 + 2));
|
368 |
-
do
|
369 |
-
{
|
370 |
-
if (b01 >= 0 && b02 >= 0 && b12 >= 0)
|
371 |
-
atomicOr((U32*)currPtr, maskBit);
|
372 |
-
currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
|
373 |
-
if (currPtr == skipPtr)
|
374 |
-
currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += CR_BIN_SIZE * 4;
|
375 |
-
}
|
376 |
-
while (currPtr != endPtr);
|
377 |
-
}
|
378 |
-
}
|
379 |
-
}
|
380 |
-
}
|
381 |
-
|
382 |
-
__syncthreads();
|
383 |
-
|
384 |
-
//------------------------------------------------------------------------
|
385 |
-
// Count.
|
386 |
-
//------------------------------------------------------------------------
|
387 |
-
|
388 |
-
// Tile per thread: Initialize prefix sums.
|
389 |
-
|
390 |
-
for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
|
391 |
-
{
|
392 |
-
int tileInBin = tileInBin_base + thrInBlock;
|
393 |
-
bool act = (tileInBin < CR_BIN_SQR);
|
394 |
-
U32 actMask = __ballot_sync(~0u, act);
|
395 |
-
if (act)
|
396 |
-
{
|
397 |
-
// Compute prefix sum of emits over warps.
|
398 |
-
|
399 |
-
U8* srcPtr = (U8*)&s_warpEmitMask[0][tileInBin];
|
400 |
-
U8* dstPtr = (U8*)&s_warpEmitPrefixSum[0][tileInBin];
|
401 |
-
int tileEmits = 0;
|
402 |
-
for (int i = 0; i < CR_COARSE_WARPS; i++)
|
403 |
-
{
|
404 |
-
tileEmits += __popc(*(U32*)srcPtr);
|
405 |
-
*(U32*)dstPtr = tileEmits;
|
406 |
-
srcPtr += (CR_BIN_SQR + 1) * 4;
|
407 |
-
dstPtr += (CR_BIN_SQR + 1) * 4;
|
408 |
-
}
|
409 |
-
|
410 |
-
// Determine the number of segments to allocate.
|
411 |
-
|
412 |
-
int spaceLeft = -s_tileStreamCurrOfs[tileInBin] & (CR_TILE_SEG_SIZE - 1);
|
413 |
-
int tileAllocs = (tileEmits - spaceLeft + CR_TILE_SEG_SIZE - 1) >> CR_TILE_SEG_LOG2;
|
414 |
-
volatile U32* v = &s_tileEmitPrefixSum[tileInBin + 1];
|
415 |
-
|
416 |
-
// All counters within the warp are small => compute prefix sum using ballot.
|
417 |
-
|
418 |
-
if (!__any_sync(actMask, tileEmits >= 2))
|
419 |
-
{
|
420 |
-
U32 m = getLaneMaskLe();
|
421 |
-
*v = (__popc(__ballot_sync(actMask, tileEmits & 1) & m) << emitShift) | __popc(__ballot_sync(actMask, tileAllocs & 1) & m);
|
422 |
-
}
|
423 |
-
|
424 |
-
// Otherwise => scan-32 within the warp.
|
425 |
-
|
426 |
-
else
|
427 |
-
{
|
428 |
-
U32 sum = (tileEmits << emitShift) | tileAllocs;
|
429 |
-
*v = sum; __syncwarp(actMask); if (threadIdx.x >= 1) sum += v[-1]; __syncwarp(actMask);
|
430 |
-
*v = sum; __syncwarp(actMask); if (threadIdx.x >= 2) sum += v[-2]; __syncwarp(actMask);
|
431 |
-
*v = sum; __syncwarp(actMask); if (threadIdx.x >= 4) sum += v[-4]; __syncwarp(actMask);
|
432 |
-
*v = sum; __syncwarp(actMask); if (threadIdx.x >= 8) sum += v[-8]; __syncwarp(actMask);
|
433 |
-
*v = sum; __syncwarp(actMask); if (threadIdx.x >= 16) sum += v[-16]; __syncwarp(actMask);
|
434 |
-
*v = sum; __syncwarp(actMask);
|
435 |
-
}
|
436 |
-
}
|
437 |
-
}
|
438 |
-
|
439 |
-
// First warp: Scan-8.
|
440 |
-
|
441 |
-
__syncthreads();
|
442 |
-
|
443 |
-
bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
|
444 |
-
U32 scan8Mask = __ballot_sync(~0u, scan8);
|
445 |
-
if (scan8)
|
446 |
-
{
|
447 |
-
int sum = s_tileEmitPrefixSum[(thrInBlock << 5) + 32];
|
448 |
-
volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
|
449 |
-
v[0] = sum; __syncwarp(scan8Mask);
|
450 |
-
#if (CR_BIN_SQR > 1 * 32)
|
451 |
-
sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
|
452 |
-
#endif
|
453 |
-
#if (CR_BIN_SQR > 2 * 32)
|
454 |
-
sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
|
455 |
-
#endif
|
456 |
-
#if (CR_BIN_SQR > 4 * 32)
|
457 |
-
sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
|
458 |
-
#endif
|
459 |
-
}
|
460 |
-
|
461 |
-
__syncthreads();
|
462 |
-
|
463 |
-
// Tile per thread: Finalize prefix sums.
|
464 |
-
// Single thread: Allocate segments.
|
465 |
-
|
466 |
-
for (int tileInBin = thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
|
467 |
-
{
|
468 |
-
int sum = s_tileEmitPrefixSum[tileInBin + 1] + s_scanTemp[0][(tileInBin >> 5) + 15];
|
469 |
-
int numEmits = sum >> emitShift;
|
470 |
-
int numAllocs = sum & ((1 << emitShift) - 1);
|
471 |
-
s_tileEmitPrefixSum[tileInBin + 1] = numEmits;
|
472 |
-
s_tileAllocPrefixSum[tileInBin + 1] = numAllocs;
|
473 |
-
|
474 |
-
if (tileInBin == CR_BIN_SQR - 1 && numAllocs != 0)
|
475 |
-
{
|
476 |
-
int t = atomicAdd(&atomics.numTileSegs, numAllocs);
|
477 |
-
s_firstAllocSeg = (t + numAllocs <= p.maxTileSegs) ? t : 0;
|
478 |
-
}
|
479 |
-
}
|
480 |
-
|
481 |
-
__syncthreads();
|
482 |
-
int firstAllocSeg = s_firstAllocSeg;
|
483 |
-
int totalEmits = s_tileEmitPrefixSum[CR_BIN_SQR];
|
484 |
-
int totalAllocs = s_tileAllocPrefixSum[CR_BIN_SQR];
|
485 |
-
|
486 |
-
//------------------------------------------------------------------------
|
487 |
-
// Emit.
|
488 |
-
//------------------------------------------------------------------------
|
489 |
-
|
490 |
-
// Emit per thread: Write triangle index to globalmem.
|
491 |
-
|
492 |
-
for (int emitInBin = thrInBlock; emitInBin < totalEmits; emitInBin += CR_COARSE_WARPS * 32)
|
493 |
-
{
|
494 |
-
// Find tile in bin.
|
495 |
-
|
496 |
-
U8* tileBase = (U8*)&s_tileEmitPrefixSum[0];
|
497 |
-
U8* tilePtr = tileBase;
|
498 |
-
U8* ptr;
|
499 |
-
|
500 |
-
#if (CR_BIN_SQR > 128)
|
501 |
-
ptr = tilePtr + 0x80 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
|
502 |
-
#endif
|
503 |
-
#if (CR_BIN_SQR > 64)
|
504 |
-
ptr = tilePtr + 0x40 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
|
505 |
-
#endif
|
506 |
-
#if (CR_BIN_SQR > 32)
|
507 |
-
ptr = tilePtr + 0x20 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
|
508 |
-
#endif
|
509 |
-
#if (CR_BIN_SQR > 16)
|
510 |
-
ptr = tilePtr + 0x10 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
|
511 |
-
#endif
|
512 |
-
#if (CR_BIN_SQR > 8)
|
513 |
-
ptr = tilePtr + 0x08 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
|
514 |
-
#endif
|
515 |
-
#if (CR_BIN_SQR > 4)
|
516 |
-
ptr = tilePtr + 0x04 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
|
517 |
-
#endif
|
518 |
-
#if (CR_BIN_SQR > 2)
|
519 |
-
ptr = tilePtr + 0x02 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
|
520 |
-
#endif
|
521 |
-
#if (CR_BIN_SQR > 1)
|
522 |
-
ptr = tilePtr + 0x01 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
|
523 |
-
#endif
|
524 |
-
|
525 |
-
int tileInBin = (tilePtr - tileBase) >> 2;
|
526 |
-
int emitInTile = emitInBin - *(U32*)tilePtr;
|
527 |
-
|
528 |
-
// Find warp in tile.
|
529 |
-
|
530 |
-
int warpStep = (CR_BIN_SQR + 1) * 4;
|
531 |
-
U8* warpBase = (U8*)&s_warpEmitPrefixSum[0][tileInBin] - warpStep;
|
532 |
-
U8* warpPtr = warpBase;
|
533 |
-
|
534 |
-
#if (CR_COARSE_WARPS > 8)
|
535 |
-
ptr = warpPtr + 0x08 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
|
536 |
-
#endif
|
537 |
-
#if (CR_COARSE_WARPS > 4)
|
538 |
-
ptr = warpPtr + 0x04 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
|
539 |
-
#endif
|
540 |
-
#if (CR_COARSE_WARPS > 2)
|
541 |
-
ptr = warpPtr + 0x02 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
|
542 |
-
#endif
|
543 |
-
#if (CR_COARSE_WARPS > 1)
|
544 |
-
ptr = warpPtr + 0x01 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
|
545 |
-
#endif
|
546 |
-
|
547 |
-
int warpInTile = (warpPtr - warpBase) >> (CR_BIN_LOG2 * 2 + 2);
|
548 |
-
U32 emitMask = *(U32*)(warpPtr + warpStep + ((U8*)s_warpEmitMask - (U8*)s_warpEmitPrefixSum));
|
549 |
-
int emitInWarp = emitInTile - *(U32*)(warpPtr + warpStep) + __popc(emitMask);
|
550 |
-
|
551 |
-
// Find thread in warp.
|
552 |
-
|
553 |
-
int threadInWarp = 0;
|
554 |
-
int pop = __popc(emitMask & 0xFFFF);
|
555 |
-
bool pred = (emitInWarp >= pop);
|
556 |
-
if (pred) emitInWarp -= pop;
|
557 |
-
if (pred) emitMask >>= 0x10;
|
558 |
-
if (pred) threadInWarp += 0x10;
|
559 |
-
|
560 |
-
pop = __popc(emitMask & 0xFF);
|
561 |
-
pred = (emitInWarp >= pop);
|
562 |
-
if (pred) emitInWarp -= pop;
|
563 |
-
if (pred) emitMask >>= 0x08;
|
564 |
-
if (pred) threadInWarp += 0x08;
|
565 |
-
|
566 |
-
pop = __popc(emitMask & 0xF);
|
567 |
-
pred = (emitInWarp >= pop);
|
568 |
-
if (pred) emitInWarp -= pop;
|
569 |
-
if (pred) emitMask >>= 0x04;
|
570 |
-
if (pred) threadInWarp += 0x04;
|
571 |
-
|
572 |
-
pop = __popc(emitMask & 0x3);
|
573 |
-
pred = (emitInWarp >= pop);
|
574 |
-
if (pred) emitInWarp -= pop;
|
575 |
-
if (pred) emitMask >>= 0x02;
|
576 |
-
if (pred) threadInWarp += 0x02;
|
577 |
-
|
578 |
-
if (emitInWarp >= (emitMask & 1))
|
579 |
-
threadInWarp++;
|
580 |
-
|
581 |
-
// Figure out where to write.
|
582 |
-
|
583 |
-
int currOfs = s_tileStreamCurrOfs[tileInBin];
|
584 |
-
int spaceLeft = -currOfs & (CR_TILE_SEG_SIZE - 1);
|
585 |
-
int outOfs = emitInTile;
|
586 |
-
|
587 |
-
if (outOfs < spaceLeft)
|
588 |
-
outOfs += currOfs;
|
589 |
-
else
|
590 |
-
{
|
591 |
-
int allocLo = firstAllocSeg + s_tileAllocPrefixSum[tileInBin];
|
592 |
-
outOfs += (allocLo << CR_TILE_SEG_LOG2) - spaceLeft;
|
593 |
-
}
|
594 |
-
|
595 |
-
// Write.
|
596 |
-
|
597 |
-
int queueIdx = warpInTile * 32 + threadInWarp;
|
598 |
-
int triIdx = s_triQueue[(triQueueReadPos + queueIdx) & (CR_COARSE_QUEUE_SIZE - 1)];
|
599 |
-
|
600 |
-
tileSegData[outOfs] = triIdx;
|
601 |
-
}
|
602 |
-
|
603 |
-
//------------------------------------------------------------------------
|
604 |
-
// Patch.
|
605 |
-
//------------------------------------------------------------------------
|
606 |
-
|
607 |
-
// Allocated segment per thread: Initialize next-pointer and count.
|
608 |
-
|
609 |
-
for (int i = CR_COARSE_WARPS * 32 - 1 - thrInBlock; i < totalAllocs; i += CR_COARSE_WARPS * 32)
|
610 |
-
{
|
611 |
-
int segIdx = firstAllocSeg + i;
|
612 |
-
tileSegNext[segIdx] = segIdx + 1;
|
613 |
-
tileSegCount[segIdx] = CR_TILE_SEG_SIZE;
|
614 |
-
}
|
615 |
-
|
616 |
-
// Tile per thread: Fix previous segment's next-pointer and update s_tileStreamCurrOfs.
|
617 |
-
|
618 |
-
__syncthreads();
|
619 |
-
for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
|
620 |
-
{
|
621 |
-
int oldOfs = s_tileStreamCurrOfs[tileInBin];
|
622 |
-
int newOfs = oldOfs + s_warpEmitPrefixSum[CR_COARSE_WARPS - 1][tileInBin];
|
623 |
-
int allocLo = s_tileAllocPrefixSum[tileInBin];
|
624 |
-
int allocHi = s_tileAllocPrefixSum[tileInBin + 1];
|
625 |
-
|
626 |
-
if (allocLo != allocHi)
|
627 |
-
{
|
628 |
-
S32* nextPtr = &tileSegNext[(oldOfs - 1) >> CR_TILE_SEG_LOG2];
|
629 |
-
if (oldOfs < 0)
|
630 |
-
nextPtr = &tileFirstSeg[binTileIdx + globalTileIdx(tileInBin, p.widthTiles)];
|
631 |
-
*nextPtr = firstAllocSeg + allocLo;
|
632 |
-
|
633 |
-
newOfs--;
|
634 |
-
newOfs &= CR_TILE_SEG_SIZE - 1;
|
635 |
-
newOfs |= (firstAllocSeg + allocHi - 1) << CR_TILE_SEG_LOG2;
|
636 |
-
newOfs++;
|
637 |
-
}
|
638 |
-
s_tileStreamCurrOfs[tileInBin] = newOfs;
|
639 |
-
}
|
640 |
-
|
641 |
-
// Advance queue read pointer.
|
642 |
-
// Queue became empty => bin done.
|
643 |
-
|
644 |
-
triQueueReadPos += CR_COARSE_WARPS * 32;
|
645 |
-
}
|
646 |
-
while (triQueueReadPos < triQueueWritePos);
|
647 |
-
|
648 |
-
// Tile per thread: Fix next-pointer and count of the last segment.
|
649 |
-
// 32 tiles per warp: Count active tiles.
|
650 |
-
|
651 |
-
__syncthreads();
|
652 |
-
|
653 |
-
for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
|
654 |
-
{
|
655 |
-
int tileInBin = tileInBin_base + thrInBlock;
|
656 |
-
bool act = (tileInBin < CR_BIN_SQR);
|
657 |
-
U32 actMask = __ballot_sync(~0u, act);
|
658 |
-
if (act)
|
659 |
-
{
|
660 |
-
int tileX = tileInBin & (CR_BIN_SIZE - 1);
|
661 |
-
int tileY = tileInBin >> CR_BIN_LOG2;
|
662 |
-
bool force = (p.deferredClear & tileX <= maxTileXInBin & tileY <= maxTileYInBin);
|
663 |
-
|
664 |
-
int ofs = s_tileStreamCurrOfs[tileInBin];
|
665 |
-
int segIdx = (ofs - 1) >> CR_TILE_SEG_LOG2;
|
666 |
-
int segCount = ofs & (CR_TILE_SEG_SIZE - 1);
|
667 |
-
|
668 |
-
if (ofs >= 0)
|
669 |
-
tileSegNext[segIdx] = -1;
|
670 |
-
else if (force)
|
671 |
-
{
|
672 |
-
s_tileStreamCurrOfs[tileInBin] = 0;
|
673 |
-
tileFirstSeg[binTileIdx + tileX + tileY * p.widthTiles] = -1;
|
674 |
-
}
|
675 |
-
|
676 |
-
if (segCount != 0)
|
677 |
-
tileSegCount[segIdx] = segCount;
|
678 |
-
|
679 |
-
U32 res = __ballot_sync(actMask, ofs >= 0 | force);
|
680 |
-
if (threadIdx.x == 0)
|
681 |
-
s_scanTemp[0][(tileInBin >> 5) + 16] = __popc(res);
|
682 |
-
}
|
683 |
-
}
|
684 |
-
|
685 |
-
// First warp: Scan-8.
|
686 |
-
// One thread: Allocate space for active tiles.
|
687 |
-
|
688 |
-
__syncthreads();
|
689 |
-
|
690 |
-
bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
|
691 |
-
U32 scan8Mask = __ballot_sync(~0u, scan8);
|
692 |
-
if (scan8)
|
693 |
-
{
|
694 |
-
volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
|
695 |
-
U32 sum = v[0];
|
696 |
-
#if (CR_BIN_SQR > 1 * 32)
|
697 |
-
sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
|
698 |
-
#endif
|
699 |
-
#if (CR_BIN_SQR > 2 * 32)
|
700 |
-
sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
|
701 |
-
#endif
|
702 |
-
#if (CR_BIN_SQR > 4 * 32)
|
703 |
-
sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
|
704 |
-
#endif
|
705 |
-
|
706 |
-
if (thrInBlock == CR_BIN_SQR / 32 - 1)
|
707 |
-
s_firstActiveIdx = atomicAdd(&atomics.numActiveTiles, sum);
|
708 |
-
}
|
709 |
-
|
710 |
-
// Tile per thread: Output active tiles.
|
711 |
-
|
712 |
-
__syncthreads();
|
713 |
-
|
714 |
-
for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
|
715 |
-
{
|
716 |
-
int tileInBin = tileInBin_base + thrInBlock;
|
717 |
-
bool act = (tileInBin < CR_BIN_SQR) && (s_tileStreamCurrOfs[tileInBin] >= 0);
|
718 |
-
U32 actMask = __ballot_sync(~0u, act);
|
719 |
-
if (act)
|
720 |
-
{
|
721 |
-
int activeIdx = s_firstActiveIdx;
|
722 |
-
activeIdx += s_scanTemp[0][(tileInBin >> 5) + 15];
|
723 |
-
activeIdx += __popc(actMask & getLaneMaskLt());
|
724 |
-
activeTiles[activeIdx] = binTileIdx + globalTileIdx(tileInBin, p.widthTiles);
|
725 |
-
}
|
726 |
-
}
|
727 |
-
}
|
728 |
-
}
|
729 |
-
|
730 |
-
//------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|