Spaces:

geyik1
/

dnm3d

Running

App Files Community

geyik1 committed on Jan 31

Commit

01b6284

verified ·

1 Parent(s): 226248d

Delete ginipick:SORA-3D

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

ginipick:SORA-3D/3d.mp4 +0 -3
ginipick:SORA-3D/README.md +0 -11
ginipick:SORA-3D/app.py +0 -2
ginipick:SORA-3D/assets/.DS_Store +0 -0
ginipick:SORA-3D/assets/example_image/.DS_Store +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T120910.945.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133209.680.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133232.481.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133327.828.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133551.674.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133554.085.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133942.986.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133945.143.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134251.217.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134253.975.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134602.793.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134606.919.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T050638.566.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T102148.803.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T124050.873.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125348.492.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125709.810.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125745.419.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T131128.626.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T174905.915.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184202.582.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184251.254.webp +0 -3
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184336.200.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184407.431.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184511.907.webp +0 -3
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184535.205.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184804.224.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T033838.708.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T034054.527.webp +0 -0
ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T034505.337.webp +0 -0
ginipick:SORA-3D/extensions/.DS_Store +0 -0
ginipick:SORA-3D/extensions/extensions_nvdiffrast_LICENSE.txt +0 -97
ginipick:SORA-3D/extensions/extensions_nvdiffrast_README.md +0 -42
ginipick:SORA-3D/extensions/extensions_nvdiffrast_run_sample.sh +0 -52
ginipick:SORA-3D/extensions/extensions_nvdiffrast_setup copy.py +0 -51
ginipick:SORA-3D/extensions/extensions_nvdiffrast_setup.py +0 -82
ginipick:SORA-3D/extensions/nvdiffrast/.DS_Store +0 -0
ginipick:SORA-3D/extensions/nvdiffrast/common/.DS_Store +0 -0
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/.DS_Store +0 -0
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/extensions_nvdiffrast_nvdiffrast_common_cudaraster_CudaRaster.hpp +0 -63
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/.DS_Store +0 -0
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_BinRaster.inl +0 -423
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.cpp +0 -94
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.hpp +0 -55
ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CoarseRaster.inl +0 -730

ginipick:SORA-3D/3d.mp4 DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6c3282465210bac76f44b605956139679ed774c8bad9be686707d1b770961371
-size 21309978

ginipick:SORA-3D/README.md DELETED Viewed

@@ -1,11 +0,0 @@
----
-title: SORA 3D
-emoji: 🏢🏆
-colorFrom: indigo
-colorTo: blue
-sdk: gradio
-sdk_version: 4.44.1
-app_file: app.py
-pinned: false
-short_description: Create top-quality 3D(.GLB) models from text or images
----

ginipick:SORA-3D/app.py DELETED Viewed

@@ -1,2 +0,0 @@
-import os
-exec(os.environ.get('APP'))

ginipick:SORA-3D/assets/.DS_Store DELETED Viewed

Binary file (6.15 kB)

ginipick:SORA-3D/assets/example_image/.DS_Store DELETED Viewed

Binary file (12.3 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T120910.945.webp DELETED Viewed

Binary file (93 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133209.680.webp DELETED Viewed

Binary file (48.5 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133232.481.webp DELETED Viewed

Binary file (63.9 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133327.828.webp DELETED Viewed

Binary file (20.7 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133551.674.webp DELETED Viewed

Binary file (42.9 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133554.085.webp DELETED Viewed

Binary file (32.6 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133942.986.webp DELETED Viewed

Binary file (41.9 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T133945.143.webp DELETED Viewed

Binary file (42.4 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134251.217.webp DELETED Viewed

Binary file (14 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134253.975.webp DELETED Viewed

Binary file (51 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134602.793.webp DELETED Viewed

Binary file (33.3 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-08T134606.919.webp DELETED Viewed

Binary file (56.1 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T050638.566.webp DELETED Viewed

Binary file (58.6 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T102148.803.webp DELETED Viewed

Binary file (11 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T124050.873.webp DELETED Viewed

Binary file (58.5 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125348.492.webp DELETED Viewed

Binary file (47.6 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125709.810.webp DELETED Viewed

Binary file (24.7 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T125745.419.webp DELETED Viewed

Binary file (38.7 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T131128.626.webp DELETED Viewed

Binary file (49.6 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T174905.915.webp DELETED Viewed

Binary file (46.2 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184202.582.webp DELETED Viewed

Binary file (63.1 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184251.254.webp DELETED Viewed

Git LFS Details

SHA256: 04a741b7588b46f6f885987fa3330d51f671d7f372eedf3cc007e69fd1a2e3e9
Pointer size: 131 Bytes
Size of remote file: 113 kB

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184336.200.webp DELETED Viewed

Binary file (33.4 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184407.431.webp DELETED Viewed

Binary file (95 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184511.907.webp DELETED Viewed

Git LFS Details

SHA256: f5cbfa61ca24164cafbd695aa6f12b617196a64f913e5c9964fad60a74dedda6
Pointer size: 131 Bytes
Size of remote file: 101 kB

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184535.205.webp DELETED Viewed

Binary file (57.5 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-09T184804.224.webp DELETED Viewed

Binary file (88.8 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T033838.708.webp DELETED Viewed

Binary file (19 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T034054.527.webp DELETED Viewed

Binary file (12.6 kB)

ginipick:SORA-3D/assets/example_image/assets_example_image_image - 2024-12-10T034505.337.webp DELETED Viewed

Binary file (8.18 kB)

ginipick:SORA-3D/extensions/.DS_Store DELETED Viewed

Binary file (6.15 kB)

ginipick:SORA-3D/extensions/extensions_nvdiffrast_LICENSE.txt DELETED Viewed

@@ -1,97 +0,0 @@
-Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
-Nvidia Source Code License (1-Way Commercial)
-=======================================================================
-1. Definitions
-"Licensor" means any person or entity that distributes its Work.
-"Software" means the original work of authorship made available under
-this License.
-"Work" means the Software and any additions to or derivative works of
-the Software that are made available under this License.
-The terms "reproduce," "reproduction," "derivative works," and
-"distribution" have the meaning as provided under U.S. copyright law;
-provided, however, that for the purposes of this License, derivative
-works shall not include works that remain separable from, or merely
-link (or bind by name) to the interfaces of, the Work.
-Works, including the Software, are "made available" under this License
-by including in or with the Work either (a) a copyright notice
-referencing the applicability of this License to the Work, or (b) a
-copy of this License.
-2. License Grants
-    2.1 Copyright Grant. Subject to the terms and conditions of this
-    License, each Licensor grants to you a perpetual, worldwide,
-    non-exclusive, royalty-free, copyright license to reproduce,
-    prepare derivative works of, publicly display, publicly perform,
-    sublicense and distribute its Work and any resulting derivative
-    works in any form.
-3. Limitations
-    3.1 Redistribution. You may reproduce or distribute the Work only
-    if (a) you do so under this License, (b) you include a complete
-    copy of this License with your distribution, and (c) you retain
-    without modification any copyright, patent, trademark, or
-    attribution notices that are present in the Work.
-    3.2 Derivative Works. You may specify that additional or different
-    terms apply to the use, reproduction, and distribution of your
-    derivative works of the Work ("Your Terms") only if (a) Your Terms
-    provide that the use limitation in Section 3.3 applies to your
-    derivative works, and (b) you identify the specific derivative
-    works that are subject to Your Terms. Notwithstanding Your Terms,
-    this License (including the redistribution requirements in Section
-    3.1) will continue to apply to the Work itself.
-    3.3 Use Limitation. The Work and any derivative works thereof only
-    may be used or intended for use non-commercially. The Work or
-    derivative works thereof may be used or intended for use by Nvidia
-    or its affiliates commercially or non-commercially. As used herein,
-    "non-commercially" means for research or evaluation purposes only
-    and not for any direct or indirect monetary gain.
-    3.4 Patent Claims. If you bring or threaten to bring a patent claim
-    against any Licensor (including any claim, cross-claim or
-    counterclaim in a lawsuit) to enforce any patents that you allege
-    are infringed by any Work, then your rights under this License from
-    such Licensor (including the grant in Section 2.1) will terminate
-    immediately.
-    3.5 Trademarks. This License does not grant any rights to use any
-    Licensor's or its affiliates' names, logos, or trademarks, except
-    as necessary to reproduce the notices described in this License.
-    3.6 Termination. If you violate any term of this License, then your
-    rights under this License (including the grant in Section 2.1) will
-    terminate immediately.
-4. Disclaimer of Warranty.
-THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
-KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
-NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
-THIS LICENSE.
-5. Limitation of Liability.
-EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
-THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
-SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
-INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
-OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
-(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
-LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
-COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
-THE POSSIBILITY OF SUCH DAMAGES.
-=======================================================================

ginipick:SORA-3D/extensions/extensions_nvdiffrast_README.md DELETED Viewed

@@ -1,42 +0,0 @@
-## Nvdiffrast &ndash; Modular Primitives for High-Performance Differentiable Rendering
-![Teaser image](./docs/img/teaser.png)
-**Modular Primitives for High-Performance Differentiable Rendering**<br>
-Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila<br>
-[http://arxiv.org/abs/2011.03277](http://arxiv.org/abs/2011.03277)
-Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering.
-Please refer to &#x261E;&#x261E; [nvdiffrast documentation](https://nvlabs.github.io/nvdiffrast) &#x261C;&#x261C; for more information.
-## Licenses
-Copyright &copy; 2020&ndash;2024, NVIDIA Corporation. All rights reserved.
-This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt).
-For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/)
-We do not currently accept outside code contributions in the form of pull requests.
-Environment map stored as part of `samples/data/envphong.npz` is derived from a Wave Engine
-[sample material](https://github.com/WaveEngine/Samples-2.5/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap)
-originally shared under
-[MIT License](https://github.com/WaveEngine/Samples-2.5/blob/master/LICENSE.md).
-Mesh and texture stored as part of `samples/data/earth.npz` are derived from
-[3D Earth Photorealistic 2K](https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125)
-model originally made available under
-[TurboSquid 3D Model License](https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license).
-## Citation
-```
-@article{Laine2020diffrast,
-  title   = {Modular Primitives for High-Performance Differentiable Rendering},
-  author  = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila},
-  journal = {ACM Transactions on Graphics},
-  year    = {2020},
-  volume  = {39},
-  number  = {6}
-}
-```

ginipick:SORA-3D/extensions/extensions_nvdiffrast_run_sample.sh DELETED Viewed

@@ -1,52 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# NVIDIA CORPORATION and its licensors retain all intellectual property
-# and proprietary rights in and to this software, related documentation
-# and any modifications thereto.  Any use, reproduction, disclosure or
-# distribution of this software and related documentation without an express
-# license agreement from NVIDIA CORPORATION is strictly prohibited.
-function print_help {
-    echo "Usage: `basename $0` [--build-container] <python_file>"
-    echo ""
-    echo "Option --build-container will build the Docker container based on"
-    echo "docker/Dockerfile and tag the image with gltorch:latest."
-    echo ""
-    echo "Example: `basename $0` samples/torch/envphong.py"
-}
-build_container=0
-sample=""
-while [[ "$#" -gt 0 ]]; do
-    case $1 in
-        --build-container) build_container=1;;
-        -h|--help) print_help; exit 0 ;;
-        --*) echo "Unknown parameter passed: $1"; exit 1 ;;
-        *) sample="$1"; shift; break;
-    esac
-    shift
-done
-rest=$@
-# Build the docker container
-if [ "$build_container" = "1" ]; then
-    docker build --tag gltorch:latest -f docker/Dockerfile .
-fi
-if [ ! -f "$sample" ]; then
-    echo
-    echo "No python sample given or file '$sample' not found.  Exiting."
-    exit 1
-fi
-image="gltorch:latest"
-echo "Using container image: $image"
-echo "Running command: $sample $rest"
-# Run a sample with docker
-docker run --rm -it --gpus all --user $(id -u):$(id -g) \
-    -v `pwd`:/app --workdir /app -e TORCH_EXTENSIONS_DIR=/app/tmp $image python3 $sample $rest

ginipick:SORA-3D/extensions/extensions_nvdiffrast_setup copy.py DELETED Viewed

@@ -1,51 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# NVIDIA CORPORATION and its licensors retain all intellectual property
-# and proprietary rights in and to this software, related documentation
-# and any modifications thereto.  Any use, reproduction, disclosure or
-# distribution of this software and related documentation without an express
-# license agreement from NVIDIA CORPORATION is strictly prohibited.
-import nvdiffrast
-import setuptools
-import os
-with open("README.md", "r") as fh:
-    long_description = fh.read()
-setuptools.setup(
-    name="nvdiffrast",
-    version=nvdiffrast.__version__,
-    author="Samuli Laine",
-    author_email="slaine@nvidia.com",
-    description="nvdiffrast - modular primitives for high-performance differentiable rendering",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/NVlabs/nvdiffrast",
-    packages=setuptools.find_packages(),
-    package_data={
-        'nvdiffrast': [
-            'common/*.h',
-            'common/*.inl',
-            'common/*.cu',
-            'common/*.cpp',
-            'common/cudaraster/*.hpp',
-            'common/cudaraster/impl/*.cpp',
-            'common/cudaraster/impl/*.hpp',
-            'common/cudaraster/impl/*.inl',
-            'common/cudaraster/impl/*.cu',
-            'lib/*.h',
-            'torch/*.h',
-            'torch/*.inl',
-            'torch/*.cpp',
-            'tensorflow/*.cu',
-        ] + (['lib/*.lib'] if os.name == 'nt' else [])
-    },
-    include_package_data=True,
-    install_requires=['numpy'],  # note: can't require torch here as it will install torch even for a TensorFlow container
-    classifiers=[
-        "Programming Language :: Python :: 3",
-        "Operating System :: OS Independent",
-    ],
-    python_requires='>=3.6',
-)

ginipick:SORA-3D/extensions/extensions_nvdiffrast_setup.py DELETED Viewed

@@ -1,82 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# NVIDIA CORPORATION and its licensors retain all intellectual property
-# and proprietary rights in and to this software, related documentation
-# and any modifications thereto.  Any use, reproduction, disclosure or
-# distribution of this software and related documentation without an express
-# license agreement from NVIDIA CORPORATION is strictly prohibited.
-import nvdiffrast
-import setuptools
-import os
-from torch.utils.cpp_extension import CUDAExtension, BuildExtension
-with open("README.md", "r") as fh:
-    long_description = fh.read()
-setuptools.setup(
-    name="nvdiffrast",
-    version=nvdiffrast.__version__,
-    author="Samuli Laine",
-    author_email="slaine@nvidia.com",
-    description="nvdiffrast - modular primitives for high-performance differentiable rendering",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/NVlabs/nvdiffrast",
-    packages=setuptools.find_packages(),
-    # package_data={
-    #     'nvdiffrast': [
-    #         'common/*.h',
-    #         'common/*.inl',
-    #         'common/*.cu',
-    #         'common/*.cpp',
-    #         'common/cudaraster/*.hpp',
-    #         'common/cudaraster/impl/*.cpp',
-    #         'common/cudaraster/impl/*.hpp',
-    #         'common/cudaraster/impl/*.inl',
-    #         'common/cudaraster/impl/*.cu',
-    #         'lib/*.h',
-    #         'torch/*.h',
-    #         'torch/*.inl',
-    #         'torch/*.cpp',
-    #         'tensorflow/*.cu',
-    #     ] + (['lib/*.lib'] if os.name == 'nt' else [])
-    # },
-    # include_package_data=True,
-    ext_modules=[
-        CUDAExtension(
-            name="nvdiffrast.torch._C",
-            sources=[
-                'nvdiffrast/common/cudaraster/impl/Buffer.cpp',
-                'nvdiffrast/common/cudaraster/impl/CudaRaster.cpp',
-                'nvdiffrast/common/cudaraster/impl/RasterImpl_.cu',
-                'nvdiffrast/common/cudaraster/impl/RasterImpl.cpp',
-                'nvdiffrast/common/common.cpp',
-                'nvdiffrast/common/rasterize.cu',
-                'nvdiffrast/common/interpolate.cu',
-                'nvdiffrast/common/texture_.cu',
-                'nvdiffrast/common/texture.cpp',
-                'nvdiffrast/common/antialias.cu',
-                'nvdiffrast/torch/torch_bindings.cpp',
-                'nvdiffrast/torch/torch_rasterize.cpp',
-                'nvdiffrast/torch/torch_interpolate.cpp',
-                'nvdiffrast/torch/torch_texture.cpp',
-                'nvdiffrast/torch/torch_antialias.cpp',
-            ],
-            extra_compile_args={
-                'cxx': ['-DNVDR_TORCH'],
-                'nvcc': ['-DNVDR_TORCH', '-lineinfo'],
-            },
-        )
-    ],
-    cmdclass={
-        'build_ext': BuildExtension
-    },
-    install_requires=['numpy'],  # note: can't require torch here as it will install torch even for a TensorFlow container
-    classifiers=[
-        "Programming Language :: Python :: 3",
-        "Operating System :: OS Independent",
-    ],
-    python_requires='>=3.6',
-)

ginipick:SORA-3D/extensions/nvdiffrast/.DS_Store DELETED Viewed

Binary file (8.2 kB)

ginipick:SORA-3D/extensions/nvdiffrast/common/.DS_Store DELETED Viewed

Binary file (10.2 kB)

ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/.DS_Store DELETED Viewed

Binary file (6.15 kB)

ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/extensions_nvdiffrast_nvdiffrast_common_cudaraster_CudaRaster.hpp DELETED Viewed

@@ -1,63 +0,0 @@
-// Copyright (c) 2009-2022, NVIDIA CORPORATION.  All rights reserved.
-//
-// NVIDIA CORPORATION and its licensors retain all intellectual property
-// and proprietary rights in and to this software, related documentation
-// and any modifications thereto.  Any use, reproduction, disclosure or
-// distribution of this software and related documentation without an express
-// license agreement from NVIDIA CORPORATION is strictly prohibited.
-#pragma once
-//------------------------------------------------------------------------
-// This is a slimmed-down and modernized version of the original
-// CudaRaster codebase that accompanied the HPG 2011 paper
-// "High-Performance Software Rasterization on GPUs" by Laine and Karras.
-// Modifications have been made to accommodate post-Volta execution model
-// with warp divergence. Support for shading, blending, quad rendering,
-// and supersampling have been removed as unnecessary for nvdiffrast.
-//------------------------------------------------------------------------
-namespace CR
-{
-class RasterImpl;
-//------------------------------------------------------------------------
-// Interface class to isolate user from implementation details.
-//------------------------------------------------------------------------
-class CudaRaster
-{
-public:
-    enum
-    {
-        RenderModeFlag_EnableBackfaceCulling = 1 << 0,   // Enable backface culling.
-        RenderModeFlag_EnableDepthPeeling    = 1 << 1,   // Enable depth peeling. Must have a peel buffer set.
-    };
-public:
-					        CudaRaster				(void);
-					        ~CudaRaster				(void);
-    void                    setBufferSize           (int width, int height, int numImages);              // Width and height are internally rounded up to multiples of tile size (8x8) for buffer sizes.
-    void                    setViewport             (int width, int height, int offsetX, int offsetY);   // Tiled rendering viewport setup.
-    void                    setRenderModeFlags      (unsigned int renderModeFlags);                      // Affects all subsequent calls to drawTriangles(). Defaults to zero.
-    void                    deferredClear           (unsigned int clearColor);                           // Clears color and depth buffers during next call to drawTriangles().
-    void                    setVertexBuffer         (void* vertices, int numVertices);                   // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w).
-    void                    setIndexBuffer          (void* indices, int numTriangles);                   // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color).
-    bool                    drawTriangles           (const int* ranges, bool peel, cudaStream_t stream); // Ranges (offsets and counts) as #triangles entries, not as bytes. If NULL, draw all triangles. Returns false in case of internal overflow.
-    void*                   getColorBuffer          (void);                                              // GPU pointer managed by CudaRaster.
-    void*                   getDepthBuffer          (void);                                              // GPU pointer managed by CudaRaster.
-    void                    swapDepthAndPeel        (void);                                              // Swap depth and peeling buffers.
-private:
-					        CudaRaster           	(const CudaRaster&); // forbidden
-	CudaRaster&             operator=           	(const CudaRaster&); // forbidden
-private:
-    RasterImpl*             m_impl;                 // Opaque pointer to implementation.
-};
-//------------------------------------------------------------------------
-} // namespace CR

ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/.DS_Store DELETED Viewed

Binary file (10.2 kB)

ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_BinRaster.inl DELETED Viewed

@@ -1,423 +0,0 @@
-// Copyright (c) 2009-2022, NVIDIA CORPORATION.  All rights reserved.
-//
-// NVIDIA CORPORATION and its licensors retain all intellectual property
-// and proprietary rights in and to this software, related documentation
-// and any modifications thereto.  Any use, reproduction, disclosure or
-// distribution of this software and related documentation without an express
-// license agreement from NVIDIA CORPORATION is strictly prohibited.
-//------------------------------------------------------------------------
-__device__ __inline__ void binRasterImpl(const CRParams p) // Bin stage: pulls batches of triangles, marks which bins each (sub)triangle covers, and appends the triangle entries to per-bin segment lists (binSegData / binSegNext / binSegCount).
-{
-    __shared__ volatile U32 s_broadcast [CR_BIN_WARPS + 16]; // per-warp scratch at [16 + warp]; the 16 leading entries stay zero so the scan below can read ptr[-k] unconditionally
-    __shared__ volatile S32 s_outOfs    [CR_MAXBINS_SQR]; // per-bin write offset into binSegData; negative until the bin receives its first segment
-    __shared__ volatile S32 s_outTotal  [CR_MAXBINS_SQR]; // per-bin running count of emitted entries, stored to binTotal at the end
-    __shared__ volatile S32 s_overIndex [CR_MAXBINS_SQR]; // per-bin index of the newly requested segment within the current allocation
-    __shared__ volatile S32 s_outMask   [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // per-warp, per-bin lane bitmask of triangles covering the bin; +1 to avoid bank collisions
-    __shared__ volatile S32 s_outCount  [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // per-bin popcount of s_outMask accumulated over warps 0..w; +1 to avoid bank collisions
-    __shared__ volatile S32 s_triBuf    [CR_BIN_WARPS*32*4];                // triangle ring buffer
-    __shared__ volatile U32 s_batchPos; // next triangle index to read, shared by the whole block
-    __shared__ volatile U32 s_bufCount; // number of valid entries currently in s_triBuf
-    __shared__ volatile U32 s_overTotal; // number of bins that requested a new segment in the current round
-    __shared__ volatile U32 s_allocBase; // first segment index of the current allocation (0 if the allocation did not fit)
-    const CRImageParams&    ip              = getImageParams(p, blockIdx.z);
-    CRAtomics&              atomics         = p.atomics[blockIdx.z];
-    const U8*               triSubtris      = (const U8*)p.triSubtris + p.maxSubtris * blockIdx.z; // all global arrays below are offset by blockIdx.z
-    const CRTriangleHeader* triHeader       = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
-    S32*                    binFirstSeg     = (S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
-    S32*                    binTotal        = (S32*)p.binTotal    + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
-    S32*                    binSegData      = (S32*)p.binSegData  + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
-    S32*                    binSegNext      = (S32*)p.binSegNext  + p.maxBinSegs * blockIdx.z;
-    S32*                    binSegCount     = (S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
-    if (atomics.numSubtris > p.maxSubtris) // subtriangle count exceeds capacity: skip binning entirely
-        return;
-    // per-thread state
-    int thrInBlock = threadIdx.x + threadIdx.y * 32; // linear thread index; threadIdx.y acts as the warp index throughout
-    int batchPos = 0;
-    // first 16 elements of s_broadcast are always zero
-    if (thrInBlock < 16)
-        s_broadcast[thrInBlock] = 0;
-    // initialize output linked lists and offsets
-    if (thrInBlock < p.numBins)
-    {
-        binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = -1; // -1 marks an empty list for this (bin, blockIdx.x) slot
-        s_outOfs[thrInBlock] = -CR_BIN_SEG_SIZE;
-        s_outTotal[thrInBlock] = 0;
-    }
-    // repeat until done
-    for(;;)
-    {
-        // get batch
-        if (thrInBlock == 0)
-            s_batchPos = atomicAdd(&atomics.binCounter, ip.binBatchSize); // claim the next ip.binBatchSize triangles
-        __syncthreads();
-        batchPos = s_batchPos;
-        // all batches done?
-        if (batchPos >= ip.triCount)
-            break;
-        // per-thread state
-        int bufIndex = 0; // read position in the s_triBuf ring
-        int bufCount = 0; // entries currently buffered
-        int batchEnd = min(batchPos + ip.binBatchSize, ip.triCount);
-        // loop over batch as long as we have triangles in it
-        do
-        {
-            // read more triangles
-            while (bufCount < CR_BIN_WARPS*32 && batchPos < batchEnd)
-            {
-                // get subtriangle count
-                int triIdx = batchPos + thrInBlock;
-                int num = 0;
-                if (triIdx < batchEnd)
-                    num = triSubtris[triIdx];
-                // cumulative sum of subtriangles within each warp
-                U32 myIdx = __popc(__ballot_sync(~0u, num & 1) & getLaneMaskLt()); // exclusive prefix built bit by bit; only bits 0..2 of num are scanned
-                if (__any_sync(~0u, num > 1))
-                {
-                    myIdx += __popc(__ballot_sync(~0u, num & 2) & getLaneMaskLt()) * 2;
-                    myIdx += __popc(__ballot_sync(~0u, num & 4) & getLaneMaskLt()) * 4;
-                }
-                if (threadIdx.x == 31) // Do not assume that last thread in warp wins the write.
-                    s_broadcast[threadIdx.y + 16] = myIdx + num; // warp total
-                __syncthreads();
-                // cumulative sum of per-warp subtriangle counts
-                // Note: cannot have more than 32 warps or this needs to sync between each step.
-                bool act = (thrInBlock < CR_BIN_WARPS);
-                U32 actMask = __ballot_sync(~0u, act);
-                if (threadIdx.y == 0 && act)
-                {
-                    volatile U32* ptr = &s_broadcast[thrInBlock + 16];
-                    U32 val = *ptr;
-                    #if (CR_BIN_WARPS > 1)
-                        val += ptr[-1]; __syncwarp(actMask);
-                        *ptr = val;     __syncwarp(actMask);
-                    #endif
-                    #if (CR_BIN_WARPS > 2)
-                        val += ptr[-2]; __syncwarp(actMask);
-                        *ptr = val;     __syncwarp(actMask);
-                    #endif
-                    #if (CR_BIN_WARPS > 4)
-                        val += ptr[-4]; __syncwarp(actMask);
-                        *ptr = val;     __syncwarp(actMask);
-                    #endif
-                    #if (CR_BIN_WARPS > 8)
-                        val += ptr[-8]; __syncwarp(actMask);
-                        *ptr = val;     __syncwarp(actMask);
-                    #endif
-                    #if (CR_BIN_WARPS > 16)
-                        val += ptr[-16]; __syncwarp(actMask);
-                        *ptr = val;      __syncwarp(actMask);
-                    #endif
-                    // initially assume that we consume everything
-                    // only last active thread does the writes
-                    if (threadIdx.x == CR_BIN_WARPS - 1)
-                    {
-                        s_batchPos = batchPos + CR_BIN_WARPS * 32;
-                        s_bufCount = bufCount + val;
-                    }
-                }
-                __syncthreads();
-                // skip if no subtriangles
-                if (num)
-                {
-                    // calculate write position for first subtriangle
-                    U32 pos = bufCount + myIdx + s_broadcast[threadIdx.y + 16 - 1]; // adds the inclusive total of the preceding warp (zero padding for warp 0)
-                    // only write if entire triangle fits
-                    if (pos + num <= CR_ARRAY_SIZE(s_triBuf))
-                    {
-                        pos += bufIndex; // adjust for current start position
-                        pos &= CR_ARRAY_SIZE(s_triBuf)-1;
-                        if (num == 1)
-                            s_triBuf[pos] = triIdx * 8 + 7; // low 3 bits == 7 mark a triangle without subtriangles (see the decode below)
-                        else
-                        {
-                            for (int i=0; i < num; i++)
-                            {
-                                s_triBuf[pos] = triIdx * 8 + i; // low 3 bits carry the subtriangle index
-                                pos++;
-                                pos &= CR_ARRAY_SIZE(s_triBuf)-1;
-                            }
-                        }
-                    } else if (pos <= CR_ARRAY_SIZE(s_triBuf))
-                    {
-                        // this triangle is the first that failed, overwrite total count and triangle count
-                        s_batchPos = batchPos + thrInBlock;
-                        s_bufCount = pos;
-                    }
-                }
-                // update triangle counts
-                __syncthreads();
-                batchPos = s_batchPos;
-                bufCount = s_bufCount;
-            }
-            // make every warp clear its output buffers
-            for (int i=threadIdx.x; i < p.numBins; i += 32)
-                s_outMask[threadIdx.y][i] = 0;
-            __syncwarp();
-            // choose our triangle
-            uint4 triData = make_uint4(0, 0, 0, 0);
-            if (thrInBlock < bufCount)
-            {
-                U32 triPos = bufIndex + thrInBlock;
-                triPos &= CR_ARRAY_SIZE(s_triBuf)-1;
-                // find triangle
-                int triIdx = s_triBuf[triPos];
-                int dataIdx = triIdx >> 3;
-                int subtriIdx = triIdx & 7;
-                if (subtriIdx != 7)
-                    dataIdx = triHeader[dataIdx].misc + subtriIdx; // subtriangle headers are located through the parent's misc field
-                // read triangle
-                triData = *(((const uint4*)triHeader) + dataIdx); // header is read as one 16-byte uint4
-            }
-            // setup bounding box and edge functions, and rasterize
-            S32 lox, loy, hix, hiy; // bounding box in bin coordinates; only assigned when hasTri is true
-            bool hasTri = (thrInBlock < bufCount);
-            U32 hasTriMask = __ballot_sync(~0u, hasTri);
-            if (hasTri)
-            {
-                S32 v0x = add_s16lo_s16lo(triData.x, p.widthPixelsVp  * (CR_SUBPIXEL_SIZE >> 1)); // NOTE(review): presumably vertex x/y are packed as the low/high 16 bits of each word -- inferred from helper names, confirm
-                S32 v0y = add_s16hi_s16lo(triData.x, p.heightPixelsVp * (CR_SUBPIXEL_SIZE >> 1));
-                S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
-                S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
-                S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
-                S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
-                int binLog = CR_BIN_LOG2 + CR_TILE_LOG2 + CR_SUBPIXEL_LOG2; // shift that converts v0x/v0y units into bin indices
-                lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> binLog, 0, p.widthBins  - 1);
-                loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
-                hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> binLog, 0, p.widthBins  - 1);
-                hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1);
-                U32 bit = 1 << threadIdx.x; // this lane's bit in the per-warp bin masks
-#if __CUDA_ARCH__ >= 700
-                bool multi = (hix != lox || hiy != loy); // bounding box spans more than one bin
-                if (!__any_sync(hasTriMask, multi))
-                {
-                    int binIdx = lox + p.widthBins * loy;
-                    U32 mask = __match_any_sync(hasTriMask, binIdx); // lanes sharing the same bin get the same mask, so the plain store below is consistent
-                    s_outMask[threadIdx.y][binIdx] = mask;
-                    __syncwarp(hasTriMask);
-                } else
-#endif
-                {
-                    bool complex = (hix > lox+1 || hiy > loy+1); // bounding box exceeds 2x2 bins
-                    if (!__any_sync(hasTriMask, complex))
-                    {
-                        int binIdx = lox + p.widthBins * loy;
-                        atomicOr((U32*)&s_outMask[threadIdx.y][binIdx], bit); // mark up to 2x2 bins directly from the bounding box
-                        if (hix > lox) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + 1], bit);
-                        if (hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins], bit);
-                        if (hix > lox && hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins + 1], bit);
-                    } else
-                    {
-                        S32 d12x = d02x - d01x, d12y = d02y - d01y;
-                        v0x -= lox << binLog, v0y -= loy << binLog; // make v0 relative to the first bin of the bounding box
-                        S32 t01 = v0x * d01y - v0y * d01x;
-                        S32 t02 = v0y * d02x - v0x * d02y;
-                        S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
-                        S32 b01 = add_sub(t01 >> binLog, max(d01x, 0), min(d01y, 0)); // NOTE(review): b01/b02/b12 look like per-bin edge function values; a bin is marked only when all three are >= 0
-                        S32 b02 = add_sub(t02 >> binLog, max(d02y, 0), min(d02x, 0));
-                        S32 b12 = add_sub(t12 >> binLog, max(d12x, 0), min(d12y, 0));
-                        int width = hix - lox + 1;
-                        d01x += width * d01y; // fold the end-of-row rewind into the per-row increments
-                        d02x += width * d02y;
-                        d12x += width * d12y;
-                        U8* currPtr = (U8*)&s_outMask[threadIdx.y][lox + loy * p.widthBins]; // byte pointers walking this warp's mask row over the bounding box
-                        U8* skipPtr = (U8*)&s_outMask[threadIdx.y][(hix + 1) + loy * p.widthBins];
-                        U8* endPtr  = (U8*)&s_outMask[threadIdx.y][lox + (hiy + 1) * p.widthBins];
-                        int stride  = p.widthBins * 4; // bytes per bin row (4 bytes per mask entry)
-                        int ptrYInc = stride - width * 4;
-                        do
-                        {
-                            if (b01 >= 0 && b02 >= 0 && b12 >= 0)
-                                atomicOr((U32*)currPtr, bit);
-                            currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
-                            if (currPtr == skipPtr)
-                                currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += stride;
-                        }
-                        while (currPtr != endPtr);
-                    }
-                }
-            }
-            // count per-bin contributions
-            if (thrInBlock == 0)
-                s_overTotal = 0; // overflow counter
-            // ensure that out masks are done
-            __syncthreads();
-            int overIndex = -1; // stays -1 unless this thread's bin needs a new segment
-            bool act = (thrInBlock < p.numBins); // from here on, one thread per bin
-            U32 actMask = __ballot_sync(~0u, act);
-            if (act)
-            {
-                U8* srcPtr = (U8*)&s_outMask[0][thrInBlock];
-                U8* dstPtr = (U8*)&s_outCount[0][thrInBlock];
-                int total = 0;
-                for (int i = 0; i < CR_BIN_WARPS; i++)
-                {
-                    total += __popc(*(U32*)srcPtr);
-                    *(U32*)dstPtr = total; // inclusive running total over warps 0..i
-                    srcPtr += (CR_MAXBINS_SQR + 1) * 4; // advance one row (row length in bytes)
-                    dstPtr += (CR_MAXBINS_SQR + 1) * 4;
-                }
-                // overflow => request a new segment
-                int ofs = s_outOfs[thrInBlock];
-                bool ovr = (((ofs - 1) >> CR_BIN_SEG_LOG2) != (((ofs - 1) + total) >> CR_BIN_SEG_LOG2)); // true when the new entries cross a segment boundary
-                U32 ovrMask = __ballot_sync(actMask, ovr);
-                if (ovr)
-                {
-                    overIndex = __popc(ovrMask & getLaneMaskLt());
-                    if (overIndex == 0)
-                        s_broadcast[threadIdx.y + 16] = atomicAdd((U32*)&s_overTotal, __popc(ovrMask)); // first overflowing lane reserves a range for its whole warp
-                    __syncwarp(ovrMask);
-                    overIndex += s_broadcast[threadIdx.y + 16];
-                    s_overIndex[thrInBlock] = overIndex;
-                }
-            }
-            // sync after overTotal is ready
-            __syncthreads();
-            // at least one segment overflowed => allocate segments
-            U32 overTotal = s_overTotal;
-            U32 allocBase = 0;
-            if (overTotal > 0)
-            {
-                // allocate memory
-                if (thrInBlock == 0)
-                {
-                    U32 allocBase = atomicAdd(&atomics.numBinSegs, overTotal); // NOTE: this local shadows the outer allocBase; the outer one is loaded from s_allocBase below
-                    s_allocBase = (allocBase + overTotal <= p.maxBinSegs) ? allocBase : 0; // out of segments: fall back to base 0 (numBinSegs is still incremented past maxBinSegs)
-                }
-                __syncthreads();
-                allocBase = s_allocBase;
-                // did my bin overflow?
-                if (overIndex != -1)
-                {
-                    // calculate new segment index
-                    int segIdx = allocBase + overIndex;
-                    // add to linked list
-                    if (s_outOfs[thrInBlock] < 0)
-                        binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = segIdx; // first segment of this bin's list
-                    else
-                        binSegNext[(s_outOfs[thrInBlock] - 1) >> CR_BIN_SEG_LOG2] = segIdx; // chain after the segment holding the last written entry
-                    // defaults
-                    binSegNext [segIdx] = -1;
-                    binSegCount[segIdx] = CR_BIN_SEG_SIZE; // assume full; the flush at the end of the batch rewrites a partially filled last segment
-                }
-            }
-            // concurrent emission -- each warp handles its own triangle
-            if (thrInBlock < bufCount)
-            {
-                int triPos  = (bufIndex + thrInBlock) & (CR_ARRAY_SIZE(s_triBuf) - 1);
-                int currBin = lox + loy * p.widthBins;
-                int skipBin = (hix + 1) + loy * p.widthBins;
-                int endBin  = lox + (hiy + 1) * p.widthBins;
-                int binYInc = p.widthBins - (hix - lox + 1);
-                // loop over triangle's bins
-                do
-                {
-                    U32 outMask = s_outMask[threadIdx.y][currBin];
-                    if (outMask & (1<<threadIdx.x))
-                    {
-                        int idx = __popc(outMask & getLaneMaskLt()); // rank among this warp's lanes writing to the same bin
-                        if (threadIdx.y > 0)
-                            idx += s_outCount[threadIdx.y-1][currBin]; // plus everything emitted by lower-numbered warps
-                        int base = s_outOfs[currBin];
-                        int free = (-base) & (CR_BIN_SEG_SIZE - 1); // slots left in the bin's current segment
-                        if (idx >= free)
-                            idx += ((allocBase + s_overIndex[currBin]) << CR_BIN_SEG_LOG2) - free; // spill into the newly allocated segment
-                        else
-                            idx += base;
-                        binSegData[idx] = s_triBuf[triPos];
-                    }
-                    currBin++;
-                    if (currBin == skipBin)
-                        currBin += binYInc, skipBin += p.widthBins;
-                }
-                while (currBin != endBin);
-            }
-            // wait for all triangles to finish, then replace overflowed segment offsets
-            __syncthreads();
-            if (thrInBlock < p.numBins)
-            {
-                U32 total  = s_outCount[CR_BIN_WARPS - 1][thrInBlock]; // last row holds the sum over all warps
-                U32 oldOfs = s_outOfs[thrInBlock];
-                if (overIndex == -1)
-                    s_outOfs[thrInBlock] = oldOfs + total;
-                else
-                {
-                    int addr = oldOfs + total;
-                    addr = ((addr - 1) & (CR_BIN_SEG_SIZE - 1)) + 1; // position inside the new segment, in 1..CR_BIN_SEG_SIZE
-                    addr += (allocBase + overIndex) << CR_BIN_SEG_LOG2;
-                    s_outOfs[thrInBlock] = addr;
-                }
-                s_outTotal[thrInBlock] += total;
-            }
-            // these triangles are now done
-            int count = ::min(bufCount, CR_BIN_WARPS * 32); // at most one buffered entry per thread is consumed per round
-            bufCount -= count;
-            bufIndex += count;
-            bufIndex &= CR_ARRAY_SIZE(s_triBuf)-1;
-        }
-        while (bufCount > 0 || batchPos < batchEnd);
-        // flush all bins
-        if (thrInBlock < p.numBins)
-        {
-            int ofs = s_outOfs[thrInBlock];
-            if (ofs & (CR_BIN_SEG_SIZE-1)) // last segment is only partially filled
-            {
-                int seg = ofs >> CR_BIN_SEG_LOG2;
-                binSegCount[seg] = ofs & (CR_BIN_SEG_SIZE-1); // record its actual entry count
-                s_outOfs[thrInBlock] = (ofs + CR_BIN_SEG_SIZE - 1) & -CR_BIN_SEG_SIZE; // round up so the next batch starts a fresh segment
-            }
-        }
-    }
-    // output totals
-    if (thrInBlock < p.numBins)
-        binTotal[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = s_outTotal[thrInBlock];
-}
-//------------------------------------------------------------------------

ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.cpp DELETED Viewed

@@ -1,94 +0,0 @@
-// Copyright (c) 2009-2022, NVIDIA CORPORATION.  All rights reserved.
-//
-// NVIDIA CORPORATION and its licensors retain all intellectual property
-// and proprietary rights in and to this software, related documentation
-// and any modifications thereto.  Any use, reproduction, disclosure or
-// distribution of this software and related documentation without an express
-// license agreement from NVIDIA CORPORATION is strictly prohibited.
-#include "../../framework.h"
-#include "Buffer.hpp"
-using namespace CR;
-//------------------------------------------------------------------------
-// GPU buffer.
-//------------------------------------------------------------------------
-// Creates an empty buffer: no device allocation and a size of zero bytes.
-Buffer::Buffer(void)
-    : m_gpuPtr(NULL), m_bytes(0)
-{
-}
-// Frees the device allocation, if there is one. The result of cudaFree is
-// deliberately ignored so that the destructor never throws.
-Buffer::~Buffer(void)
-{
-    if (m_gpuPtr != NULL)
-    {
-        cudaFree(m_gpuPtr);
-    }
-}
-// Reallocates the device buffer to exactly `bytes` bytes. Existing contents are
-// not preserved. A request for the current size is a no-op; bytes == 0 simply
-// releases the allocation. CUDA failures are reported via NVDR_CHECK_CUDA_ERROR.
-void Buffer::reset(size_t bytes)
-{
-    if (bytes == m_bytes)
-        return;
-    if (m_gpuPtr)
-    {
-        NVDR_CHECK_CUDA_ERROR(cudaFree(m_gpuPtr));
-        m_gpuPtr = NULL;
-    }
-    // The old allocation is gone, so the recorded size must not outlive it: if the
-    // allocation below fails, the buffer stays consistently empty (NULL, 0) and a
-    // later grow()/reset() will retry instead of trusting a stale size.
-    m_bytes = 0;
-    if (bytes > 0)
-    {
-        // Allocate into a temporary so m_gpuPtr is only updated on success.
-        void* ptr = NULL;
-        NVDR_CHECK_CUDA_ERROR(cudaMalloc(&ptr, bytes));
-        m_gpuPtr = ptr;
-    }
-    m_bytes = bytes;
-}
-// Ensures the buffer holds at least `bytes` bytes. Reallocates through reset()
-// only when the current size is smaller; never shrinks the buffer.
-void Buffer::grow(size_t bytes)
-{
-    if (bytes <= m_bytes)
-        return;
-    reset(bytes);
-}
-//------------------------------------------------------------------------
-// Host buffer with page-locked memory.
-//------------------------------------------------------------------------
-// Creates an empty host buffer: no page-locked allocation and a size of zero bytes.
-HostBuffer::HostBuffer(void)
-    : m_hostPtr(NULL), m_bytes(0)
-{
-}
-// Frees the page-locked allocation, if there is one. The result of cudaFreeHost
-// is deliberately ignored so that the destructor never throws.
-HostBuffer::~HostBuffer(void)
-{
-    if (m_hostPtr != NULL)
-    {
-        cudaFreeHost(m_hostPtr);
-    }
-}
-// Reallocates the page-locked host buffer to exactly `bytes` bytes. Existing
-// contents are not preserved. A request for the current size is a no-op;
-// bytes == 0 simply releases the allocation. CUDA failures are reported via
-// NVDR_CHECK_CUDA_ERROR.
-void HostBuffer::reset(size_t bytes)
-{
-    if (bytes == m_bytes)
-        return;
-    if (m_hostPtr)
-    {
-        NVDR_CHECK_CUDA_ERROR(cudaFreeHost(m_hostPtr));
-        m_hostPtr = NULL;
-    }
-    // The old allocation is gone, so the recorded size must not outlive it: if the
-    // allocation below fails, the buffer stays consistently empty (NULL, 0) and a
-    // later grow()/reset() will retry instead of trusting a stale size.
-    m_bytes = 0;
-    if (bytes > 0)
-    {
-        // Allocate into a temporary so m_hostPtr is only updated on success.
-        void* ptr = NULL;
-        NVDR_CHECK_CUDA_ERROR(cudaMallocHost(&ptr, bytes));
-        m_hostPtr = ptr;
-    }
-    m_bytes = bytes;
-}
-// Ensures the host buffer holds at least `bytes` bytes. Reallocates through
-// reset() only when the current size is smaller; never shrinks the buffer.
-void HostBuffer::grow(size_t bytes)
-{
-    if (bytes <= m_bytes)
-        return;
-    reset(bytes);
-}
-//------------------------------------------------------------------------

ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_Buffer.hpp DELETED Viewed

@@ -1,55 +0,0 @@
-// Copyright (c) 2009-2022, NVIDIA CORPORATION.  All rights reserved.
-//
-// NVIDIA CORPORATION and its licensors retain all intellectual property
-// and proprietary rights in and to this software, related documentation
-// and any modifications thereto.  Any use, reproduction, disclosure or
-// distribution of this software and related documentation without an express
-// license agreement from NVIDIA CORPORATION is strictly prohibited.
-#pragma once
-#include "Defs.hpp"
-namespace CR
-{
-//------------------------------------------------------------------------
-class Buffer // GPU buffer: a device pointer plus its size in bytes; the destructor frees the pointer. NOTE(review): copying is not disabled, so a copy would free the same pointer twice.
-{
-public:
-                    Buffer      (void); // starts empty: NULL pointer, zero size
-                    ~Buffer     (void); // frees the stored pointer if it is non-NULL
-    void            reset       (size_t bytes); // reallocate to exactly `bytes` (no-op when the size is unchanged); contents are not preserved
-    void            grow        (size_t bytes); // calls reset() only when `bytes` exceeds the current size
-    void*           getPtr      (size_t offset = 0) { return (void*)(((uintptr_t)m_gpuPtr) + offset); } // stored pointer advanced by `offset` bytes; no bounds check against m_bytes
-    size_t          getSize     (void) const { return m_bytes; } // size recorded by the last reset()
-    void            setPtr      (void* ptr) { m_gpuPtr = ptr; } // overwrites the pointer only: the previous pointer is not freed and m_bytes is not updated
-private:
-    void*           m_gpuPtr; // device pointer, or NULL when empty
-    size_t          m_bytes; // allocation size in bytes as requested through reset()
-};
-//------------------------------------------------------------------------
-class HostBuffer // Page-locked host buffer: a host pointer plus its size in bytes; the destructor frees the pointer. NOTE(review): copying is not disabled, so a copy would free the same pointer twice.
-{
-public:
-                    HostBuffer  (void); // starts empty: NULL pointer, zero size
-                    ~HostBuffer (void); // frees the stored pointer if it is non-NULL
-    void            reset       (size_t bytes); // reallocate to exactly `bytes` (no-op when the size is unchanged); contents are not preserved
-    void            grow        (size_t bytes); // calls reset() only when `bytes` exceeds the current size
-    void*           getPtr      (void) { return m_hostPtr; } // stored host pointer (NULL when empty)
-    size_t          getSize     (void) const { return m_bytes; } // size recorded by the last reset()
-    void            setPtr      (void* ptr) { m_hostPtr = ptr; } // overwrites the pointer only: the previous pointer is not freed and m_bytes is not updated
-private:
-    void*           m_hostPtr; // page-locked host pointer, or NULL when empty
-    size_t          m_bytes; // allocation size in bytes as requested through reset()
-};
-//------------------------------------------------------------------------
-}

ginipick:SORA-3D/extensions/nvdiffrast/common/cudaraster/impl/extensions_nvdiffrast_nvdiffrast_common_cudaraster_impl_CoarseRaster.inl DELETED Viewed

@@ -1,730 +0,0 @@
-// Copyright (c) 2009-2022, NVIDIA CORPORATION.  All rights reserved.
-//
-// NVIDIA CORPORATION and its licensors retain all intellectual property
-// and proprietary rights in and to this software, related documentation
-// and any modifications thereto.  Any use, reproduction, disclosure or
-// distribution of this software and related documentation without an express
-// license agreement from NVIDIA CORPORATION is strictly prohibited.
-//------------------------------------------------------------------------
-// Maps a tile index local to a bin (row-major, CR_BIN_SIZE tiles per row) to an
-// offset in a row-major tile grid that is `widthTiles` tiles wide.
-__device__ __inline__ int globalTileIdx(int tileInBin, int widthTiles)
-{
-    const int row = tileInBin >> CR_BIN_LOG2;        // upper bits select the tile row inside the bin
-    const int col = tileInBin & (CR_BIN_SIZE - 1);   // lower bits select the tile column inside the bin
-    return row * widthTiles + col;
-}
-//------------------------------------------------------------------------
-__device__ __inline__ void coarseRasterImpl(const CRParams p)
-{
-    // Common.
-    __shared__ volatile U32 s_workCounter;
-    __shared__ volatile U32 s_scanTemp          [CR_COARSE_WARPS][48];              // 3KB
-    // Input.
-    __shared__ volatile U32 s_binOrder          [CR_MAXBINS_SQR];                   // 1KB
-    __shared__ volatile S32 s_binStreamCurrSeg  [CR_BIN_STREAMS_SIZE];              // 0KB
-    __shared__ volatile S32 s_binStreamFirstTri [CR_BIN_STREAMS_SIZE];              // 0KB
-    __shared__ volatile S32 s_triQueue          [CR_COARSE_QUEUE_SIZE];             // 4KB
-    __shared__ volatile S32 s_triQueueWritePos;
-    __shared__ volatile U32 s_binStreamSelectedOfs;
-    __shared__ volatile U32 s_binStreamSelectedSize;
-    // Output.
-    __shared__ volatile U32 s_warpEmitMask      [CR_COARSE_WARPS][CR_BIN_SQR + 1];  // 16KB, +1 to avoid bank collisions
-    __shared__ volatile U32 s_warpEmitPrefixSum [CR_COARSE_WARPS][CR_BIN_SQR + 1];  // 16KB, +1 to avoid bank collisions
-    __shared__ volatile U32 s_tileEmitPrefixSum [CR_BIN_SQR + 1];                   // 1KB, zero at the beginning
-    __shared__ volatile U32 s_tileAllocPrefixSum[CR_BIN_SQR + 1];                   // 1KB, zero at the beginning
-    __shared__ volatile S32 s_tileStreamCurrOfs [CR_BIN_SQR];                       // 1KB
-    __shared__ volatile U32 s_firstAllocSeg;
-    __shared__ volatile U32 s_firstActiveIdx;
-    // Pointers and constants.
-    CRAtomics&              atomics         = p.atomics[blockIdx.z];
-    const CRTriangleHeader* triHeader       = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z;
-    const S32*              binFirstSeg     = (const S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
-    const S32*              binTotal        = (const S32*)p.binTotal    + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z;
-    const S32*              binSegData      = (const S32*)p.binSegData  + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z;
-    const S32*              binSegNext      = (const S32*)p.binSegNext  + p.maxBinSegs * blockIdx.z;
-    const S32*              binSegCount     = (const S32*)p.binSegCount + p.maxBinSegs * blockIdx.z;
-    S32*                    activeTiles     = (S32*)p.activeTiles  + CR_MAXTILES_SQR * blockIdx.z;
-    S32*                    tileFirstSeg    = (S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z;
-    S32*                    tileSegData     = (S32*)p.tileSegData  + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z;
-    S32*                    tileSegNext     = (S32*)p.tileSegNext  + p.maxTileSegs * blockIdx.z;
-    S32*                    tileSegCount    = (S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z;
-    int tileLog     = CR_TILE_LOG2 + CR_SUBPIXEL_LOG2;
-    int thrInBlock  = threadIdx.x + threadIdx.y * 32;
-    int emitShift   = CR_BIN_LOG2 * 2 + 5; // We scan ((numEmits << emitShift) | numAllocs) over tiles.
-    if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs)
-        return;
-    // Initialize sharedmem arrays.
-    if (thrInBlock == 0)
-    {
-        s_tileEmitPrefixSum[0] = 0;
-        s_tileAllocPrefixSum[0] = 0;
-    }
-    s_scanTemp[threadIdx.y][threadIdx.x] = 0;
-    // Sort bins in descending order of triangle count.
-    for (int binIdx = thrInBlock; binIdx < p.numBins; binIdx += CR_COARSE_WARPS * 32)
-    {
-        int count = 0;
-        for (int i = 0; i < CR_BIN_STREAMS_SIZE; i++)
-            count += binTotal[(binIdx << CR_BIN_STREAMS_LOG2) + i];
-        s_binOrder[binIdx] = (~count << (CR_MAXBINS_LOG2 * 2)) | binIdx;
-    }
-    __syncthreads();
-    sortShared(s_binOrder, p.numBins);
-    // Process each bin by one block.
-    for (;;)
-    {
-        // Pick a bin for the block.
-        if (thrInBlock == 0)
-            s_workCounter = atomicAdd(&atomics.coarseCounter, 1);
-        __syncthreads();
-        int workCounter = s_workCounter;
-        if (workCounter >= p.numBins)
-            break;
-        U32 binOrder = s_binOrder[workCounter];
-        bool binEmpty = ((~binOrder >> (CR_MAXBINS_LOG2 * 2)) == 0);
-        if (binEmpty && !p.deferredClear)
-            break;
-        int binIdx = binOrder & (CR_MAXBINS_SQR - 1);
-        // Initialize input/output streams.
-        int triQueueWritePos = 0;
-        int triQueueReadPos = 0;
-        if (thrInBlock < CR_BIN_STREAMS_SIZE)
-        {
-            int segIdx = binFirstSeg[(binIdx << CR_BIN_STREAMS_LOG2) + thrInBlock];
-            s_binStreamCurrSeg[thrInBlock] = segIdx;
-            s_binStreamFirstTri[thrInBlock] = (segIdx == -1) ? ~0u : binSegData[segIdx << CR_BIN_SEG_LOG2];
-        }
-        for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
-            s_tileStreamCurrOfs[tileInBin] = -CR_TILE_SEG_SIZE;
-        // Initialize per-bin state.
-        int binY = idiv_fast(binIdx, p.widthBins);
-        int binX = binIdx - binY * p.widthBins;
-        int originX = (binX << (CR_BIN_LOG2 + tileLog)) - (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
-        int originY = (binY << (CR_BIN_LOG2 + tileLog)) - (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
-        int maxTileXInBin = ::min(p.widthTiles - (binX << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
-        int maxTileYInBin = ::min(p.heightTiles - (binY << CR_BIN_LOG2), CR_BIN_SIZE) - 1;
-        int binTileIdx = (binX + binY * p.widthTiles) << CR_BIN_LOG2;
-        // Entire block: Merge input streams and process triangles.
-        if (!binEmpty)
-        do
-        {
-            //------------------------------------------------------------------------
-            // Merge.
-            //------------------------------------------------------------------------
-            // Entire block: Not enough triangles => merge and queue segments.
-            // NOTE: The bin exit criterion assumes that we queue more triangles than we actually need.
-            while (triQueueWritePos - triQueueReadPos <= CR_COARSE_WARPS * 32)
-            {
-                // First warp: Choose the segment with the lowest initial triangle index.
-                bool hasStream = (thrInBlock < CR_BIN_STREAMS_SIZE);
-                U32 hasStreamMask = __ballot_sync(~0u, hasStream);
-                if (hasStream)
-                {
-                    // Find the stream with the lowest triangle index.
-                    U32 firstTri = s_binStreamFirstTri[thrInBlock];
-                    U32 t = firstTri;
-                    volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
-                    #if (CR_BIN_STREAMS_SIZE > 1)
-                        v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-1]); __syncwarp(hasStreamMask);
-                    #endif
-                    #if (CR_BIN_STREAMS_SIZE > 2)
-                        v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-2]); __syncwarp(hasStreamMask);
-                    #endif
-                    #if (CR_BIN_STREAMS_SIZE > 4)
-                        v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-4]); __syncwarp(hasStreamMask);
-                    #endif
-                    #if (CR_BIN_STREAMS_SIZE > 8)
-                        v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-8]); __syncwarp(hasStreamMask);
-                    #endif
-                    #if (CR_BIN_STREAMS_SIZE > 16)
-                        v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-16]); __syncwarp(hasStreamMask);
-                    #endif
-                    v[0] = t; __syncwarp(hasStreamMask);
-                    // Consume and broadcast.
-                    bool first = (s_scanTemp[0][CR_BIN_STREAMS_SIZE - 1 + 16] == firstTri);
-                    U32 firstMask = __ballot_sync(hasStreamMask, first);
-                    if (first && (firstMask >> threadIdx.x) == 1u)
-                    {
-                        int segIdx = s_binStreamCurrSeg[thrInBlock];
-                        s_binStreamSelectedOfs = segIdx << CR_BIN_SEG_LOG2;
-                        if (segIdx != -1)
-                        {
-                            int segSize = binSegCount[segIdx];
-                            int segNext = binSegNext[segIdx];
-                            s_binStreamSelectedSize = segSize;
-                            s_triQueueWritePos = triQueueWritePos + segSize;
-                            s_binStreamCurrSeg[thrInBlock] = segNext;
-                            s_binStreamFirstTri[thrInBlock] = (segNext == -1) ? ~0u : binSegData[segNext << CR_BIN_SEG_LOG2];
-                        }
-                    }
-                }
-                // No more segments => break.
-                __syncthreads();
-                triQueueWritePos = s_triQueueWritePos;
-                int segOfs = s_binStreamSelectedOfs;
-                if (segOfs < 0)
-                    break;
-                int segSize = s_binStreamSelectedSize;
-                __syncthreads();
-                // Fetch triangles into the queue.
-                for (int idxInSeg = CR_COARSE_WARPS * 32 - 1 - thrInBlock; idxInSeg < segSize; idxInSeg += CR_COARSE_WARPS * 32)
-                {
-                    S32 triIdx = binSegData[segOfs + idxInSeg];
-                    s_triQueue[(triQueueWritePos - segSize + idxInSeg) & (CR_COARSE_QUEUE_SIZE - 1)] = triIdx;
-                }
-            }
-            // All threads: Clear emit masks.
-            for (int maskIdx = thrInBlock; maskIdx < CR_COARSE_WARPS * CR_BIN_SQR; maskIdx += CR_COARSE_WARPS * 32)
-                s_warpEmitMask[maskIdx >> (CR_BIN_LOG2 * 2)][maskIdx & (CR_BIN_SQR - 1)] = 0;
-            __syncthreads();
-            //------------------------------------------------------------------------
-            // Raster.
-            //------------------------------------------------------------------------
-            // Triangle per thread: Read from the queue.
-            int triIdx = -1;
-            if (triQueueReadPos + thrInBlock < triQueueWritePos)
-                triIdx = s_triQueue[(triQueueReadPos + thrInBlock) & (CR_COARSE_QUEUE_SIZE - 1)];
-            uint4 triData = make_uint4(0, 0, 0, 0);
-            if (triIdx != -1)
-            {
-                int dataIdx = triIdx >> 3;
-                int subtriIdx = triIdx & 7;
-                if (subtriIdx != 7)
-                    dataIdx = triHeader[dataIdx].misc + subtriIdx;
-                triData = *((uint4*)triHeader + dataIdx);
-            }
-            // 32 triangles per warp: Record emits (= tile intersections).
-            if (__any_sync(~0u, triIdx != -1))
-            {
-                S32 v0x = sub_s16lo_s16lo(triData.x, originX);
-                S32 v0y = sub_s16hi_s16lo(triData.x, originY);
-                S32 d01x = sub_s16lo_s16lo(triData.y, triData.x);
-                S32 d01y = sub_s16hi_s16hi(triData.y, triData.x);
-                S32 d02x = sub_s16lo_s16lo(triData.z, triData.x);
-                S32 d02y = sub_s16hi_s16hi(triData.z, triData.x);
-                // Compute tile-based AABB.
-                int lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
-                int loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
-                int hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin);
-                int hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin);
-                int sizex = add_sub(hix, 1, lox);
-                int sizey = add_sub(hiy, 1, loy);
-                int area = sizex * sizey;
-                // Miscellaneous init.
-                U8* currPtr = (U8*)&s_warpEmitMask[threadIdx.y][lox + (loy << CR_BIN_LOG2)];
-                int ptrYInc = CR_BIN_SIZE * 4 - (sizex << 2);
-                U32 maskBit = 1 << threadIdx.x;
-                // Case A: All AABBs are small => record the full AABB using atomics.
-                if (__all_sync(~0u, sizex <= 2 && sizey <= 2))
-                {
-                    if (triIdx != -1)
-                    {
-                        atomicOr((U32*)currPtr, maskBit);
-                        if (sizex == 2) atomicOr((U32*)(currPtr + 4), maskBit);
-                        if (sizey == 2) atomicOr((U32*)(currPtr + CR_BIN_SIZE * 4), maskBit);
-                        if (sizex == 2 && sizey == 2) atomicOr((U32*)(currPtr + 4 + CR_BIN_SIZE * 4), maskBit);
-                    }
-                }
-                else
-                {
-                    // Compute warp-AABB (scan-32).
-                    U32 aabbMask = add_sub(2 << hix, 0x20000 << hiy, 1 << lox) - (0x10000 << loy);
-                    if (triIdx == -1)
-                        aabbMask = 0;
-                    volatile U32* v = &s_scanTemp[threadIdx.y][threadIdx.x + 16];
-                    v[0] = aabbMask; __syncwarp(); aabbMask |= v[-1]; __syncwarp();
-                    v[0] = aabbMask; __syncwarp(); aabbMask |= v[-2]; __syncwarp();
-                    v[0] = aabbMask; __syncwarp(); aabbMask |= v[-4]; __syncwarp();
-                    v[0] = aabbMask; __syncwarp(); aabbMask |= v[-8]; __syncwarp();
-                    v[0] = aabbMask; __syncwarp(); aabbMask |= v[-16]; __syncwarp();
-                    v[0] = aabbMask; __syncwarp(); aabbMask = s_scanTemp[threadIdx.y][47];
-                    U32 maskX = aabbMask & 0xFFFF;
-                    U32 maskY = aabbMask >> 16;
-                    int wlox = findLeadingOne(maskX ^ (maskX - 1));
-                    int wloy = findLeadingOne(maskY ^ (maskY - 1));
-                    int whix = findLeadingOne(maskX);
-                    int whiy = findLeadingOne(maskY);
-                    int warea = (add_sub(whix, 1, wlox)) * (add_sub(whiy, 1, wloy));
-                    // Initialize edge functions.
-                    S32 d12x = d02x - d01x;
-                    S32 d12y = d02y - d01y;
-                    v0x -= lox << tileLog;
-                    v0y -= loy << tileLog;
-                    S32 t01 = v0x * d01y - v0y * d01x;
-                    S32 t02 = v0y * d02x - v0x * d02y;
-                    S32 t12 = d01x * d12y - d01y * d12x - t01 - t02;
-                    S32 b01 = add_sub(t01 >> tileLog, ::max(d01x, 0), ::min(d01y, 0));
-                    S32 b02 = add_sub(t02 >> tileLog, ::max(d02y, 0), ::min(d02x, 0));
-                    S32 b12 = add_sub(t12 >> tileLog, ::max(d12x, 0), ::min(d12y, 0));
-                    d01x += sizex * d01y;
-                    d02x += sizex * d02y;
-                    d12x += sizex * d12y;
-                    // Case B: Warp-AABB is not much larger than largest AABB => Check tiles in warp-AABB, record using ballots.
-                    if (__any_sync(~0u, warea * 4 <= area * 8))
-                    {
-                        // Not sure if this is any faster than Case C after all the post-Volta ballot mask tracking.
-                        bool act = (triIdx != -1);
-                        U32 actMask = __ballot_sync(~0u, act);
-                        if (act)
-                        {
-                            for (int y = wloy; y <= whiy; y++)
-                            {
-                                bool yIn = (y >= loy && y <= hiy);
-                                U32 yMask = __ballot_sync(actMask, yIn);
-                                if (yIn)
-                                {
-                                    for (int x = wlox; x <= whix; x++)
-                                    {
-                                        bool xyIn = (x >= lox && x <= hix);
-                                        U32 xyMask = __ballot_sync(yMask, xyIn);
-                                        if (xyIn)
-                                        {
-                                            U32 res = __ballot_sync(xyMask, b01 >= 0 && b02 >= 0 && b12 >= 0);
-                                            if (threadIdx.x == 31 - __clz(xyMask))
-                                                *(U32*)currPtr = res;
-                                            currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
-                                        }
-                                    }
-                                    currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x;
-                                }
-                            }
-                        }
-                    }
-                    // Case C: General case => Check tiles in AABB, record using atomics.
-                    else
-                    {
-                        if (triIdx != -1)
-                        {
-                            U8* skipPtr = currPtr + (sizex << 2);
-                            U8* endPtr  = currPtr + (sizey << (CR_BIN_LOG2 + 2));
-                            do
-                            {
-                                if (b01 >= 0 && b02 >= 0 && b12 >= 0)
-                                    atomicOr((U32*)currPtr, maskBit);
-                                currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y;
-                                if (currPtr == skipPtr)
-                                    currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += CR_BIN_SIZE * 4;
-                            }
-                            while (currPtr != endPtr);
-                        }
-                    }
-                }
-            }
-            __syncthreads();
-            //------------------------------------------------------------------------
-            // Count.
-            //------------------------------------------------------------------------
-            // Tile per thread: Initialize prefix sums.
-            for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
-            {
-                int tileInBin = tileInBin_base + thrInBlock;
-                bool act = (tileInBin < CR_BIN_SQR);
-                U32 actMask = __ballot_sync(~0u, act);
-                if (act)
-                {
-                    // Compute prefix sum of emits over warps.
-                    U8* srcPtr = (U8*)&s_warpEmitMask[0][tileInBin];
-                    U8* dstPtr = (U8*)&s_warpEmitPrefixSum[0][tileInBin];
-                    int tileEmits = 0;
-                    for (int i = 0; i < CR_COARSE_WARPS; i++)
-                    {
-                        tileEmits += __popc(*(U32*)srcPtr);
-                        *(U32*)dstPtr = tileEmits;
-                        srcPtr += (CR_BIN_SQR + 1) * 4;
-                        dstPtr += (CR_BIN_SQR + 1) * 4;
-                    }
-                    // Determine the number of segments to allocate.
-                    int spaceLeft = -s_tileStreamCurrOfs[tileInBin] & (CR_TILE_SEG_SIZE - 1);
-                    int tileAllocs = (tileEmits - spaceLeft + CR_TILE_SEG_SIZE - 1) >> CR_TILE_SEG_LOG2;
-                    volatile U32* v = &s_tileEmitPrefixSum[tileInBin + 1];
-                    // All counters within the warp are small => compute prefix sum using ballot.
-                    if (!__any_sync(actMask, tileEmits >= 2))
-                    {
-                        U32 m = getLaneMaskLe();
-                        *v = (__popc(__ballot_sync(actMask, tileEmits & 1) & m) << emitShift) | __popc(__ballot_sync(actMask, tileAllocs & 1) & m);
-                    }
-                    // Otherwise => scan-32 within the warp.
-                    else
-                    {
-                        U32 sum = (tileEmits << emitShift) | tileAllocs;
-                        *v = sum; __syncwarp(actMask); if (threadIdx.x >= 1)  sum += v[-1]; __syncwarp(actMask);
-                        *v = sum; __syncwarp(actMask); if (threadIdx.x >= 2)  sum += v[-2]; __syncwarp(actMask);
-                        *v = sum; __syncwarp(actMask); if (threadIdx.x >= 4)  sum += v[-4]; __syncwarp(actMask);
-                        *v = sum; __syncwarp(actMask); if (threadIdx.x >= 8)  sum += v[-8]; __syncwarp(actMask);
-                        *v = sum; __syncwarp(actMask); if (threadIdx.x >= 16) sum += v[-16]; __syncwarp(actMask);
-                        *v = sum; __syncwarp(actMask);
-                    }
-                }
-            }
-            // First warp: Scan-8.
-            __syncthreads();
-            bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
-            U32 scan8Mask = __ballot_sync(~0u, scan8);
-            if (scan8)
-            {
-                int sum = s_tileEmitPrefixSum[(thrInBlock << 5) + 32];
-                volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
-                v[0] = sum; __syncwarp(scan8Mask);
-                #if (CR_BIN_SQR > 1 * 32)
-                    sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
-                #endif
-                #if (CR_BIN_SQR > 2 * 32)
-                    sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
-                #endif
-                #if (CR_BIN_SQR > 4 * 32)
-                    sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
-                #endif
-            }
-            __syncthreads();
-            // Tile per thread: Finalize prefix sums.
-            // Single thread: Allocate segments.
-            for (int tileInBin = thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
-            {
-                int sum = s_tileEmitPrefixSum[tileInBin + 1] + s_scanTemp[0][(tileInBin >> 5) + 15];
-                int numEmits = sum >> emitShift;
-                int numAllocs = sum & ((1 << emitShift) - 1);
-                s_tileEmitPrefixSum[tileInBin + 1] = numEmits;
-                s_tileAllocPrefixSum[tileInBin + 1] = numAllocs;
-                if (tileInBin == CR_BIN_SQR - 1 && numAllocs != 0)
-                {
-                    int t = atomicAdd(&atomics.numTileSegs, numAllocs);
-                    s_firstAllocSeg = (t + numAllocs <= p.maxTileSegs) ? t : 0;
-                }
-            }
-            __syncthreads();
-            int firstAllocSeg   = s_firstAllocSeg;
-            int totalEmits      = s_tileEmitPrefixSum[CR_BIN_SQR];
-            int totalAllocs     = s_tileAllocPrefixSum[CR_BIN_SQR];
-            //------------------------------------------------------------------------
-            // Emit.
-            //------------------------------------------------------------------------
-            // Emit per thread: Write triangle index to globalmem.
-            for (int emitInBin = thrInBlock; emitInBin < totalEmits; emitInBin += CR_COARSE_WARPS * 32)
-            {
-                // Find tile in bin.
-                U8* tileBase = (U8*)&s_tileEmitPrefixSum[0];
-                U8* tilePtr = tileBase;
-                U8* ptr;
-                #if (CR_BIN_SQR > 128)
-                    ptr = tilePtr + 0x80 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
-                #endif
-                #if (CR_BIN_SQR > 64)
-                    ptr = tilePtr + 0x40 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
-                #endif
-                #if (CR_BIN_SQR > 32)
-                    ptr = tilePtr + 0x20 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
-                #endif
-                #if (CR_BIN_SQR > 16)
-                    ptr = tilePtr + 0x10 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
-                #endif
-                #if (CR_BIN_SQR > 8)
-                    ptr = tilePtr + 0x08 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
-                #endif
-                #if (CR_BIN_SQR > 4)
-                    ptr = tilePtr + 0x04 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
-                #endif
-                #if (CR_BIN_SQR > 2)
-                    ptr = tilePtr + 0x02 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
-                #endif
-                #if (CR_BIN_SQR > 1)
-                    ptr = tilePtr + 0x01 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr;
-                #endif
-                int tileInBin = (tilePtr - tileBase) >> 2;
-                int emitInTile = emitInBin - *(U32*)tilePtr;
-                // Find warp in tile.
-                int warpStep = (CR_BIN_SQR + 1) * 4;
-                U8* warpBase = (U8*)&s_warpEmitPrefixSum[0][tileInBin] - warpStep;
-                U8* warpPtr = warpBase;
-                #if (CR_COARSE_WARPS > 8)
-                    ptr = warpPtr + 0x08 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
-                #endif
-                #if (CR_COARSE_WARPS > 4)
-                    ptr = warpPtr + 0x04 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
-                #endif
-                #if (CR_COARSE_WARPS > 2)
-                    ptr = warpPtr + 0x02 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
-                #endif
-                #if (CR_COARSE_WARPS > 1)
-                    ptr = warpPtr + 0x01 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr;
-                #endif
-                int warpInTile = (warpPtr - warpBase) >> (CR_BIN_LOG2 * 2 + 2);
-                U32 emitMask = *(U32*)(warpPtr + warpStep + ((U8*)s_warpEmitMask - (U8*)s_warpEmitPrefixSum));
-                int emitInWarp = emitInTile - *(U32*)(warpPtr + warpStep) + __popc(emitMask);
-                // Find thread in warp.
-                int threadInWarp = 0;
-                int pop = __popc(emitMask & 0xFFFF);
-                bool pred = (emitInWarp >= pop);
-                if (pred) emitInWarp -= pop;
-                if (pred) emitMask >>= 0x10;
-                if (pred) threadInWarp += 0x10;
-                pop = __popc(emitMask & 0xFF);
-                pred = (emitInWarp >= pop);
-                if (pred) emitInWarp -= pop;
-                if (pred) emitMask >>= 0x08;
-                if (pred) threadInWarp += 0x08;
-                pop = __popc(emitMask & 0xF);
-                pred = (emitInWarp >= pop);
-                if (pred) emitInWarp -= pop;
-                if (pred) emitMask >>= 0x04;
-                if (pred) threadInWarp += 0x04;
-                pop = __popc(emitMask & 0x3);
-                pred = (emitInWarp >= pop);
-                if (pred) emitInWarp -= pop;
-                if (pred) emitMask >>= 0x02;
-                if (pred) threadInWarp += 0x02;
-                if (emitInWarp >= (emitMask & 1))
-                    threadInWarp++;
-                // Figure out where to write.
-                int currOfs = s_tileStreamCurrOfs[tileInBin];
-                int spaceLeft = -currOfs & (CR_TILE_SEG_SIZE - 1);
-                int outOfs = emitInTile;
-                if (outOfs < spaceLeft)
-                    outOfs += currOfs;
-                else
-                {
-                    int allocLo = firstAllocSeg + s_tileAllocPrefixSum[tileInBin];
-                    outOfs += (allocLo << CR_TILE_SEG_LOG2) - spaceLeft;
-                }
-                // Write.
-                int queueIdx = warpInTile * 32 + threadInWarp;
-                int triIdx = s_triQueue[(triQueueReadPos + queueIdx) & (CR_COARSE_QUEUE_SIZE - 1)];
-                tileSegData[outOfs] = triIdx;
-            }
-            //------------------------------------------------------------------------
-            // Patch.
-            //------------------------------------------------------------------------
-            // Allocated segment per thread: Initialize next-pointer and count.
-            for (int i = CR_COARSE_WARPS * 32 - 1 - thrInBlock; i < totalAllocs; i += CR_COARSE_WARPS * 32)
-            {
-                int segIdx = firstAllocSeg + i;
-                tileSegNext[segIdx] = segIdx + 1;
-                tileSegCount[segIdx] = CR_TILE_SEG_SIZE;
-            }
-            // Tile per thread: Fix previous segment's next-pointer and update s_tileStreamCurrOfs.
-            __syncthreads();
-            for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32)
-            {
-                int oldOfs = s_tileStreamCurrOfs[tileInBin];
-                int newOfs = oldOfs + s_warpEmitPrefixSum[CR_COARSE_WARPS - 1][tileInBin];
-                int allocLo = s_tileAllocPrefixSum[tileInBin];
-                int allocHi = s_tileAllocPrefixSum[tileInBin + 1];
-                if (allocLo != allocHi)
-                {
-                    S32* nextPtr = &tileSegNext[(oldOfs - 1) >> CR_TILE_SEG_LOG2];
-                    if (oldOfs < 0)
-                        nextPtr = &tileFirstSeg[binTileIdx + globalTileIdx(tileInBin, p.widthTiles)];
-                    *nextPtr = firstAllocSeg + allocLo;
-                    newOfs--;
-                    newOfs &= CR_TILE_SEG_SIZE - 1;
-                    newOfs |= (firstAllocSeg + allocHi - 1) << CR_TILE_SEG_LOG2;
-                    newOfs++;
-                }
-                s_tileStreamCurrOfs[tileInBin] = newOfs;
-            }
-            // Advance queue read pointer.
-            // Queue became empty => bin done.
-            triQueueReadPos += CR_COARSE_WARPS * 32;
-        }
-        while (triQueueReadPos < triQueueWritePos);
-        // Tile per thread: Fix next-pointer and count of the last segment.
-        // 32 tiles per warp: Count active tiles.
-        __syncthreads();
-        for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
-        {
-            int tileInBin = tileInBin_base + thrInBlock;
-            bool act = (tileInBin < CR_BIN_SQR);
-            U32 actMask = __ballot_sync(~0u, act);
-            if (act)
-            {
-                int tileX = tileInBin & (CR_BIN_SIZE - 1);
-                int tileY = tileInBin >> CR_BIN_LOG2;
-                bool force = (p.deferredClear & tileX <= maxTileXInBin & tileY <= maxTileYInBin);
-                int ofs = s_tileStreamCurrOfs[tileInBin];
-                int segIdx = (ofs - 1) >> CR_TILE_SEG_LOG2;
-                int segCount = ofs & (CR_TILE_SEG_SIZE - 1);
-                if (ofs >= 0)
-                    tileSegNext[segIdx] = -1;
-                else if (force)
-                {
-                    s_tileStreamCurrOfs[tileInBin] = 0;
-                    tileFirstSeg[binTileIdx + tileX + tileY * p.widthTiles] = -1;
-                }
-                if (segCount != 0)
-                    tileSegCount[segIdx] = segCount;
-                U32 res = __ballot_sync(actMask, ofs >= 0 | force);
-                if (threadIdx.x == 0)
-                    s_scanTemp[0][(tileInBin >> 5) + 16] = __popc(res);
-            }
-        }
-        // First warp: Scan-8.
-        // One thread: Allocate space for active tiles.
-        __syncthreads();
-        bool scan8 = (thrInBlock < CR_BIN_SQR / 32);
-        U32 scan8Mask = __ballot_sync(~0u, scan8);
-        if (scan8)
-        {
-            volatile U32* v = &s_scanTemp[0][thrInBlock + 16];
-            U32 sum = v[0];
-            #if (CR_BIN_SQR > 1 * 32)
-                sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
-            #endif
-            #if (CR_BIN_SQR > 2 * 32)
-                sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
-            #endif
-            #if (CR_BIN_SQR > 4 * 32)
-                sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask);
-            #endif
-            if (thrInBlock == CR_BIN_SQR / 32 - 1)
-                s_firstActiveIdx = atomicAdd(&atomics.numActiveTiles, sum);
-        }
-        // Tile per thread: Output active tiles.
-        __syncthreads();
-        for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32)
-        {
-            int tileInBin = tileInBin_base + thrInBlock;
-            bool act = (tileInBin < CR_BIN_SQR) && (s_tileStreamCurrOfs[tileInBin] >= 0);
-            U32 actMask = __ballot_sync(~0u, act);
-            if (act)
-            {
-                int activeIdx = s_firstActiveIdx;
-                activeIdx += s_scanTemp[0][(tileInBin >> 5) + 15];
-                activeIdx += __popc(actMask & getLaneMaskLt());
-                activeTiles[activeIdx] = binTileIdx + globalTileIdx(tileInBin, p.widthTiles);
-            }
-        }
-    }
-}
-//------------------------------------------------------------------------