Theo Viel committed
Commit 98a67a0 · 1 Parent(s): 694c514

add weights and code

This view is limited to 50 files because it contains too many changes.

Files changed (50)
  1. checkpoints/charset.txt +1 -0
  2. checkpoints/detector.pth +3 -0
  3. checkpoints/recognizer.pth +3 -0
  4. checkpoints/relational.pth +3 -0
  5. example.py +43 -0
  6. nemo-retriever-ocr/cpp/.gitattributes +1 -0
  7. nemo-retriever-ocr/cpp/.gitignore +6 -0
  8. nemo-retriever-ocr/cpp/.gitmodules +3 -0
  9. nemo-retriever-ocr/cpp/README.md +15 -0
  10. nemo-retriever-ocr/cpp/beam_decode/beam_decode.cpp +460 -0
  11. nemo-retriever-ocr/cpp/beam_decode/beam_decode.h +18 -0
  12. nemo-retriever-ocr/cpp/beam_decode/kn_lm.cpp +86 -0
  13. nemo-retriever-ocr/cpp/beam_decode/kn_lm.h +27 -0
  14. nemo-retriever-ocr/cpp/beam_decode/language_model.cpp +147 -0
  15. nemo-retriever-ocr/cpp/beam_decode/language_model.h +66 -0
  16. nemo-retriever-ocr/cpp/beam_decode/log_sum_exp.cpp +7 -0
  17. nemo-retriever-ocr/cpp/beam_decode/log_sum_exp.h +54 -0
  18. nemo-retriever-ocr/cpp/beam_decode/ngram_lm_base.cpp +330 -0
  19. nemo-retriever-ocr/cpp/beam_decode/ngram_lm_base.h +80 -0
  20. nemo-retriever-ocr/cpp/beam_decode/prefix.cpp +23 -0
  21. nemo-retriever-ocr/cpp/beam_decode/prefix.h +158 -0
  22. nemo-retriever-ocr/cpp/beam_decode/sbo_lm.cpp +47 -0
  23. nemo-retriever-ocr/cpp/beam_decode/sbo_lm.h +21 -0
  24. nemo-retriever-ocr/cpp/better_grid_sample/cpu_indirect_grid_sample.cpp +94 -0
  25. nemo-retriever-ocr/cpp/better_grid_sample/gpu_grid_sample_utils.cuh +42 -0
  26. nemo-retriever-ocr/cpp/better_grid_sample/gpu_indirect_grid_sample.cu +328 -0
  27. nemo-retriever-ocr/cpp/better_grid_sample/grid_sample.h +67 -0
  28. nemo-retriever-ocr/cpp/common.cpp +13 -0
  29. nemo-retriever-ocr/cpp/common.h +58 -0
  30. nemo-retriever-ocr/cpp/cuda_intellisense.cuh +51 -0
  31. nemo-retriever-ocr/cpp/geometry.h +1101 -0
  32. nemo-retriever-ocr/cpp/geometry_api/calc_poly_min_rrect.cpp +165 -0
  33. nemo-retriever-ocr/cpp/geometry_api/geometry_api.cpp +101 -0
  34. nemo-retriever-ocr/cpp/geometry_api/geometry_api.h +16 -0
  35. nemo-retriever-ocr/cpp/geometry_api/geometry_api_common.h +121 -0
  36. nemo-retriever-ocr/cpp/geometry_api/geometry_api_gpu.cu +142 -0
  37. nemo-retriever-ocr/cpp/geometry_api/get_rel_continuation_cos.cpp +60 -0
  38. nemo-retriever-ocr/cpp/geometry_api/matrix2x2.h +93 -0
  39. nemo-retriever-ocr/cpp/geometry_api/poly_bounds_quad.cpp +61 -0
  40. nemo-retriever-ocr/cpp/graph_detection/encode_util.cpp +272 -0
  41. nemo-retriever-ocr/cpp/graph_detection/encode_util.h +184 -0
  42. nemo-retriever-ocr/cpp/half_ops.cu +5 -0
  43. nemo-retriever-ocr/cpp/half_ops.cuh +149 -0
  44. nemo-retriever-ocr/cpp/local_ips/local_ips.h +11 -0
  45. nemo-retriever-ocr/cpp/local_ips/quad_all_2_all_dist_v2.cu +162 -0
  46. nemo-retriever-ocr/cpp/module.cpp +125 -0
  47. nemo-retriever-ocr/cpp/non_maximal_suppression/cpu_non_maximal_suppression.cpp +209 -0
  48. nemo-retriever-ocr/cpp/non_maximal_suppression/cuda_non_maximal_suppression.cu +1720 -0
  49. nemo-retriever-ocr/cpp/non_maximal_suppression/nms_common.h +227 -0
  50. nemo-retriever-ocr/cpp/non_maximal_suppression/nms_kd_tree.h +449 -0
checkpoints/charset.txt ADDED
@@ -0,0 +1 @@
+ [" ", "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ":", ";", "<", "=", ">", "?", "@", "A", "B", "C", "D", "E", "F", "FI", "G", "H", "I", "I\u0307", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "SS", "T", "U", "V", "W", "X", "Y", "Z", "[", "\\", "]", "^", "_", "`", "a", "b", "c", "d", "e", "f", "fi", "g", "h", "i", "i\u0307", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "ss", "t", "u", "v", "w", "x", "y", "z", "{", "|", "}", "~", "\u00b2", "\u00b3", "\u00b5", "\u00b9", "\u00ba", "\u00c0", "\u00c1", "\u00c2", "\u00c3", "\u00c4", "\u00c5", "\u00c6", "\u00c7", "\u00c8", "\u00c9", "\u00ca", "\u00cb", "\u00cc", "\u00cd", "\u00ce", "\u00cf", "\u00d0", "\u00d1", "\u00d2", "\u00d3", "\u00d4", "\u00d5", "\u00d6", "\u00d8", "\u00d9", "\u00da", "\u00db", "\u00dc", "\u00dd", "\u00de", "\u00df", "\u00e0", "\u00e1", "\u00e2", "\u00e3", "\u00e4", "\u00e5", "\u00e6", "\u00e7", "\u00e8", "\u00e9", "\u00ea", "\u00eb", "\u00ec", "\u00ed", "\u00ee", "\u00ef", "\u00f0", "\u00f1", "\u00f2", "\u00f3", "\u00f4", "\u00f5", "\u00f6", "\u00f8", "\u00f9", "\u00fa", "\u00fb", "\u00fc", "\u00fd", "\u00fe", "\u00ff", "\u0100", "\u0101", "\u0102", "\u0103", "\u0104", "\u0105", "\u0106", "\u0107", "\u010c", "\u010d", "\u010e", "\u010f", "\u0110", "\u0111", "\u0112", "\u0113", "\u0116", "\u0117", "\u0118", "\u0119", "\u011a", "\u011b", "\u011e", "\u011f", "\u0120", "\u0121", "\u0126", "\u0127", "\u0128", "\u0129", "\u012a", "\u012b", "\u0130", "\u0131", "\u0136", "\u0137", "\u013d", "\u013e", "\u0141", "\u0142", "\u0143", "\u0144", "\u0145", "\u0146", "\u0147", "\u0148", "\u014a", "\u014b", "\u014c", "\u014d", "\u014e", "\u014f", "\u0150", "\u0151", "\u0152", "\u0153", "\u0158", "\u0159", "\u015a", "\u015b", "\u015e", "\u015f", "\u0160", "\u0161", "\u0162", "\u0163", "\u0164", "\u0165", "\u0168", "\u0169", "\u016a", "\u016b", "\u016c", "\u016d", "\u016e", "\u016f", "\u0172", "\u0173", "\u0174", "\u0175", "\u0176", "\u0177", "\u0178", "\u0179", "\u017a", "\u017b", "\u017c", "\u017d", "\u017e", "\u0181", "\u0186", "\u0189", "\u018a", "\u018f", "\u0190", "\u0191", "\u0192", "\u0194", "\u0197", "\u019c", "\u019d", "\u019f", "\u01a0", "\u01a1", "\u01a6", "\u01a9", "\u01ae", "\u01af", "\u01b0", "\u01b1", "\u01b2", "\u01b7", "\u01c2", "\u01cd", "\u01ce", "\u01cf", "\u01d0", "\u01d1", "\u01d2", "\u01d3", "\u01d4", "\u01ea", "\u01eb", "\u0218", "\u0219", "\u021a", "\u021b", "\u0245", "\u0250", "\u0251", "\u0252", "\u0253", "\u0254", "\u0255", "\u0256", "\u0257", "\u0259", "\u025b", "\u025f", "\u0261", "\u0262", "\u0263", "\u0266", "\u0267", "\u0268", "\u026a", "\u026c", "\u026f", "\u0272", "\u0274", "\u0275", "\u0278", "\u027b", "\u027e", "\u0280", "\u0281", "\u0282", "\u0283", "\u0287", "\u0288", "\u028a", "\u028b", "\u028c", "\u028d", "\u028e", "\u0292", "\u0294", "\u0295", "\u0298", "\u029d", "\u029f", "\u02b0", "\u02b2", "\u02b7", "\u02bb", "\u02bc", "\u02be", "\u02bf", "\u02c0", "\u02c1", "\u02c8", "\u02cc", "\u02d0", "\u02e0", "\u02e4", "\u0386", "\u0388", "\u038a", "\u038c", "\u038e", "\u038f", "\u0391", "\u0391\u0342", "\u0392", "\u0393", "\u0394", "\u0395", "\u0396", "\u0397", "\u0397\u0342", "\u0398", "\u0399", "\u0399\u0342", "\u039a", "\u039b", "\u039c", "\u039d", "\u039e", "\u039f", "\u03a0", "\u03a1", "\u03a3", "\u03a4", "\u03a5", "\u03a5\u0313", "\u03a5\u0342", "\u03a6", "\u03a7", "\u03a8", "\u03a9", "\u03a9\u0342", "\u03a9\u0342\u0399", "\u03ac", "\u03ad", "\u03af", "\u03b1", "\u03b1\u0342", "\u03b2", "\u03b3", "\u03b4", 
"\u03b5", "\u03b6", "\u03b7", "\u03b7\u0342", "\u03b8", "\u03b9", "\u03b9\u0342", "\u03ba", "\u03bb", "\u03bc", "\u03bd", "\u03be", "\u03bf", "\u03c0", "\u03c1", "\u03c2", "\u03c3", "\u03c4", "\u03c5", "\u03c5\u0313", "\u03c5\u0342", "\u03c6", "\u03c7", "\u03c8", "\u03c9", "\u03c9\u0342", "\u03c9\u0342\u03b9", "\u03cc", "\u03cd", "\u03ce", "\u03d5", "\u0401", "\u0406", "\u0408", "\u0410", "\u0411", "\u0412", "\u0413", "\u0414", "\u0415", "\u0416", "\u0417", "\u0418", "\u0419", "\u041a", "\u041b", "\u041c", "\u041d", "\u041e", "\u041f", "\u0420", "\u0421", "\u0422", "\u0423", "\u0425", "\u0426", "\u0427", "\u0428", "\u042a", "\u042b", "\u042c", "\u042d", "\u042e", "\u042f", "\u0430", "\u0431", "\u0432", "\u0433", "\u0434", "\u0435", "\u0436", "\u0437", "\u0438", "\u0439", "\u043a", "\u043b", "\u043c", "\u043d", "\u043e", "\u043f", "\u0440", "\u0441", "\u0442", "\u0443", "\u0445", "\u0446", "\u0447", "\u0448", "\u044a", "\u044b", "\u044c", "\u044d", "\u044e", "\u044f", "\u0451", "\u0456", "\u0458", "\u05b5", "\u05b6", "\u05bc", "\u05d0", "\u05d1", "\u05d2", "\u05d3", "\u05d5", "\u05d7", "\u05d9", "\u05dc", "\u05dd", "\u05de", "\u05e0", "\u05e1", "\u05e2", "\u05e6", "\u05e8", "\u05e9", "\u05ea", "\u0621", "\u0623", "\u0625", "\u0627", "\u0628", "\u0629", "\u062a", "\u062c", "\u062d", "\u062e", "\u062f", "\u0631", "\u0632", "\u0633", "\u0634", "\u0635", "\u0637", "\u0639", "\u063a", "\u0641", "\u0642", "\u0643", "\u0644", "\u0645", "\u0646", "\u0647", "\u0648", "\u064a", "\u06cc", "\u0902", "\u0905", "\u0906", "\u0909", "\u0915", "\u0917", "\u091f", "\u0921", "\u0924", "\u0926", "\u0928", "\u092a", "\u092c", "\u092d", "\u092e", "\u092f", "\u0930", "\u0932", "\u0936", "\u0937", "\u0938", "\u0939", "\u093e", "\u093f", "\u0940", "\u0947", "\u094b", "\u0995", "\u09a4", "\u09b2", "\u09be", "\u09bf", "\u0b95", "\u0ba9", "\u0bb3", "\u0e02", "\u0e07", "\u0e08", "\u0e0a", "\u0e10", "\u0e15", "\u0e17", "\u0e19", "\u0e1b", "\u0e1e", "\u0e23", "\u0e27", "\u0e30", "\u0e31", "\u0e32", "\u0e40", "\u0e41", "\u16c3", "\u16cb", "\u16df", "\u1e0c", "\u1e0d", "\u1e24", "\u1e25", "\u1e36", "\u1e37", "\u1e3a", "\u1e3b", "\u1e42", "\u1e43", "\u1e44", "\u1e45", "\u1e46", "\u1e47", "\u1e48", "\u1e49", "\u1e5a", "\u1e5b", "\u1e5e", "\u1e5f", "\u1e62", "\u1e63", "\u1e6c", "\u1e6d", "\u1e6e", "\u1e6f", "\u1ea0", "\u1ea1", "\u1ea2", "\u1ea3", "\u1ea4", "\u1ea5", "\u1ea6", "\u1ea7", "\u1ea8", "\u1ea9", "\u1eaa", "\u1eab", "\u1eac", "\u1ead", "\u1eae", "\u1eaf", "\u1eb4", "\u1eb5", "\u1eb6", "\u1eb7", "\u1eb8", "\u1eb9", "\u1ebe", "\u1ebf", "\u1ec2", "\u1ec3", "\u1ec4", "\u1ec5", "\u1ec6", "\u1ec7", "\u1eca", "\u1ecb", "\u1ecc", "\u1ecd", "\u1ece", "\u1ecf", "\u1ed0", "\u1ed1", "\u1ed2", "\u1ed3", "\u1ed4", "\u1ed5", "\u1ed6", "\u1ed7", "\u1ed8", "\u1ed9", "\u1eda", "\u1edb", "\u1edc", "\u1edd", "\u1ede", "\u1edf", "\u1ee2", "\u1ee3", "\u1ee4", "\u1ee5", "\u1ee6", "\u1ee7", "\u1ee8", "\u1ee9", "\u1eea", "\u1eeb", "\u1eec", "\u1eed", "\u1eee", "\u1eef", "\u1ef0", "\u1ef1", "\u1ef2", "\u1ef3", "\u1ef4", "\u1ef5", "\u1ef8", "\u1ef9", "\u1f00", "\u1f04", "\u1f08", "\u1f0c", "\u1f10", "\u1f15", "\u1f18", "\u1f1d", "\u1f20", "\u1f21", "\u1f28", "\u1f29", "\u1f30", "\u1f31", "\u1f38", "\u1f39", "\u1f41", "\u1f44", "\u1f49", "\u1f4c", "\u1f50", "\u1f51", "\u1f59", "\u1f61", "\u1f69", "\u1f70", "\u1f72", "\u1f74", "\u1f76", "\u1f78", "\u1f7a", "\u1f7c", "\u1fb6", "\u1fba", "\u1fc6", "\u1fc8", "\u1fca", "\u1fd6", "\u1fda", "\u1fe6", "\u1fea", "\u1ff6", "\u1ff7", "\u1ff8", "\u1ffa", "\u2081", "\u2082", "\u2083", "\u2113", "\u2460", 
"\u2461", "\u2463", "\u2c6d", "\u2c6f", "\u2c70", "\u3044", "\u3045", "\u3046", "\u304a", "\u304b", "\u304d", "\u304f", "\u3050", "\u3053", "\u3057", "\u3059", "\u305b", "\u305f", "\u3064", "\u3069", "\u306e", "\u3070", "\u307d", "\u3088", "\u3089", "\u3093", "\u30a1", "\u30a2", "\u30a3", "\u30a4", "\u30a6", "\u30a7", "\u30a8", "\u30a9", "\u30aa", "\u30ab", "\u30ac", "\u30af", "\u30b0", "\u30b3", "\u30b4", "\u30b5", "\u30b6", "\u30b7", "\u30b8", "\u30b9", "\u30ba", "\u30bb", "\u30bc", "\u30bd", "\u30bf", "\u30c1", "\u30c3", "\u30c4", "\u30c6", "\u30c7", "\u30c8", "\u30c9", "\u30ca", "\u30cb", "\u30ce", "\u30cf", "\u30d0", "\u30d1", "\u30d2", "\u30d3", "\u30d5", "\u30d6", "\u30d7", "\u30d9", "\u30da", "\u30dc", "\u30de", "\u30df", "\u30e1", "\u30e3", "\u30e4", "\u30e5", "\u30e6", "\u30e9", "\u30ea", "\u30eb", "\u30ec", "\u30ed", "\u30ef", "\u30f3", "\u30f4", "\u30fc", "\ua7aa", "\ua7ac", "\ua7ad", "\ua7ae", "\ua7b1", "\ua7b2", "\ua7c5", "\uac70", "\ub9c8", "\ub9c9", "\ub9d0", "\uc0ac", "\uc778", "\uc804", "\uc9c0", "\uc9d3", "\ud22c", "\ufb01"]
checkpoints/detector.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8b7d50c74b2dba9acb8dd76d2fbcf75e6eeae0cb3e9688edf42c91aa5550ade1
+ size 181677320
checkpoints/recognizer.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db307d9b0dcb6cd15ab6c71e302fd62ca90ce077c3013c9f63a4ba0dbfdf3f50
+ size 19823477
checkpoints/relational.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1db5a62853269aabd8a040eeb05038a871032e8275def77653631657cb8ca4a
+ size 9048309
example.py ADDED
@@ -0,0 +1,43 @@
+ #!/usr/bin/env python3
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ # All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import argparse
+
+ from nemo_retriever_ocr.inference.pipeline import NemoRetrieverOCR
+
+
+ def main(image_path, merge_level, no_visualize, model_dir):
+     ocr_pipeline = NemoRetrieverOCR()
+
+     predictions = ocr_pipeline(image_path, merge_level=merge_level, visualize=not no_visualize)
+
+     print(f"Found {len(predictions)} text regions.")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Run OCR inference and annotate image.")
+     parser.add_argument("image_path", type=str, help="Path to the input image.")
+     parser.add_argument(
+         "--merge-level",
+         type=str,
+         choices=["word", "sentence", "paragraph"],
+         default="paragraph",
+         help="Merge level for OCR output (word, sentence, paragraph).",
+     )
+     parser.add_argument("--no-visualize", action="store_true", help="Do not save the annotated image.")
+     parser.add_argument(
+         "--model-dir",
+         type=str,
+         help="Path to the model checkpoints.",
+         default="./checkpoints",
+     )
+     args = parser.parse_args()
+
+     main(
+         args.image_path,
+         merge_level=args.merge_level,
+         no_visualize=args.no_visualize,
+         model_dir=args.model_dir,
+     )
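For orientation, the script above reduces to a direct call into the pipeline class it imports. Below is a minimal sketch of those same calls; the image path is illustrative, and note that in this version of `example.py` the `--model-dir` argument is parsed but not forwarded to `NemoRetrieverOCR`.

```python
# Minimal sketch mirroring example.py; "sample.jpg" is an illustrative path.
from nemo_retriever_ocr.inference.pipeline import NemoRetrieverOCR

ocr_pipeline = NemoRetrieverOCR()
predictions = ocr_pipeline("sample.jpg", merge_level="paragraph", visualize=True)
print(f"Found {len(predictions)} text regions.")
```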
nemo-retriever-ocr/cpp/.gitattributes ADDED
@@ -0,0 +1 @@
+ load_png/wuffs-v0.3.c filter=lfs diff=lfs merge=lfs -text
nemo-retriever-ocr/cpp/.gitignore ADDED
@@ -0,0 +1,6 @@
+ __pycache__
+ .vscode
+ build
+ *.egg-info
+ dist
+ .vs
nemo-retriever-ocr/cpp/.gitmodules ADDED
@@ -0,0 +1,3 @@
+ [submodule "trove"]
+ path = trove
+ url = https://github.com/bryancatanzaro/trove.git
nemo-retriever-ocr/cpp/README.md ADDED
@@ -0,0 +1,15 @@
+ # Optimized Image Operations for PyTorch
+
+ ## Installation
+
+ ```
+ python setup.py install
+ ```
+
+ ## Usage
+
+ ```
+ # It's important that you do this first
+ import torch
+ from pytorch_image_ops import color_transform, spatial_transform
+ ```
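The import-order note in this README most likely reflects the usual requirement for compiled PyTorch extensions: importing `torch` first loads the libtorch/c10 shared libraries that `pytorch_image_ops` links against. A minimal sketch, assuming the extension has been installed with the `setup.py` step above:

```python
# Sketch of the import order the README asks for (extension assumed installed).
import torch  # load torch (and its shared libraries) before the extension
from pytorch_image_ops import color_transform, spatial_transform

print(torch.__version__)
```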
nemo-retriever-ocr/cpp/beam_decode/beam_decode.cpp ADDED
@@ -0,0 +1,460 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #include "beam_decode.h"
6
+
7
+ #include <vector>
8
+ #include <deque>
9
+ #include <limits>
10
+ #include <memory>
11
+ #include <unordered_set>
12
+ #include <set>
13
+ #include <algorithm>
14
+ #include <chrono>
15
+
16
+ #include "../common.h"
17
+ #include "prefix.h"
18
+ #include "log_sum_exp.h"
19
+ #include "sbo_lm.h"
20
+
21
+ using namespace std;
22
+
23
+ template<typename scalar_t>
24
+ using pred_seq_t = torch::TensorAccessor<scalar_t, 2>;
25
+
26
+ struct PrefixScore
27
+ {
28
+ float_t lProbBlank;
29
+ float_t lProbChar;
30
+ // float_t raw_lProbBlank;
31
+ // float_t raw_lProbChar;
32
+ mutable float_t _lProb;
33
+
34
+ PrefixScore(float_t lProbBlank = NEG_INF /* log P(0) */, float_t lProbChar = NEG_INF /* log P(0) */)
35
+ : lProbBlank(lProbBlank), lProbChar(lProbChar), _lProb(NEG_INF)
36
+ // , raw_lProbBlank(lProbBlank), raw_lProbChar(lProbChar)
37
+ {}
38
+
39
+ float_t get_lScore() const {
40
+ if (_lProb == NEG_INF) {
41
+ _lProb = log_sum_exp(lProbBlank, lProbChar);
42
+ }
43
+ return _lProb;
44
+ }
45
+
46
+ // float_t get_raw_lScore() const {
47
+ // return log_sum_exp(raw_lProbBlank, raw_lProbChar);
48
+ // }
49
+ };
50
+
51
+ typedef std::unordered_map<Prefix*, PrefixScore> PrefixMap;
52
+ typedef std::pair<Prefix*, PrefixScore> BeamItem;
53
+ typedef std::vector<BeamItem> Beam;
54
+
55
+ /*
56
+ Allows us to get an estimate of the vision model confidence, irrespective of how the language
57
+ model guided the decoding. NOTE: This scoring could follow an entirely different path than
58
+ the returned decoded sequence.
59
+ */
60
+ template<typename scalar_t>
61
+ scalar_t get_vision_confidence(const pred_seq_t<scalar_t> &logProbs, scalar_t minProb)
62
+ {
63
+ const int64_t T = logProbs.size(0);
64
+ const int64_t S = logProbs.size(1);
65
+
66
+ scalar_t ret = 0; // log(1)
67
+
68
+ for (size_t t = 0; t < T; ++t) {
69
+ float_t maxP = logProbs[t][0];
70
+ int64_t maxC = 0;
71
+ for (int64_t c = 1; c < S; ++c) {
72
+ float_t p = logProbs[t][c];
73
+ if (p > maxP) {
74
+ maxP = p;
75
+ maxC = c;
76
+ }
77
+ }
78
+ ret += maxP;
79
+ // Ignore everything past the sequence terminator
80
+ if (maxC == 1) {
81
+ break;
82
+ }
83
+
84
+ if (ret < minProb) {
85
+ break;
86
+ }
87
+ }
88
+
89
+ return ret;
90
+ }
91
+
92
+
93
+ template<typename scalar_t>
94
+ pair<vector<token_t>, float_t>
95
+ ctc_beam_decode_impl(const pred_seq_t<scalar_t> &probs, const int64_t beamSize,
96
+ const int64_t blank, scalar_t minProb,
97
+ const LanguageModel &langModel, scalar_t lmWeight)
98
+ {
99
+ if (blank != 0) {
100
+ throw runtime_error("Currently, only ordinal 0 supported for the blank prediction");
101
+ }
102
+
103
+ const int64_t T = probs.size(0);
104
+ const int64_t S = probs.size(1);
105
+
106
+ // NOTE: In log space, the following is true:
107
+ // 1. Adding two probabilities: log_sum_exp(l_p_a, l_p_b)
108
+ // 2. Multiplying two probabilities: l_p_a + l_p_b
109
+ // 3. log P(0) = -inf
110
+ // 4. log P(1) = 0
111
+
112
+ // Convert to log-space
113
+ if (minProb > 0) {
114
+ minProb = log(minProb);
115
+ } else {
116
+ minProb = NEG_INF;
117
+ }
118
+
119
+ auto retScore = get_vision_confidence(probs, minProb);
120
+
121
+ if (retScore < minProb) {
122
+ return { {}, NEG_INF };
123
+ }
124
+
125
+ PrefixAllocator prefixAlloc;
126
+
127
+ Beam beam;
128
+ beam.emplace_back(prefixAlloc.GetPrefix(), PrefixScore{0, NEG_INF}); // Add a dummy first node
129
+
130
+ Beam terminated;
131
+
132
+ typedef tuple<Prefix*, token_t> lm_cache_key_t;
133
+ unordered_map<lm_cache_key_t, float_t> lmScoreCache;
134
+
135
+ for (int64_t t = 0; t < T; ++t) {
136
+ PrefixMap nextBeam;
137
+
138
+ // Add all of the completed paths to the next beam.
139
+ // This allows us to accumulate new paths into these,
140
+ // but otherwise not process them
141
+ for (const BeamItem &prevNode : beam) {
142
+ if (prevNode.first->Token == 1) {
143
+ nextBeam.insert(prevNode);
144
+ }
145
+ }
146
+
147
+ // Loop over vocab
148
+ for (int64_t s = 0; s < S; ++s) {
149
+ float_t lpEmit = probs[t][s];
150
+
151
+ if (lpEmit < minProb) {
152
+ continue;
153
+ }
154
+
155
+ for (const BeamItem &prevNode : beam) {
156
+ Prefix *prevPrefix = prevNode.first;
157
+ const PrefixScore &prevScore = prevNode.second;
158
+
159
+ // Ignore already completed paths
160
+ if (prevPrefix->Token == 1) {
161
+ continue;
162
+ }
163
+
164
+ // Ignore impossible paths
165
+ if (prevScore.lProbBlank == NEG_INF && prevScore.lProbChar == NEG_INF) {
166
+ continue;
167
+ }
168
+
169
+ // If we propose a blank the prefix doesn't change.
170
+ // Only the probability of ending in blank gets updated.
171
+ if (s == blank) {
172
+ PrefixScore &score = nextBeam[prevPrefix];
173
+ score.lProbBlank = log_sum_exp(score.lProbBlank , prevScore.lProbBlank + lpEmit, prevScore.lProbChar + lpEmit);
174
+ // score.raw_lProbBlank = log_sum_exp(score.raw_lProbBlank, prevScore.raw_lProbBlank + lpEmit, prevScore.raw_lProbChar + lpEmit);
175
+ continue;
176
+ }
177
+
178
+ // Extend the prefix by the new character s and add it to the beam.
179
+ // Only the probability of not ending in blank gets updated.
180
+ token_t prevToken = prevPrefix->Token;
181
+
182
+ // NOTE: We always create a new prefix regardless of duplication because the PrefixScore
183
+ // is simultaneously tracking prefixes that do and don't end in a blank. And it's those
184
+ // that end in a blank that would cause the prefix to be extended.
185
+ auto extendPrefix = prefixAlloc.GetPrefix(s, prevPrefix);
186
+
187
+ // Evaluate the language model, but use the cache if we've already considered this string before
188
+ auto lmCacheItem = make_tuple(prevPrefix, s);
189
+ auto lmCacheIter = lmScoreCache.find(lmCacheItem);
190
+ float_t lpLang = 0;
191
+ if (lmCacheIter == lmScoreCache.end()) {
192
+ lpLang = langModel.ScoreTransition(prevPrefix, s);
193
+ lpLang *= lmWeight;
194
+ lmCacheIter = lmScoreCache.emplace(lmCacheItem, lpLang).first;
195
+ }
196
+ lpLang = lmCacheIter->second;
197
+
198
+ PrefixScore &extendScore = nextBeam[extendPrefix];
199
+ // Remember, adding two log probabilities is equivalent to multiplying two probabilities
200
+ if (s != prevToken) {
201
+ extendScore.lProbChar = log_sum_exp(extendScore.lProbChar, prevScore.lProbBlank + lpEmit + lpLang, prevScore.lProbChar + lpEmit + lpLang);
202
+ // extendScore.raw_lProbChar = log_sum_exp(extendScore.raw_lProbChar, prevScore.raw_lProbBlank + lpEmit , prevScore.raw_lProbChar + lpEmit );
203
+ } else {
204
+ // We don't include the previous probability of not ending in blank if s is repeated at the end. The CTC
205
+ // algorithm merges characters not separated by a blank.
206
+ extendScore.lProbChar = log_sum_exp(extendScore.lProbChar , prevScore.lProbBlank + lpEmit + lpLang);
207
+ // extendScore.raw_lProbChar = log_sum_exp(extendScore.raw_lProbChar, prevScore.raw_lProbBlank + lpEmit );
208
+ }
209
+
210
+ // If the token is repeated, we also have to deal with the unchanged prefix since repeated characters are collapsed
211
+ if (s == prevToken) {
212
+ PrefixScore &collapseScore = nextBeam[prevPrefix];
213
+ collapseScore.lProbChar = log_sum_exp(collapseScore.lProbChar , prevScore.lProbChar + lpEmit);
214
+ // collapseScore.raw_lProbChar = log_sum_exp(collapseScore.raw_lProbChar, prevScore.raw_lProbChar + lpEmit);
215
+ }
216
+
217
+ }
218
+ }
219
+
220
+ Beam vecNextBeam(begin(nextBeam), end(nextBeam));
221
+
222
+ if (vecNextBeam.size() > beamSize) {
223
+ partial_sort(begin(vecNextBeam), begin(vecNextBeam) + beamSize, end(vecNextBeam),
224
+ [] (const BeamItem &a, const BeamItem &b) {
225
+ return a.second.get_lScore() > b.second.get_lScore();
226
+ }
227
+ );
228
+ vecNextBeam.resize(beamSize);
229
+ }
230
+
231
+ beam = move(vecNextBeam);
232
+ }
233
+
234
+ // Find the best raw score
235
+ const BeamItem *bestItem = nullptr;
236
+ // for (const BeamItem &b : beam) {
237
+ // if (bestItem == nullptr or b.second.get_raw_lScore() > bestItem->second.get_raw_lScore()) {
238
+ // bestItem = &b;
239
+ // }
240
+ // }
241
+ if (! beam.empty()) {
242
+ bestItem = &beam[0];
243
+ }
244
+
245
+ if (bestItem != nullptr) {
246
+ auto retList = bestItem->first->ToList();
247
+
248
+ return { move(retList), retScore };
249
+ } else {
250
+ return { {}, NEG_INF };
251
+ }
252
+ }
253
+
254
+ typedef std::pair<Prefix*, float_t> RegBeamItem;
255
+
256
+ bool operator<(const RegBeamItem &a, const RegBeamItem &b) {
257
+ return a.second > b.second;
258
+ }
259
+
260
+ template<typename scalar_t>
261
+ pair<vector<token_t>, float_t>
262
+ reg_beam_decode_impl(const pred_seq_t<scalar_t> &logProbs, const int64_t beamSize,
263
+ scalar_t minProb,
264
+ const LanguageModel &langModel, scalar_t lmWeight)
265
+ {
266
+ const int64_t T = logProbs.size(0);
267
+ const int64_t S = logProbs.size(1);
268
+
269
+ // NOTE: In log space, the following is true:
270
+ // 1. Adding two probabilities: log_sum_exp(l_p_a, l_p_b)
271
+ // 2. Multiplying two probabilities: l_p_a + l_p_b
272
+ // 3. log P(0) = -inf
273
+ // 4. log P(1) = 0
274
+
275
+ // Convert to log-space
276
+ if (minProb > 0) {
277
+ minProb = log(minProb);
278
+ } else {
279
+ minProb = NEG_INF;
280
+ }
281
+
282
+ auto retScore = get_vision_confidence(logProbs, minProb);
283
+
284
+ if (retScore < minProb) {
285
+ return { {}, NEG_INF };
286
+ }
287
+
288
+ PrefixAllocator prefixAlloc;
289
+
290
+ vector<RegBeamItem> beam, nextBeam;
291
+ beam.emplace_back(prefixAlloc.GetPrefix(), 0); // log(1) = 0
292
+
293
+ for (int64_t t = 0; t < T && !beam.empty(); ++t) {
294
+ nextBeam.clear();
295
+
296
+ auto addToBeam = [&nextBeam, beamSize] (const RegBeamItem &rbi) {
297
+ nextBeam.push_back(rbi);
298
+ };
299
+
300
+ // Expand each path in the beam
301
+ for (const RegBeamItem &prevNode : beam) {
302
+ if (prevNode.first->Token == 1) {
303
+ // Move completed paths along without processing further
304
+ addToBeam(prevNode);
305
+ continue;
306
+ }
307
+
308
+ Prefix *prevPrefix = prevNode.first;
309
+ float_t prevScore = prevNode.second;
310
+
311
+ // Loop over vocab
312
+ for (int64_t s = 0; s < S; ++s) {
313
+ float_t lpEmit = logProbs[t][s];
314
+
315
+ if (lpEmit < minProb) {
316
+ // The probability dropped below threshold, so stop processing this path
317
+ continue;
318
+ }
319
+
320
+ auto extendPrefix = prefixAlloc.GetPrefix(s, prevPrefix);
321
+
322
+ float_t lpLang = langModel.ScoreTransition(prevPrefix, s);
323
+
324
+ float_t lpNext = prevScore + lpLang + lpEmit;
325
+
326
+ addToBeam({extendPrefix, lpNext});
327
+ }
328
+ }
329
+
330
+ if (nextBeam.size() > beamSize) {
331
+ // Find the top-k items, and then truncate the rest
332
+ partial_sort(begin(nextBeam), begin(nextBeam) + beamSize, end(nextBeam));
333
+ nextBeam.resize(beamSize);
334
+ }
335
+
336
+ std::swap(beam, nextBeam);
337
+ }
338
+
339
+ if (!beam.empty()) {
340
+ // The highest probability element will always be in the back
341
+ RegBeamItem rbi{ nullptr, NEG_INF };
342
+ for (auto &rb : beam) {
343
+ if (rbi.first == nullptr || rb.second > rbi.second) {
344
+ rbi = rb;
345
+ }
346
+ }
347
+
348
+ auto retList = rbi.first->ToList();
349
+
350
+ return { move(retList), retScore };
351
+ } else {
352
+ return { {}, NEG_INF };
353
+ }
354
+ }
355
+
356
+
357
+
358
+ template<typename scalar_t>
359
+ void dp_beam_decode_impl(const torch::TensorAccessor<scalar_t, 3> &probsAccess,
360
+ torch::TensorAccessor<int64_t, 2> retAccess,
361
+ torch::TensorAccessor<scalar_t, 1> confAccess,
362
+ int64_t beamSize, int64_t blank,
363
+ scalar_t minProb,
364
+ const LanguageModel *langModel,
365
+ scalar_t lmWeight,
366
+ bool combineDuplicates)
367
+ {
368
+ const int64_t N = probsAccess.size(0);
369
+
370
+ #pragma omp parallel for num_threads(8)
371
+ for (int64_t i = 0; i < N; ++i) {
372
+ vector<token_t> seq;
373
+ float_t lConf;
374
+ if (combineDuplicates) {
375
+ tie(seq, lConf) = ctc_beam_decode_impl(probsAccess[i], beamSize, blank,
376
+ minProb,
377
+ *langModel, lmWeight);
378
+ } else {
379
+ tie(seq, lConf) = reg_beam_decode_impl(probsAccess[i], beamSize,
380
+ minProb,
381
+ *langModel, lmWeight);
382
+ }
383
+
384
+ int64_t sz = min<int64_t>(seq.size(), retAccess.size(1));
385
+
386
+ for (int64_t k = 0; k < sz; ++k) {
387
+ retAccess[i][k] = seq[k];
388
+ }
389
+
390
+ confAccess[i] = exp(lConf);
391
+ }
392
+ }
393
+
394
+ std::tuple<torch::Tensor, torch::Tensor>
395
+ beam_decode(torch::Tensor probs, int64_t beamSize, int64_t blank,
396
+ float minProb,
397
+ const LanguageModel *langModel,
398
+ float lmWeight,
399
+ bool combineDuplicates)
400
+ {
401
+ if (langModel == nullptr) {
402
+ langModel = &NullLanguageModel;
403
+ }
404
+
405
+ auto tStart = chrono::high_resolution_clock::now();
406
+
407
+ probs = probs.contiguous();
408
+
409
+ bool collapse = false;
410
+ if (probs.dim() == 2) {
411
+ // N,T,C
412
+ probs = probs.unsqueeze(0);
413
+ collapse = true;
414
+ }
415
+
416
+ probs = probs.log();
417
+
418
+ torch::Tensor ret = torch::ones({ probs.size(0), probs.size(1) }, torch::kInt64);
419
+ torch::Tensor conf = torch::zeros({ probs.size(0) }, probs.options());
420
+
421
+ auto retAccess = ret.accessor<int64_t, 2>();
422
+
423
+ AT_DISPATCH_FLOATING_TYPES(
424
+ probs.scalar_type(),
425
+ "cpu_beam_decode",
426
+ ([&] {
427
+ dp_beam_decode_impl(
428
+ probs.accessor<scalar_t, 3>(),
429
+ retAccess,
430
+ conf.accessor<scalar_t, 1>(),
431
+ beamSize, blank,
432
+ static_cast<scalar_t>(minProb),
433
+ langModel,
434
+ static_cast<scalar_t>(lmWeight),
435
+ combineDuplicates
436
+ );
437
+ })
438
+ );
439
+
440
+ if (collapse) {
441
+ ret = ret.squeeze(0);
442
+ conf = conf[0];
443
+ }
444
+
445
+ auto tEnd = chrono::high_resolution_clock::now();
446
+
447
+ typedef chrono::duration<double, std::milli> tp_t;
448
+ tp_t totalElapsed = tEnd - tStart;
449
+
450
+ cout << "Beam Decode " << probs.size(0) << " - "
451
+ << "Total: " << totalElapsed.count() << "ms"
452
+ << endl;
453
+
454
+ return { ret, conf };
455
+ }
456
+
457
+ std::unique_ptr<LanguageModel> create_sbo_lm(const std::string &dataFilePath, token_mapping_t tokenMapping, float_t backoffWeight)
458
+ {
459
+ return make_unique<SBO_LanguageModel>(dataFilePath, move(tokenMapping), backoffWeight);
460
+ }
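For readers skimming `ctc_beam_decode_impl` above: it is a CTC prefix beam search that keeps, for every prefix, separate log-probabilities of ending in a blank and of ending in its last character, and only lets a repeated symbol extend a prefix via a path through blank. Below is a minimal Python sketch of that bookkeeping, leaving out this implementation's language model, prefix allocator, confidence threshold, and end-of-sequence token handling.

```python
import math
from collections import defaultdict

NEG_INF = float("-inf")

def log_sum_exp(*xs):
    m = max(xs)
    if m == NEG_INF:
        return NEG_INF
    return m + math.log(sum(math.exp(x - m) for x in xs))

def ctc_prefix_beam_search(log_probs, beam_size, blank=0):
    """log_probs: T x S nested list of per-frame log-probabilities."""
    # Each prefix carries (log P(ending in blank), log P(ending in its last char)).
    beam = {(): (0.0, NEG_INF)}
    for frame in log_probs:
        next_beam = defaultdict(lambda: (NEG_INF, NEG_INF))
        for prefix, (lp_blank, lp_char) in beam.items():
            for s, lp_emit in enumerate(frame):
                if s == blank:
                    b, c = next_beam[prefix]
                    next_beam[prefix] = (log_sum_exp(b, lp_blank + lp_emit, lp_char + lp_emit), c)
                elif prefix and s == prefix[-1]:
                    # Repeated symbol: only the path through blank extends the prefix...
                    b, c = next_beam[prefix + (s,)]
                    next_beam[prefix + (s,)] = (b, log_sum_exp(c, lp_blank + lp_emit))
                    # ...otherwise the repeat collapses onto the unchanged prefix.
                    b, c = next_beam[prefix]
                    next_beam[prefix] = (b, log_sum_exp(c, lp_char + lp_emit))
                else:
                    b, c = next_beam[prefix + (s,)]
                    next_beam[prefix + (s,)] = (b, log_sum_exp(c, lp_blank + lp_emit, lp_char + lp_emit))
        ranked = sorted(next_beam.items(), key=lambda kv: log_sum_exp(*kv[1]), reverse=True)
        beam = dict(ranked[:beam_size])
    return max(beam.items(), key=lambda kv: log_sum_exp(*kv[1]))[0]
```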
nemo-retriever-ocr/cpp/beam_decode/beam_decode.h ADDED
@@ -0,0 +1,18 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #pragma once
6
+
7
+ #include <torch/torch.h>
8
+
9
+ #include "language_model.h"
10
+
11
+ std::tuple<torch::Tensor, torch::Tensor>
12
+ beam_decode(torch::Tensor probs, int64_t beamSize, int64_t blank,
13
+ float minProb,
14
+ const LanguageModel *langModel,
15
+ float lmWeight,
16
+ bool combineDuplicates);
17
+
18
+ std::unique_ptr<LanguageModel> create_sbo_lm(const std::string &dataFilePath, token_mapping_t tokenMapping, float_t backoffWeight);
nemo-retriever-ocr/cpp/beam_decode/kn_lm.cpp ADDED
@@ -0,0 +1,86 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #include "kn_lm.h"
6
+
7
+ using namespace std;
8
+
9
+
10
+ KN_LanguageModel::KN_LanguageModel(const string &dataFilePath, token_mapping_t tokenMapping, float_t knDelta)
11
+ : NGramLMBase(dataFilePath, move(tokenMapping)), m_knDelta(knDelta)
12
+ {
13
+ }
14
+
15
+ float KN_LanguageModel::ScoreTransitionImpl(const std::wstring &prefix, const std::wstring &suffix) const
16
+ {
17
+ if (prefix.empty()) {
18
+ return ScoreUnigram(suffix);
19
+ } else {
20
+ return ScoreTransition(prefix, suffix);
21
+ }
22
+ }
23
+
24
+ float_t KN_LanguageModel::ScoreUnigram(const std::wstring &uni) const
25
+ {
26
+ auto lIter = m_lookup[1].find(L""s);
27
+ if (lIter == m_lookup[1].end()) {
28
+ throw std::runtime_error("Unigrams not supported by this model!");
29
+ }
30
+
31
+ auto uniIter = lIter->second.find(uni);
32
+ float_t ctUni = 1e-8;
33
+ if (uniIter != lIter->second.end()) {
34
+ ctUni = uniIter->second;
35
+ }
36
+
37
+ float_t ctSuffixes = GetPrefixSum(L""s);
38
+
39
+ return ctUni / ctSuffixes;
40
+ }
41
+
42
+ float_t KN_LanguageModel::ScoreTransition(const std::wstring &prefix, const std::wstring &suffix) const
43
+ {
44
+ if (prefix.empty()) {
45
+ // The number of distinct bigrams that end with this token
46
+ auto rlIter = m_reverseLookup.find(suffix);
47
+
48
+ float_t ctEndingBigrams = 0;
49
+ if (rlIter != m_reverseLookup.end()) {
50
+ ctEndingBigrams = rlIter->second[2].size();
51
+ }
52
+
53
+ float_t ctAllBigrams = m_lookup[2].size();
54
+
55
+ return ctEndingBigrams / ctAllBigrams;
56
+ }
57
+
58
+ auto lIter = m_lookup[prefix.size() + 1].find(prefix);
59
+ float_t ctUqSuffixes = 0;
60
+ float_t ctSuffixes = 0;
61
+ float_t ctSuffix = 0;
62
+ if (lIter != m_lookup[prefix.size() + 1].end()) {
63
+ ctUqSuffixes = lIter->second.size();
64
+
65
+ ctSuffixes = GetPrefixSum(prefix);
66
+
67
+ auto sIter = lIter->second.find(suffix);
68
+ if (sIter != lIter->second.end()) {
69
+ ctSuffix = sIter->second;
70
+ }
71
+ }
72
+
73
+ float_t factor = 0;
74
+ float_t main = 0;
75
+ if (ctSuffixes != 0) {
76
+ factor = m_knDelta * ctUqSuffixes / ctSuffixes;
77
+ // TODO: Figure out how to make this call without copying the string!
78
+ factor *= ScoreTransition({begin(prefix) + 1, end(prefix)}, suffix);
79
+
80
+ main = max<float_t>(ctSuffix - m_knDelta, 0) / ctSuffixes;
81
+ }
82
+
83
+ float_t total = main + factor;
84
+
85
+ return total;
86
+ }
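In equation form, `ScoreTransition` above implements an absolute-discounting recursion in the spirit of Kneser–Ney smoothing. Writing `h` for the prefix, `w` for the suffix, δ for `m_knDelta`, c(·) for the stored counts, and N₁₊(h,·) for the number of distinct suffixes observed after `h`, the transition score computed by the code is, as far as can be read from the source:

```latex
P(w \mid h) = \frac{\max\bigl(c(h,w) - \delta,\, 0\bigr)}{c(h,\cdot)}
            + \frac{\delta \, N_{1+}(h,\cdot)}{c(h,\cdot)} \; P(w \mid h'),
\qquad h' = h \text{ with its oldest character dropped}
```

When the prefix is empty, the recursion bottoms out in a continuation-style estimate: the number of distinct bigrams ending in `w`, normalized by the size of the bigram table.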
nemo-retriever-ocr/cpp/beam_decode/kn_lm.h ADDED
@@ -0,0 +1,27 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #pragma once
6
+
7
+ #include <unordered_map>
8
+ #include <vector>
9
+
10
+ #include "ngram_lm_base.h"
11
+
12
+
13
+ class KN_LanguageModel
14
+ : public NGramLMBase
15
+ {
16
+ public:
17
+ KN_LanguageModel(const std::string &dataFilePath, token_mapping_t tokenMapping, float_t knDelta);
18
+
19
+ protected:
20
+ virtual float_t ScoreTransitionImpl(const std::wstring &prefix, const std::wstring &suffix) const override;
21
+
22
+ private:
23
+ float_t ScoreUnigram(const std::wstring &uni) const;
24
+ float_t ScoreTransition(const std::wstring &prefix, const std::wstring &suffix) const;
25
+
26
+ float_t m_knDelta;
27
+ };
nemo-retriever-ocr/cpp/beam_decode/language_model.cpp ADDED
@@ -0,0 +1,147 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #include "language_model.h"
6
+
7
+ #include <locale>
8
+ #include <codecvt>
9
+
10
+ using namespace std;
11
+
12
+ const NullLanguageModel_t NullLanguageModel;
13
+
14
+ NullLanguageModel_t::NullLanguageModel_t()
15
+ : LanguageModel({})
16
+ {
17
+ }
18
+
19
+ TokenMappingWrapper::TokenMappingWrapper(token_mapping_t mapping)
20
+ : token_mapping(move(mapping))
21
+ {
22
+ for (const auto &mp : token_mapping) {
23
+ if (mp.second.size() == 1) {
24
+ wchar_t c = mp.second.front();
25
+ reverse_token_mapping.emplace(c, mp.first);
26
+ }
27
+ }
28
+ }
29
+
30
+ TokenMappingWrapper::Ptr create_token_mapping(token_mapping_t tokenMapping)
31
+ {
32
+ return make_shared<TokenMappingWrapper>(move(tokenMapping));
33
+ }
34
+
35
+
36
+ template<typename token_t>
37
+ vector<tuple<wstring, float>>
38
+ decode_sequences_impl(torch::Tensor tokens, const TokenMappingWrapper *tokenMapping,
39
+ c10::optional<torch::Tensor> probs)
40
+ {
41
+ const token_mapping_t &mapping = tokenMapping->token_mapping;
42
+
43
+ auto tokensAccess = tokens.accessor<token_t, 2>();
44
+
45
+ torch::Tensor pTens = probs.value_or(torch::ones({ tokens.size(0) }, torch::kFloat32));
46
+ if (pTens.dim() == 1) {
47
+ pTens = pTens.unsqueeze(1);
48
+ }
49
+
50
+ auto probsAccess = pTens.accessor<float, 2>();
51
+
52
+ const int64_t B = tokens.size(0);
53
+ const int64_t T = tokens.size(1);
54
+
55
+ vector<tuple<wstring, float>> ret;
56
+
57
+ for (int64_t b = 0; b < B; ++b) {
58
+ wstring buff;
59
+
60
+ float logProb = 0.0f; // log 1
61
+ bool done = false;
62
+ for (int64_t t = 0; t < T && ! done; ++t) {
63
+ typename token_mapping_t::key_type tokIdx = tokensAccess[b][t];
64
+
65
+ if (t < probsAccess.size(1)) {
66
+ logProb += log(probsAccess[b][t]);
67
+ }
68
+
69
+ switch (tokIdx) {
70
+ case 0:
71
+ // Blank char
72
+ continue;
73
+ case 1:
74
+ // End of sequence char
75
+ done = true;
76
+ break;
77
+ case 2:
78
+ buff.push_back('^');
79
+ break;
80
+ default:
81
+ auto iter = mapping.find(tokIdx);
82
+ if (iter == mapping.end()) {
83
+ throw std::runtime_error("The token mapping doesn't contain an entry for index " + to_string(tokIdx));
84
+ }
85
+ buff += iter->second;
86
+ break;
87
+ }
88
+ }
89
+
90
+ ret.emplace_back(move(buff), exp(logProb));
91
+ }
92
+
93
+ return ret;
94
+ }
95
+
96
+ vector<tuple<wstring, float>>
97
+ decode_sequences(torch::Tensor tokens, const TokenMappingWrapper *tokenMapping,
98
+ c10::optional<torch::Tensor> probs)
99
+ {
100
+ if (tokens.dim() != 2) {
101
+ throw std::runtime_error("`tokens` must be 2-dimensions of type B,T!");
102
+ }
103
+
104
+ if (tokenMapping == nullptr) {
105
+ throw std::runtime_error("Cannot supply a null token mapping!");
106
+ }
107
+
108
+ const token_mapping_t &mapping = tokenMapping->token_mapping;
109
+
110
+ if (mapping.empty()) {
111
+ throw std::runtime_error("The token mapping hasn't been initialized!");
112
+ }
113
+
114
+ if (probs.has_value()) {
115
+ if (probs.value().scalar_type() != torch::kFloat32) {
116
+ throw std::runtime_error("If the probability distribution is specified, then it must be of type `torch.float32`");
117
+ }
118
+ if (probs.value().size(0) != tokens.size(0)) {
119
+ throw std::runtime_error("The probability distribution batch size doesn't match the tokens batch size!");
120
+ }
121
+ if (probs.value().dim() == 2 && probs.value().size(1) != tokens.size(1)) {
122
+ throw std::runtime_error("Invalid probability distribution shape!");
123
+ }
124
+ }
125
+
126
+ vector<tuple<wstring, float>> ret;
127
+
128
+ AT_DISPATCH_INTEGRAL_TYPES(
129
+ tokens.scalar_type(),
130
+ "decode_sequences_impl",
131
+ ([&] {
132
+ ret = decode_sequences_impl<scalar_t>(tokens, tokenMapping, probs);
133
+ })
134
+ );
135
+
136
+ return ret;
137
+ }
138
+
139
+
140
+ std::string ws2s(const std::wstring& wstr)
141
+ {
142
+ using convert_typeX = std::codecvt_utf8<wchar_t>;
143
+ std::wstring_convert<convert_typeX, wchar_t> converterX;
144
+
145
+ return converterX.to_bytes(wstr);
146
+ }
147
+
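As a quick reference for the token conventions hard-coded in `decode_sequences_impl` above (index 0 is the CTC blank, 1 terminates the sequence, 2 is rendered as `^`, everything else goes through the token mapping), here is an equivalent Python sketch. The toy mapping is illustrative; in practice the mapping would presumably be derived from `checkpoints/charset.txt`.

```python
def decode_sequence(token_ids, mapping):
    """mapping: dict of token index -> string (indices 3+ in this sketch)."""
    out = []
    for tok in token_ids:
        if tok == 0:        # CTC blank: skip
            continue
        if tok == 1:        # end-of-sequence token: stop
            break
        if tok == 2:        # special marker rendered as '^' in the C++ code
            out.append("^")
            continue
        out.append(mapping[tok])  # KeyError if the mapping lacks the index
    return "".join(out)

# Toy usage: tokens are assumed already collapsed (no repeat merging here).
print(decode_sequence([3, 3, 0, 4, 1, 5], {3: "h", 4: "i", 5: "!"}))  # -> "hhi"
```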
nemo-retriever-ocr/cpp/beam_decode/language_model.h ADDED
@@ -0,0 +1,66 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #pragma once
6
+
7
+ #include <memory>
8
+
9
+ #include <torch/torch.h>
10
+
11
+ #include "prefix.h"
12
+ #include "log_sum_exp.h"
13
+
14
+ typedef std::unordered_map<int64_t, std::wstring> token_mapping_t;
15
+ typedef std::unordered_map<wchar_t, int64_t> reverse_token_mapping_t;
16
+
17
+
18
+ class LanguageModel
19
+ {
20
+ public:
21
+ virtual ~LanguageModel() {}
22
+
23
+ virtual float_t ScoreTransition(const Prefix *p, token_t nextToken) const = 0;
24
+
25
+ const token_mapping_t &TokenMapping() const { return m_tokenMapping; }
26
+
27
+ protected:
28
+ LanguageModel(token_mapping_t tokenMapping)
29
+ : m_tokenMapping(std::move(tokenMapping))
30
+ {}
31
+
32
+ token_mapping_t m_tokenMapping;
33
+ };
34
+
35
+
36
+ class NullLanguageModel_t
37
+ : public LanguageModel
38
+ {
39
+ public:
40
+ NullLanguageModel_t();
41
+
42
+ virtual float_t ScoreTransition(const Prefix *p, token_t nextToken) const override
43
+ {
44
+ // log P(1)
45
+ // Which means the probability is unchanged
46
+ return 0;
47
+ }
48
+ };
49
+
50
+ extern const NullLanguageModel_t NullLanguageModel;
51
+
52
+ struct TokenMappingWrapper
53
+ {
54
+ typedef std::shared_ptr<TokenMappingWrapper> Ptr;
55
+
56
+ TokenMappingWrapper(token_mapping_t mapping);
57
+
58
+ token_mapping_t token_mapping;
59
+ reverse_token_mapping_t reverse_token_mapping;
60
+ };
61
+
62
+ TokenMappingWrapper::Ptr create_token_mapping(token_mapping_t tokenMapping);
63
+
64
+ std::vector<std::tuple<std::wstring, float>>
65
+ decode_sequences(torch::Tensor tokens, const TokenMappingWrapper *tokenMapping,
66
+ c10::optional<torch::Tensor> probs = torch::nullopt);
nemo-retriever-ocr/cpp/beam_decode/log_sum_exp.cpp ADDED
@@ -0,0 +1,7 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #include "log_sum_exp.h"
6
+
7
+ const float_t NEG_INF = -std::numeric_limits<float_t>::infinity();
nemo-retriever-ocr/cpp/beam_decode/log_sum_exp.h ADDED
@@ -0,0 +1,54 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #pragma once
6
+
7
+ #include <cmath>
8
+ #include <limits>
9
+ #include <algorithm>
10
+
11
+ typedef float float_t;
12
+ extern const float_t NEG_INF;
13
+
14
+ template<typename T>
15
+ inline T max_val(T v)
16
+ {
17
+ return v;
18
+ }
19
+
20
+ template<typename T, typename ...Args>
21
+ inline T max_val(T v, Args... rest)
22
+ {
23
+ auto restMax = max_val(rest...);
24
+
25
+ return std::max(v, restMax);
26
+ }
27
+
28
+ template<typename T>
29
+ inline T sum_exp(T maxVal, T v)
30
+ {
31
+ return std::exp(v - maxVal);
32
+ }
33
+
34
+ template<typename T, typename ...Args>
35
+ inline T sum_exp(T maxVal, T v, Args... rest)
36
+ {
37
+ auto restSum = sum_exp(maxVal, rest...);
38
+
39
+ return sum_exp(maxVal, v) + restSum;
40
+ }
41
+
42
+ template<typename T, typename ...Args>
43
+ inline T log_sum_exp(T v, Args ...args)
44
+ {
45
+ auto maxVal = max_val(v, args...);
46
+
47
+ if (maxVal == -std::numeric_limits<T>::infinity()) {
48
+ return -std::numeric_limits<T>::infinity();
49
+ }
50
+
51
+ auto sumExp = sum_exp(maxVal, v, args...);
52
+
53
+ return maxVal + std::log(sumExp);
54
+ }
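The variadic templates above implement the standard max-shifted log-sum-exp identity, which keeps the result finite when summing very small probabilities in log space:

```latex
\operatorname{logsumexp}(x_1, \dots, x_n) = m + \log \sum_{i=1}^{n} e^{x_i - m},
\qquad m = \max_i x_i
```

with the convention (the early return on `maxVal`) that the result is −∞ when every input is −∞.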
nemo-retriever-ocr/cpp/beam_decode/ngram_lm_base.cpp ADDED
@@ -0,0 +1,330 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #include "ngram_lm_base.h"
6
+
7
+ #include <iostream>
8
+ #include <fstream>
9
+
10
+ #if defined( USE_BOOST )
11
+
12
+ #include <boost/archive/binary_oarchive.hpp>
13
+ #include <boost/archive/binary_iarchive.hpp>
14
+ #include <boost/serialization/vector.hpp>
15
+ #include <boost/serialization/string.hpp>
16
+ #include <boost/serialization/unordered_map.hpp>
17
+
18
+ #endif // USE_BOOST
19
+
20
+ using namespace std;
21
+
22
+ const std::wstring WORD_END(1, 2);
23
+ const std::wstring NUMERIC(1, 3);
24
+ const std::wstring UNMODELED(1, 4);
25
+
26
+ struct LMStorage
27
+ {
28
+ lookup_t Lookup;
29
+ reverse_lookup_t ReverseLookup;
30
+
31
+ template<class Archive>
32
+ void serialize(Archive &ar, const unsigned int version) {
33
+ ar & Lookup;
34
+ ar & ReverseLookup;
35
+ }
36
+ };
37
+
38
+ void save_suffix_map(std::fstream& fs, const suffix_map_t& suffix_map)
39
+ {
40
+ // write out number of elements for Lookup
41
+ std::size_t suffix_map_count = suffix_map.size();
42
+ fs.write((char*)(&suffix_map_count), sizeof(suffix_map_count));
43
+ for (suffix_map_t::const_iterator reverse_lookup_it = suffix_map.begin(); reverse_lookup_it != suffix_map.end(); ++reverse_lookup_it)
44
+ {
45
+ // write out the key
46
+ size_t key_len = reverse_lookup_it->first.length();
47
+ fs.write((char*)(&key_len), sizeof(key_len));
48
+ fs.write((char*)(reverse_lookup_it->first.data()), key_len * sizeof(wchar_t));
49
+
50
+ // write out value
51
+ fs.write((char*)(&reverse_lookup_it->second), sizeof(reverse_lookup_it->second));
52
+ }
53
+ }
54
+
55
+ void save_lookup(std::fstream& fs, const lookup_t& lookup)
56
+ {
57
+ // write out number of elements for Lookup
58
+ std::size_t lookup_count = lookup.size();
59
+ fs.write((char*)(&lookup_count), sizeof(lookup_count));
60
+ for (lookup_t::const_iterator lookup_it = lookup.begin(); lookup_it != lookup.end(); ++lookup_it)
61
+ {
62
+ // write out element map size
63
+ std::size_t map_elem_count = lookup_it->size();
64
+ fs.write((char*)(&map_elem_count), sizeof(map_elem_count));
65
+
66
+ for (string_suffix_map_t::const_iterator str_sfx_it = lookup_it->begin(); str_sfx_it != lookup_it->end(); ++str_sfx_it)
67
+ {
68
+ // write out key
69
+ size_t key_len = str_sfx_it->first.length();
70
+ fs.write((char*)(&key_len), sizeof(key_len));
71
+ fs.write((char*)(str_sfx_it->first.data()), key_len * sizeof(wchar_t));
72
+ save_suffix_map(fs, str_sfx_it->second);
73
+ }
74
+ }
75
+ }
76
+
77
+ void save_reverse_lookup(std::fstream& fs, const reverse_lookup_t& reverse_lookup)
78
+ {
79
+ // write out number of elements for Lookup
80
+ std::size_t reverse_lookup_count = reverse_lookup.size();
81
+ fs.write((char*)(&reverse_lookup_count), sizeof(reverse_lookup_count));
82
+ for (reverse_lookup_t::const_iterator reverse_lookup_it = reverse_lookup.begin(); reverse_lookup_it != reverse_lookup.end(); ++reverse_lookup_it)
83
+ {
84
+ // write out the key
85
+ size_t key_len = reverse_lookup_it->first.length();
86
+ fs.write((char*)(&key_len), sizeof(key_len));
87
+ fs.write((char*)(reverse_lookup_it->first.data()), key_len * sizeof(wchar_t));
88
+
89
+ // write out value vector length
90
+ size_t val_vec_len = reverse_lookup_it->second.size();
91
+ fs.write((char*)(&val_vec_len), sizeof(val_vec_len));
92
+
93
+ for (suffix_map_vec_t::const_iterator val_vec_it = reverse_lookup_it->second.begin();
94
+ val_vec_it != reverse_lookup_it->second.end();
95
+ ++val_vec_it)
96
+ {
97
+ save_suffix_map(fs, *val_vec_it);
98
+ }
99
+ }
100
+ }
101
+
102
+ void load_suffix_map(std::fstream& fs, suffix_map_t& suffix_map)
103
+ {
104
+ // read in number of elements
105
+ std::size_t suffix_map_count = 0;
106
+ fs.read((char*)(&suffix_map_count), sizeof(suffix_map_count));
107
+ for (size_t suffix_map_index = 0; suffix_map_index < suffix_map_count; ++suffix_map_index )
108
+ {
109
+ // read in key
110
+ std::size_t key_len = 0;
111
+ fs.read((char*)(&key_len), sizeof(key_len));
112
+
113
+ std::wstring wkey(key_len, 0);
114
+ fs.read((char*)(wkey.data()), key_len * sizeof(wchar_t));
115
+ uint32_t value = 0;
116
+ fs.read((char*)(&value), sizeof(value));
117
+
118
+ suffix_map.insert(std::make_pair(wkey, value));
119
+ }
120
+ }
121
+
122
+ void load_lookup(std::fstream& fs, lookup_t& lookup)
123
+ {
124
+ // read in number of elements
125
+ std::size_t lookup_count = 0;
126
+ fs.read((char*)(&lookup_count), sizeof(lookup_count));
127
+ for (size_t lookup_index = 0; lookup_index < lookup_count; ++lookup_index)
128
+ {
129
+ std::size_t map_elem_count = 0;
130
+ fs.read((char*)(&map_elem_count), sizeof(map_elem_count));
131
+
132
+ lookup.push_back(string_suffix_map_t());
133
+ string_suffix_map_t& str_sfx_map = lookup.back();
134
+
135
+ for (size_t str_sfx_map_index = 0; str_sfx_map_index < map_elem_count; ++str_sfx_map_index)
136
+ {
137
+ std::size_t key_len = 0;
138
+ fs.read((char*)(&key_len), sizeof(key_len));
139
+
140
+ std::wstring wkey(key_len, 0);
141
+ fs.read((char*)(wkey.data()), key_len * sizeof(wchar_t));
142
+ str_sfx_map.insert(std::make_pair<wstring, suffix_map_t>(std::wstring(wkey), suffix_map_t()));
143
+ suffix_map_t& suffix_map = str_sfx_map[wkey];
144
+
145
+ load_suffix_map(fs, suffix_map);
146
+ }
147
+ }
148
+ }
149
+
150
+ void load_reverse_lookup(std::fstream& fs, reverse_lookup_t& reverse_lookup)
151
+ {
152
+ // read in number of elements
153
+ std::size_t reverse_lookup_count = 0;
154
+ fs.read((char*)(&reverse_lookup_count), sizeof(reverse_lookup_count));
155
+ for (size_t rev_lookup_index = 0; rev_lookup_index < reverse_lookup_count; ++rev_lookup_index )
156
+ {
157
+ // read in the key
158
+ std::size_t key_len = 0;
159
+ fs.read((char*)(&key_len), sizeof(key_len));
160
+
161
+ std::wstring wkey(key_len, 0);
162
+ fs.read((char*)(wkey.data()), key_len * sizeof(wchar_t));
163
+ reverse_lookup.insert(std::make_pair(wkey, suffix_map_vec_t()));
164
+ suffix_map_vec_t& val_vec = reverse_lookup[wkey];
165
+
166
+ std::size_t val_vec_len = 0;
167
+ fs.read((char*)(&val_vec_len), sizeof(val_vec_len));
168
+
169
+ for (size_t val_vec_index = 0; val_vec_index < val_vec_len; ++val_vec_index)
170
+ {
171
+ val_vec.push_back(suffix_map_t());
172
+ suffix_map_t& suffix_map = val_vec.back();
173
+ load_suffix_map(fs, suffix_map);
174
+ }
175
+ }
176
+ }
177
+
178
+ #if ! defined( USE_BOOST )
179
+
180
+ NGramLMBase::NGramLMBase(const string &dataFilePath, token_mapping_t tokenMapping)
181
+ : LanguageModel(move(tokenMapping))
182
+ {
183
+ std::fstream in(dataFilePath, std::ios::in | std::ios::binary);
184
+ load_lookup(in, m_lookup);
185
+ load_reverse_lookup(in, m_reverseLookup);
186
+
187
+ if (m_lookup.size() >= 10) {
188
+ throw runtime_error("Only N-Grams of 9 or less are supported!");
189
+ }
190
+
191
+ for (auto &ngLevel : m_lookup) {
192
+ for (auto &kvPrefixLevel : ngLevel) {
193
+ uint32_t ct = 0;
194
+ for (auto &kvSfx : kvPrefixLevel.second) {
195
+ ct += kvSfx.second;
196
+ }
197
+ m_prefixSumLookup.emplace(kvPrefixLevel.first, ct);
198
+ }
199
+ }
200
+ }
201
+
202
+ void save_ngram_data_file(const lookup_t& lookup, const reverse_lookup_t& reverseLookup, const std::string &outputPath)
203
+ {
204
+ std::fstream out(outputPath, std::ios::out | std::ios::binary);
205
+
206
+ save_lookup(out, lookup);
207
+ save_reverse_lookup(out, reverseLookup);
208
+ }
209
+
210
+ #else // USE_BOOST
211
+
212
+ NGramLMBase::NGramLMBase(const string &dataFilePath, token_mapping_t tokenMapping)
213
+ : LanguageModel(move(tokenMapping))
214
+ {
215
+ {
216
+ ifstream dfStr(dataFilePath, ios_base::in | ios_base::binary);
217
+ boost::archive::binary_iarchive ia(dfStr);
218
+
219
+ LMStorage s;
220
+ ia >> s;
221
+
222
+
223
+ m_lookup = move(s.Lookup);
224
+
225
+ m_reverseLookup = move(s.ReverseLookup);
226
+ }
227
+
228
+ if (m_lookup.size() >= 10) {
229
+ throw runtime_error("Only N-Grams of 9 or less are supported!");
230
+ }
231
+
232
+ for (auto &ngLevel : m_lookup) {
233
+ for (auto &kvPrefixLevel : ngLevel) {
234
+ uint32_t ct = 0;
235
+ for (auto &kvSfx : kvPrefixLevel.second) {
236
+ ct += kvSfx.second;
237
+ }
238
+ m_prefixSumLookup.emplace(kvPrefixLevel.first, ct);
239
+ }
240
+ }
241
+ }
242
+
243
+ void save_ngram_data_file(lookup_t lookup, reverse_lookup_t reverseLookup, const std::string &outputPath)
244
+ {
245
+ ofstream ofs(outputPath, ios_base::out | ios_base::binary);
246
+
247
+ LMStorage s;
248
+ s.Lookup = move(lookup);
249
+ s.ReverseLookup = move(reverseLookup);
250
+
251
+ boost::archive::binary_oarchive oa(ofs);
252
+ oa << s;
253
+ }
254
+
255
+ #endif // USE_BOOST
256
+
257
+ float_t NGramLMBase::ScoreTransition(const Prefix *p, token_t nextToken) const
258
+ {
259
+ std::wstring prefix;
260
+ if (! ConvertToString(p, prefix)) {
261
+ return NEG_INF;
262
+ }
263
+
264
+ const std::wstring *pSuffix = nullptr;
265
+
266
+ if (nextToken != 1) {
267
+ auto iter = m_tokenMapping.find(nextToken);
268
+ if (iter == m_tokenMapping.end()) {
269
+ pSuffix = &UNMODELED;
270
+ } else {
271
+ pSuffix = &iter->second;
272
+
273
+ if (iswdigit(pSuffix->at(0))) {
274
+ pSuffix = &NUMERIC;
275
+ }
276
+ }
277
+
278
+ } else {
279
+ pSuffix = &WORD_END;
280
+ }
281
+
282
+ float_t ret = ScoreTransitionImpl(prefix, *pSuffix);
283
+
284
+ if (ret > 0) {
285
+ return log(ret);
286
+ } else {
287
+ return NEG_INF;
288
+ }
289
+ }
290
+
291
+ bool NGramLMBase::ConvertToString(const Prefix *p, std::wstring &prefix) const
292
+ {
293
+ const Prefix *stk[10];
294
+ int32_t sz = -1;
295
+ const Prefix *curr = p;
296
+ decltype(sz) mlSz{(int)m_lookup.size() - 2};
297
+ while (curr && sz < mlSz) {
298
+ stk[++sz] = curr;
299
+ curr = curr->Parent;
300
+ }
301
+
302
+ // Either blank or empty prefix
303
+ if (sz < 1) { return true; }
304
+
305
+ --sz;
306
+ for (; sz >= 0; --sz) {
307
+ token_t tok = stk[sz]->Token;
308
+ // End of word token, which maps to the null character
309
+ if (tok == 1) {
310
+ prefix.push_back(WORD_END[0]);
311
+ } else if (tok == 0) {
312
+ // Do nothing
313
+ } else {
314
+ auto iter = m_tokenMapping.find(tok);
315
+ if (iter == m_tokenMapping.end()) {
316
+ prefix += UNMODELED;
317
+ } else {
318
+ const std::wstring &wChar = iter->second;
319
+
320
+ if (iswdigit(wChar[0])) {
321
+ prefix += NUMERIC;
322
+ } else {
323
+ prefix += wChar;
324
+ }
325
+ }
326
+ }
327
+ }
328
+
329
+ return true;
330
+ }
nemo-retriever-ocr/cpp/beam_decode/ngram_lm_base.h ADDED
@@ -0,0 +1,80 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #pragma once
6
+
7
+ #include <unordered_map>
8
+ #include <vector>
9
+
10
+ #include "language_model.h"
11
+
12
+ // #define USE_BOOST 1
13
+
14
+
15
+ typedef std::unordered_map<std::wstring, uint32_t> suffix_map_t;
16
+
17
+ /* Tells us the number of suffixes for a given ngram of order K
18
+ Keys:
19
+ 1. NGram Order
20
+ 2. Prefix
21
+ 3. Suffix
22
+ Value:
23
+ Count
24
+ */
25
+ typedef std::unordered_map<std::wstring, suffix_map_t> string_suffix_map_t;
26
+ typedef std::vector<string_suffix_map_t> lookup_t;
27
+ /* Tells us the number of K-gram prefixes found for a given suffix
28
+ Keys:
29
+ 1. Suffix
30
+ 2. NGram Order
31
+ 3. Prefix
32
+ Values:
33
+ Count
34
+ */
35
+ typedef std::vector<suffix_map_t> suffix_map_vec_t;
36
+ typedef std::unordered_map<std::wstring, suffix_map_vec_t> reverse_lookup_t;
37
+
38
+
39
+
40
+ extern const std::wstring WORD_END;
41
+ extern const std::wstring NUMERIC;
42
+ extern const std::wstring UNMODELED;
43
+
44
+ class NGramLMBase
45
+ : public LanguageModel
46
+ {
47
+ public:
48
+ virtual float_t ScoreTransition(const Prefix *p, token_t nextToken) const override;
49
+
50
+ protected:
51
+ NGramLMBase(const std::string &dataFilePath, token_mapping_t tokenMapping);
52
+
53
+ virtual float_t ScoreTransitionImpl(const std::wstring &prefix, const std::wstring &suffix) const = 0;
54
+
55
+ bool ConvertToString(const Prefix *p, std::wstring &prefix) const;
56
+
57
+ float_t GetPrefixSum(const std::wstring &prefix) const;
58
+
59
+ lookup_t m_lookup;
60
+ reverse_lookup_t m_reverseLookup;
61
+
62
+ std::unordered_map<std::wstring, uint32_t> m_prefixSumLookup;
63
+ };
64
+
65
+ #if ! defined( USE_BOOST )
66
+ void save_ngram_data_file(const lookup_t& lookup, const reverse_lookup_t& reverseLookup, const std::string &output_path);
67
+ #else // USE_BOOST
68
+ void save_ngram_data_file(lookup_t lookup, reverse_lookup_t reverseLookup, const std::string &output_path);
69
+ #endif // USE_BOOST
70
+
71
+ inline float_t NGramLMBase::GetPrefixSum(const std::wstring &prefix) const
72
+ {
73
+ auto iter = m_prefixSumLookup.find(prefix);
74
+
75
+ if (iter == m_prefixSumLookup.end()) {
76
+ return 0;
77
+ } else {
78
+ return iter->second;
79
+ }
80
+ }
nemo-retriever-ocr/cpp/beam_decode/prefix.cpp ADDED
@@ -0,0 +1,23 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #include "prefix.h"
6
+
7
+ using namespace std;
8
+
9
+ vector<token_t> Prefix::ToList() const
10
+ {
11
+ vector<token_t> ret;
12
+
13
+ auto curr = this;
14
+
15
+ while (curr) {
16
+ if (curr->Token != 0) {
17
+ ret.push_back(curr->Token);
18
+ }
19
+ curr = curr->Parent;
20
+ }
21
+
22
+ return { rbegin(ret), rend(ret) };
23
+ }
nemo-retriever-ocr/cpp/beam_decode/prefix.h ADDED
@@ -0,0 +1,158 @@
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ // All rights reserved.
+ // SPDX-License-Identifier: Apache-2.0
+
+ #pragma once
+
+ #include <cstdlib>
+ #include <memory>
+ #include <new>
+ #include <vector>
+ #include <unordered_map>
+ #include <list>
+
+ typedef int32_t token_t;
+
+ class Prefix;
+
+ // typedef std::shared_ptr<Prefix> PrefixPtr;
+
+ class Prefix
+ {
+ public:
+     token_t Token;
+     Prefix *Parent;
+
+     Prefix(token_t token = 0 /* blank */, Prefix *parent = nullptr)
+         : Token(token), Parent(parent)
+     {}
+
+     std::vector<token_t> ToList() const;
+
+     size_t size() const;
+ };
+
+
+ ///// Borrowed from Boost libraries
+ template<typename T>
+ void hash_combine(size_t & seed, T const& v)
+ {
+     seed ^= std::hash<T>()(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+ }
+ /////
+
+ namespace std {
+     template<>
+     struct hash<Prefix*>
+     {
+         size_t operator()(const Prefix *p) const noexcept
+         {
+             size_t seed = 0;
+
+             while (p) {
+                 if (p->Token != 0) {
+                     hash_combine(seed, p->Token);
+                 }
+                 p = p->Parent;
+             }
+             return seed;
+         }
+     };
+
+     template<>
+     struct hash<tuple<Prefix*, token_t>>
+     {
+         size_t operator()(const tuple<Prefix*, token_t> &t) const noexcept
+         {
+             size_t seed = 0;
+             hash_combine(seed, get<0>(t));
+             hash_combine(seed, get<1>(t));
+             return seed;
+         }
+     };
+
+     template<>
+     struct equal_to<Prefix*>
+     {
+         bool operator()(const Prefix *a, const Prefix *b) const noexcept
+         {
+             while (a != nullptr && b != nullptr) {
+                 if (a->Token != b->Token) {
+                     return false;
+                 }
+                 a = a->Parent;
+                 b = b->Parent;
+             }
+             // If one chain is shorter than the other
+             return a == b;
+         }
+     };
+ }
+
+ inline size_t Prefix::size() const
+ {
+     size_t ret = 0;
+     auto p = this;
+     while (p != nullptr) {
+         ret += 1;
+         p = p->Parent;
+     }
+     return ret;
+ }
+
+
+ class PrefixAllocator
+ {
+ public:
+     PrefixAllocator() = default;
+     ~PrefixAllocator();
+
+     template<typename ...Args>
+     Prefix *GetPrefix(Args&& ...ctorArgs);
+
+ private:
+     void AllocateNextBuffer();
+
+     std::list<Prefix*> m_buffers;
+     size_t m_allocSize = 0;
+     size_t m_currOff = 0;
+ };
+
+ inline PrefixAllocator::~PrefixAllocator()
+ {
+     for (auto p : m_buffers) {
+         // Prefix is trivially destructible, and the buffers are allocated with malloc
+         // (no per-element initialization) to prevent redundant work upfront
+         // delete[] p;
+         free(p);
+     }
+ }
+
+ inline void PrefixAllocator::AllocateNextBuffer()
+ {
+     size_t nextSize = m_allocSize == 0 ? 1000 : 2 * m_allocSize;
+
+     // Using malloc here to prevent the ctor of Prefix being called for each item.
+     // Instead, the ctor will be called upon first access using GetPrefix
+     auto pBuff = reinterpret_cast<Prefix*>(malloc(sizeof(Prefix) * nextSize));
+
+     m_buffers.push_back(pBuff);
+
+     m_allocSize = nextSize;
+     m_currOff = 0;
+ }
+
+ template<typename ...Args>
+ Prefix *PrefixAllocator::GetPrefix(Args&& ...ctorArgs)
+ {
+     if (m_currOff == m_allocSize) {
+         AllocateNextBuffer();
+     }
+
+     auto buff = m_buffers.back() + m_currOff;
+
+     auto ret = new (buff) Prefix(std::forward<Args>(ctorArgs)...);
+
+     ++m_currOff;
+
+     return ret;
+ }
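A short usage sketch of the types above: prefixes form a reversed linked list (child to parent), so extending a beam hypothesis costs one placement-new from the allocator, and the std::hash / std::equal_to specializations compare chains by token sequence rather than by pointer identity. The token values and the score map below are made up for illustration.

// Illustrative use of Prefix / PrefixAllocator; builds the chain root -> 'H' -> 'i'.
#include <iostream>
#include <unordered_map>

#include "prefix.h"

int main()
{
    PrefixAllocator alloc;

    Prefix *root = alloc.GetPrefix();           // blank root (token 0)
    Prefix *h    = alloc.GetPrefix(72, root);   // token for 'H'
    Prefix *hi   = alloc.GetPrefix(105, h);     // token for 'i'

    // Keyed by token sequence: two distinct chains spelling the same tokens would collide.
    std::unordered_map<Prefix*, float> beamScores;
    beamScores[hi] = -1.25f;

    for (token_t tok : hi->ToList()) {          // prints "72 105"
        std::cout << tok << " ";
    }
    std::cout << "\nchain length (including root): " << hi->size() << "\n";
    return 0;
}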
nemo-retriever-ocr/cpp/beam_decode/sbo_lm.cpp ADDED
@@ -0,0 +1,47 @@
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ // All rights reserved.
+ // SPDX-License-Identifier: Apache-2.0
+
+ #include "sbo_lm.h"
+
+ #include <assert.h>
+ #include <utility>
+
+ // Reference paper: https://www.aclweb.org/anthology/D07-1090.pdf
+
+
+ SBO_LanguageModel::SBO_LanguageModel(const std::string &dataFilePath, token_mapping_t tokenMapping, float_t backoff)
+     : NGramLMBase(dataFilePath, std::move(tokenMapping)), m_backoff(backoff)
+ {
+ }
+
+ float_t SBO_LanguageModel::ScoreTransitionImpl(const std::wstring &prefix, const std::wstring &suffix) const
+ {
+     auto lIter = m_lookup[prefix.size() + 1].find(prefix);
+
+     // This prefix doesn't exist. Shrink it!
+     if (lIter == m_lookup[prefix.size() + 1].end()) {
+         return m_backoff * ScoreTransitionImpl({ begin(prefix) + 1, end(prefix) }, suffix);
+     }
+
+     const suffix_map_t &suffixMap = lIter->second;
+
+     auto sfIter = suffixMap.find(suffix);
+
+     if (sfIter == suffixMap.end()) {
+         // This is an entirely novel character!
+         if (prefix.empty()) {
+             return 1e-8;
+         } else {
+             return m_backoff * ScoreTransitionImpl({ begin(prefix) + 1, end(prefix) }, suffix);
+         }
+     }
+
+     float_t ctSuffix = sfIter->second;
+     float_t ctNgram = GetPrefixSum(prefix);
+
+     float_t score = ctSuffix / ctNgram;
+
+     assert(score >= 0 && score <= 1);
+
+     return score;
+ }
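For orientation, the scoring rule above is the Stupid Backoff scheme from the referenced paper: S(c | p) = count(p, c) / count(p) whenever the context p has been observed followed by c, and S(c | p) = backoff * S(c | p[1:]) otherwise, bottoming out at the 1e-8 floor for a character that was never seen even with an empty context. A worked example with an assumed backoff of 0.4 (the constructor takes this as a parameter): if the context "th" was counted 10 times and "th" followed by "e" 7 times, the transition scores 7 / 10 = 0.7; if "th" followed by "q" was never counted, the model instead returns 0.4 * S("q" | "h"), and so on down the shortened contexts.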
nemo-retriever-ocr/cpp/beam_decode/sbo_lm.h ADDED
@@ -0,0 +1,21 @@
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ // All rights reserved.
+ // SPDX-License-Identifier: Apache-2.0
+
+ #pragma once
+
+ #include "ngram_lm_base.h"
+
+
+ class SBO_LanguageModel
+     : public NGramLMBase
+ {
+ public:
+     SBO_LanguageModel(const std::string &dataFilePath, token_mapping_t tokenMapping, float_t backoff);
+
+ protected:
+     virtual float_t ScoreTransitionImpl(const std::wstring &prefix, const std::wstring &suffix) const override;
+
+ private:
+     float_t m_backoff;
+ };
nemo-retriever-ocr/cpp/better_grid_sample/cpu_indirect_grid_sample.cpp ADDED
@@ -0,0 +1,94 @@
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ // All rights reserved.
+ // SPDX-License-Identifier: Apache-2.0
+
+ #include "grid_sample.h"
+ #include "gpu_grid_sample_utils.cuh"
+
+ template<typename T>
+ void indirect_grid_sample_forward_bilinear(torch::TensorAccessor<T, 4> input,
+                                            torch::TensorAccessor<T, 4> grid,
+                                            torch::TensorAccessor<int64_t, 1> inputIndices,
+                                            torch::TensorAccessor<T, 4> output)
+ {
+     const int64_t N = inputIndices.size(0);
+     const int64_t C = output.size(1);
+
+     T fInputHeight = input.size(2);
+     T fInputWidth = input.size(3);
+     int64_t outputHeight = output.size(2);
+     int64_t outputWidth = output.size(3);
+
+ #pragma omp parallel for num_threads(8)
+     for (int64_t i = 0; i < N; ++i) {
+         int64_t inputIdx = inputIndices[i];
+
+         for (int64_t c = 0; c < C; ++c) {
+             for (int64_t outY = 0; outY < outputHeight; ++outY) {
+                 for (int64_t outX = 0; outX < outputWidth; ++outX) {
+                     T u = grid[i][outY][outX][0];
+                     T v = grid[i][outY][outX][1];
+
+                     if (u < -1 || u > 1 || v < -1 || v > 1) {
+                         output[i][c][outY][outX] = 0;
+                         continue;
+                     }
+
+                     // Denormalize the coordinates
+                     u = (u + 1) * ((fInputWidth - 1) / 2);
+                     v = (v + 1) * ((fInputHeight - 1) / 2);
+
+                     // Calculate coordinates
+                     const T inX = u;
+                     const T inXint = std::floor(inX);
+                     const T inXfrac = inX - inXint;
+
+                     const T inY = v;
+                     const T inYint = std::floor(inY);
+                     const T inYfrac = inY - inYint;
+
+                     T ps[] = { 1 - inXfrac, inXfrac };
+                     T rs[] = { 1 - inYfrac, inYfrac };
+                     T opVal = 0;
+
+ #pragma unroll
+                     for (int64_t row = 0; row < 2; ++row) {
+ #pragma unroll
+                         for (int64_t col = 0; col < 2; ++col) {
+                             T Tpx = utils::get_pixel_clamped(input, inputIdx, c, inXint + col, inYint + row);
+                             opVal += rs[row] * ps[col] * Tpx;
+                         }
+                     }
+
+                     output[i][c][outY][outX] = opVal;
+                 }
+             }
+         }
+     }
+ }
+
+ torch::Tensor cpu_indirect_grid_sample_forward(torch::Tensor input, torch::Tensor grid,
+                                                torch::Tensor inputIndices, const std::string &method)
+ {
+     auto output = input.new_empty({ inputIndices.size(0), input.size(1), grid.size(1), grid.size(2) });
+
+     AT_DISPATCH_FLOATING_TYPES(
+         input.scalar_type(),
+         "cpu_indirect_grid_sample_forward_impl",
+         ([&] {
+             typedef scalar_t T;
+             if (method == "bilinear") {
+                 indirect_grid_sample_forward_bilinear(
+                     input.accessor<T, 4>(),
+                     grid.accessor<T, 4>(),
+                     inputIndices.accessor<int64_t, 1>(),
+                     output.accessor<T, 4>()
+                 );
+             } else {
+                 throw std::runtime_error("Unsupported resample method: " + method);
+             }
+         })
+     );
+
+     return output;
+ }
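The core of the routine above is ordinary align-corners bilinear sampling. Below is a scalar sketch of the same math, written against a plain row-major float buffer instead of torch accessors, assuming the same (size - 1) / 2 denormalization and clamp-to-edge behaviour as the file.

// Scalar bilinear sample at normalized coordinates (u, v) in [-1, 1]; illustrative only.
#include <algorithm>
#include <cmath>
#include <cstdint>

float sample_bilinear(const float *img, int64_t height, int64_t width, float u, float v)
{
    if (u < -1.f || u > 1.f || v < -1.f || v > 1.f) {
        return 0.f;                                    // out-of-range samples produce zero
    }

    const float x = (u + 1.f) * (width  - 1) * 0.5f;   // denormalize to [0, width - 1]
    const float y = (v + 1.f) * (height - 1) * 0.5f;   // denormalize to [0, height - 1]

    const int64_t x0 = static_cast<int64_t>(std::floor(x));
    const int64_t y0 = static_cast<int64_t>(std::floor(y));
    const float fx = x - x0;
    const float fy = y - y0;

    auto px = [&](int64_t xi, int64_t yi) {
        xi = std::clamp<int64_t>(xi, 0, width - 1);    // clamp-to-edge, like get_pixel_clamped
        yi = std::clamp<int64_t>(yi, 0, height - 1);
        return img[yi * width + xi];
    };

    return (1.f - fy) * ((1.f - fx) * px(x0, y0)     + fx * px(x0 + 1, y0))
         +        fy  * ((1.f - fx) * px(x0, y0 + 1) + fx * px(x0 + 1, y0 + 1));
}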
nemo-retriever-ocr/cpp/better_grid_sample/gpu_grid_sample_utils.cuh ADDED
@@ -0,0 +1,42 @@
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ // All rights reserved.
+ // SPDX-License-Identifier: Apache-2.0
+
+ #pragma once
+
+ #include <torch/torch.h>
+
+ #include "../cuda_intellisense.cuh"
+
+ #ifndef __NVCC__
+ #include <algorithm>
+ #define __device__
+ #endif
+
+ namespace utils {
+
+ #ifdef __NVCC__
+
+ template<typename T>
+ __device__ __lib_inline__
+ T clamp(T val, T minVal, T maxVal)
+ {
+     return max(minVal, min(val, maxVal));
+ }
+
+ #else
+ using std::clamp;
+ #endif
+
+ template<typename accessor_t>
+ __device__ __lib_inline__
+ auto &get_pixel_clamped(accessor_t &inputs,
+                         int64_t n, int64_t c, int64_t x, int64_t y)
+ {
+     x = clamp<decltype(x)>(x, 0, inputs.size(3) - 1);
+     y = clamp<decltype(y)>(y, 0, inputs.size(2) - 1);
+
+     return inputs[n][c][y][x];
+ }
+
+ }
nemo-retriever-ocr/cpp/better_grid_sample/gpu_indirect_grid_sample.cu ADDED
@@ -0,0 +1,328 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #include "grid_sample.h"
6
+
7
+ #include "../cuda_intellisense.cuh"
8
+ #include "../half_ops.cuh"
9
+ #include "gpu_grid_sample_utils.cuh"
10
+
11
+ using namespace std;
12
+
13
+ template<typename accessor_t, typename index_t>
14
+ __device__ __lib_inline__
15
+ auto &my_get_pixel_clamped(accessor_t &inputs, index_t x, index_t y)
16
+ {
17
+ x = utils::clamp(x, 0, inputs.size(1) - 1);
18
+ y = utils::clamp(y, 0, inputs.size(0) - 1);
19
+
20
+ return inputs[y][x];
21
+ }
22
+
23
+ __global__
24
+ void single_ex_grid_sample_bilinear_kernel(const float *pInputImage,
25
+ uint32_t imgHeight, uint32_t imgWidth, uint32_t numChannels,
26
+ const float2 *pGrid,
27
+ uint32_t numGridCells,
28
+ float *pOutputImage)
29
+ {
30
+ const uint32_t z = blockDim.x * blockIdx.x + threadIdx.x;
31
+ const uint32_t c = blockDim.y * blockIdx.y + threadIdx.y;
32
+
33
+ if (c >= numChannels || z >= numGridCells) {
34
+ return;
35
+ }
36
+
37
+ const uint32_t g = blockIdx.z;
38
+
39
+ const float2 uv = pGrid[g * numGridCells + z];
40
+
41
+ float &outPx = pOutputImage[(g * numChannels + c) * numGridCells + z];
42
+ if (abs(uv.x) > 1.0f || abs(uv.y) > 1.0f) {
43
+ outPx = 0.0f;
44
+ } else {
45
+ const uint32_t maxX = imgWidth - 1;
46
+ const uint32_t maxY = imgHeight - 1;
47
+
48
+ const float u = (uv.x + 1.0f) * maxX * 0.5f;
49
+ const float v = (uv.y + 1.0f) * maxY * 0.5f;
50
+
51
+ // calculate coordinates
52
+ const float inX = u;
53
+ const uint32_t inXint = inX;
54
+ const float inXfrac = inX - inXint;
55
+
56
+ const float inY = v;
57
+ const uint32_t inYint = inY;
58
+ const float inYfrac = inY - inYint;
59
+
60
+ const float *pChanImage = pInputImage + c * imgHeight * imgWidth;
61
+
62
+ // By being in this conditional block, we know that u and v are >= 0, which means
63
+ // that their truncated value is also >= 0. Instead of clamping the value to within the buffer,
64
+ // we set the multiplication factor to be 0 if the interpolated value is outside the buffer
65
+ const float ps[] = { 1.0f - inXfrac, inXfrac * (inXint < maxX) };
66
+ const float rs[] = { 1.0f - inYfrac, inYfrac * (inYint < maxY) };
67
+ float opVal = 0.0f;
68
+ #pragma unroll
69
+ for (uint32_t row = 0; row < 2; ++row) {
70
+ const float *pRowImage = pChanImage + (inYint + row) * imgWidth;
71
+
72
+ #pragma unroll
73
+ for (uint32_t col = 0; col < 2; ++col) {
74
+ const float px = pRowImage[inXint + col];
75
+ opVal += rs[row] * ps[col] * px;
76
+ }
77
+ }
78
+
79
+ outPx = opVal;
80
+ }
81
+ }
82
+
83
+ template<typename T>
84
+ __global__
85
+ void indirect_grid_sample_forward_bilinear_kernel(torch::PackedTensorAccessor32<T, 4> inputs,
86
+ torch::PackedTensorAccessor32<T, 4> grid,
87
+ torch::PackedTensorAccessor32<int64_t, 1> inputIndices,
88
+ torch::PackedTensorAccessor32<T, 4> outputs)
89
+ {
90
+ static_assert(std::is_same<T, float>::value, "Currently only float32 is supported!");
91
+ //typedef typename fp_promote<T>::type accum_t;
92
+ typedef float accum_t;
93
+ constexpr T NEG_ONE = -1;
94
+ constexpr T ONE = 1;
95
+ constexpr T ZERO = 0;
96
+ constexpr T TWO = 2;
97
+ constexpr T ZERO_PT_5 = 0.5;
98
+ typedef decltype(inputs.stride(0)) index_t;
99
+
100
+ const index_t n = blockDim.z * blockIdx.z + threadIdx.z;
101
+
102
+ if (n >= inputIndices.size(0)) return;
103
+
104
+ const index_t c = blockDim.y * blockIdx.y + threadIdx.y;
105
+
106
+ const index_t z = blockDim.x * blockIdx.x + threadIdx.x;
107
+
108
+ const accum_t inputHeight = inputs.size(2);
109
+ const accum_t inputWidth = inputs.size(3);
110
+ const index_t outputHeight = outputs.size(2);
111
+ const index_t outputWidth = outputs.size(3);
112
+
113
+ const index_t outY = z / outputWidth;
114
+ //const index_t outX = z % outputWidth;
115
+ const index_t outX = z - (outY * outputWidth);
116
+
117
+ if (outY >= outputHeight) return;
118
+
119
+ index_t inputIdx = inputIndices[n];
120
+ const float2 f2uv = *reinterpret_cast<const float2*>(grid[n][outY][outX].data());
121
+ float u = f2uv.x;
122
+ float v = f2uv.y;
123
+
124
+ if (u < NEG_ONE || u > ONE || v < NEG_ONE || v > ONE) {
125
+ outputs[n][c][outY][outX] = ZERO;
126
+ return;
127
+ }
128
+
129
+ // Denormalize the coordinates
130
+ u = (u + ONE) * ((inputWidth - ONE) * ZERO_PT_5);
131
+ v = (v + ONE) * ((inputHeight - ONE) * ZERO_PT_5);
132
+
133
+ // calculate coordinates
134
+ const accum_t inX = u;
135
+ const index_t inXint = inX;
136
+ const accum_t inXfrac = inX - inXint;
137
+
138
+ const accum_t inY = v;
139
+ const index_t inYint = inY;
140
+ const accum_t inYfrac = inY - inYint;
141
+
142
+ accum_t ps[] = { ONE - inXfrac, inXfrac };
143
+ accum_t rs[] = { ONE - inYfrac, inYfrac };
144
+ accum_t opVal = ZERO;
145
+
146
+ auto localInputs = inputs[inputIdx][c];
147
+
148
+ #pragma unroll
149
+ for (index_t row = 0; row < 2; ++row) {
150
+ #pragma unroll
151
+ for (index_t col = 0; col < 2; ++col) {
152
+ T Tpx = my_get_pixel_clamped(localInputs, inXint + col, inYint + row);
153
+ opVal += rs[row] * ps[col] * Convert<T, accum_t>::LeftToRight(Tpx);
154
+ }
155
+ }
156
+
157
+ outputs[n][c][outY][outX] = Convert<T, accum_t>::RightToLeft(opVal);
158
+ }
159
+
160
+ template<typename T>
161
+ __global__
162
+ void indirect_grid_sample_backward_bilinear_kernel(torch::PackedTensorAccessor64<T, 4> inputs,
163
+ torch::PackedTensorAccessor64<T, 4> grid,
164
+ torch::PackedTensorAccessor64<int64_t, 1> inputIndices,
165
+ torch::PackedTensorAccessor64<T, 4> gradOutput,
166
+ torch::PackedTensorAccessor64<T, 4> gradInput,
167
+ torch::PackedTensorAccessor64<T, 4> gradGrid)
168
+ {
169
+ typedef typename fp_promote<T>::type accum_t;
170
+ constexpr T NEG_ONE = -1;
171
+ constexpr T ONE = 1;
172
+
173
+ const int64_t n = blockDim.z * blockIdx.z + threadIdx.z;
174
+
175
+ if (n >= inputIndices.size(0)) return;
176
+
177
+ const int64_t c = blockDim.y * blockIdx.y + threadIdx.y;
178
+
179
+ const int64_t z = blockDim.x * blockIdx.x + threadIdx.x;
180
+
181
+ const accum_t inputHeight = inputs.size(2);
182
+ const accum_t inputWidth = inputs.size(3);
183
+ const int64_t outputHeight = gradOutput.size(2);
184
+ const int64_t outputWidth = gradOutput.size(3);
185
+
186
+ const int64_t outY = z / outputWidth;
187
+ const int64_t outX = z % outputWidth;
188
+
189
+ if (outY >= outputHeight) return;
190
+
191
+ int64_t inputIdx = inputIndices[n];
192
+ const float2 f2uv = *reinterpret_cast<const float2*>(grid[n][outY][outX].data());
193
+ float u = f2uv.x;
194
+ float v = f2uv.y;
195
+
196
+ // No output gradient contribution from this position
197
+ if (u < NEG_ONE || u > ONE || v < NEG_ONE || v > ONE) {
198
+ return;
199
+ }
200
+
201
+ // Denormalize the coordinates
202
+ u = (u + 1) * ((inputWidth - 1) / 2);
203
+ v = (v + 1) * ((inputHeight - 1) / 2);
204
+
205
+ // calculate coordinates
206
+ const accum_t inX = u;
207
+ const accum_t inXint = floor(inX);
208
+ const accum_t inXfrac = inX - inXint;
209
+
210
+ const accum_t inY = v;
211
+ const accum_t inYint = floor(inY);
212
+ const accum_t inYfrac = inY - inYint;
213
+
214
+ accum_t ps[] = { 1 - inXfrac, inXfrac };
215
+ accum_t rs[] = { 1 - inYfrac, inYfrac };
216
+
217
+ const accum_t gOut = Convert<T, accum_t>::LeftToRight(gradOutput[n][c][outY][outX]);
218
+
219
+ #pragma unroll
220
+ for (size_t row = 0; row < 2; ++row) {
221
+ #pragma unroll
222
+ for (size_t col = 0; col < 2; ++col) {
223
+ T &gIn = utils::get_pixel_clamped(gradInput, inputIdx, c, inXint + col, inYint + row);
224
+
225
+ T gContrib = Convert<T, accum_t>::RightToLeft(rs[row] * ps[col] * gOut);
226
+
227
+ atomicAdd(&gIn, gContrib);
228
+ }
229
+ }
230
+ }
231
+
232
+ torch::Tensor gpu_indirect_grid_sample_forward(torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method)
233
+ {
234
+ auto output = input.new_empty({ inputIndices.size(0), input.size(1), grid.size(1), grid.size(2) });
235
+
236
+
237
+ if (method != "bilinear"s) {
238
+ throw runtime_error("Only 'bilinear' sampling is currently supported!");
239
+ }
240
+
241
+ if (input.size(0) == 1 && input.is_contiguous() && grid.is_contiguous()) {
242
+ uint32_t gridNumCells = grid.size(1) * grid.size(2);
243
+ dim3 blockDim(32, 3, 1);
244
+ dim3 gridDim(div_up(gridNumCells, blockDim.x),
245
+ div_up(input.size(1), blockDim.y),
246
+ div_up(grid.size(0), blockDim.z));
247
+ single_ex_grid_sample_bilinear_kernel KERNEL_ARG2(gridDim, blockDim) (
248
+ input.data_ptr<float>(),
249
+ input.size(2), input.size(3), input.size(1),
250
+ reinterpret_cast<const float2*>(grid.data_ptr()),
251
+ gridNumCells,
252
+ output.data_ptr<float>()
253
+ );
254
+
255
+ } else {
256
+ // z is batch idx
257
+ // y is channel
258
+ // x is w*h
259
+ dim3 blockDim(32, 1, 3);
260
+ dim3 gridDim(div_up(grid.size(1) * grid.size(2), blockDim.x),
261
+ div_up(input.size(1), blockDim.y),
262
+ div_up(inputIndices.size(0), blockDim.z));
263
+ indirect_grid_sample_forward_bilinear_kernel KERNEL_ARG2(gridDim, blockDim) (
264
+ input.packed_accessor32<float, 4>(),
265
+ grid.packed_accessor32<float, 4>(),
266
+ inputIndices.packed_accessor32<int64_t, 1>(),
267
+ output.packed_accessor32<float, 4>()
268
+ );
269
+ }
270
+
271
+ //AT_DISPATCH_FLOATING_TYPES_AND_HALF(
272
+ // input.scalar_type(),
273
+ // "gpu_indirect_grid_sample_forward",
274
+ // ([&] {
275
+ // typedef typename remap_half<scalar_t>::type T;
276
+ // // typedef scalar_t T;
277
+ // if (method == "bilinear") {
278
+ // indirect_grid_sample_forward_bilinear_kernel KERNEL_ARG2(gridDim, blockDim) (
279
+ // input.packed_accessor64<T, 4>(),
280
+ // grid.packed_accessor64<T, 4>(),
281
+ // inputIndices.packed_accessor64<int64_t, 1>(),
282
+ // output.packed_accessor64<T, 4>()
283
+ // );
284
+ // } else {
285
+ // throw runtime_error("Unsupported resample method: " + method);
286
+ // }
287
+ // })
288
+ //);
289
+
290
+ return output;
291
+ }
292
+
293
+ std::vector<torch::Tensor> gpu_indirect_grad_sample_backward(torch::Tensor gradOutput, torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method)
294
+ {
295
+ auto gradInput = torch::zeros_like(input);
296
+ auto gradGrid = torch::zeros_like(grid);
297
+
298
+ // z is batch idx
299
+ // y is channel
300
+ // x is w*h
301
+ dim3 blockDim(32, 1, 1);
302
+ dim3 gridDim(div_up(grid.size(1) * grid.size(2), blockDim.x),
303
+ div_up(input.size(1), blockDim.y),
304
+ div_up(inputIndices.size(0), blockDim.z));
305
+
306
+ AT_DISPATCH_FLOATING_TYPES(
307
+ input.scalar_type(),
308
+ "gpu_indirect_grid_sample_backward",
309
+ ([&] {
310
+ typedef typename remap_half<scalar_t>::type T;
311
+ // typedef scalar_t T;
312
+ if (method == "bilinear") {
313
+ indirect_grid_sample_backward_bilinear_kernel KERNEL_ARG2(gridDim, blockDim) (
314
+ input.packed_accessor64<T, 4>(),
315
+ grid.packed_accessor64<T, 4>(),
316
+ inputIndices.packed_accessor64<int64_t, 1>(),
317
+ gradOutput.packed_accessor64<T, 4>(),
318
+ gradInput.packed_accessor64<T, 4>(),
319
+ gradGrid.packed_accessor64<T, 4>()
320
+ );
321
+ } else {
322
+ throw runtime_error("Unsupported resample method: " + method);
323
+ }
324
+ })
325
+ );
326
+
327
+ return { gradInput, gradGrid };
328
+ }
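For the backward pass above, a useful mental model is that each output gradient is scattered back to the four input pixels that produced it, using the same bilinear weights as the forward pass, with atomicAdd resolving collisions between threads. Below is a minimal single-channel kernel sketch of that scatter; it is not the kernel above (whose accessors and indexing are richer), just the core arithmetic under those assumptions.

// Sketch: scatter one output gradient gOut at continuous position (x, y) into gradInput,
// an H x W single-channel buffer, using bilinear weights and clamp-to-edge addressing.
__global__ void scatter_bilinear_grad(float *gradInput, int height, int width,
                                      float x, float y, float gOut)
{
    const int x0 = static_cast<int>(floorf(x));
    const int y0 = static_cast<int>(floorf(y));
    const float fx = x - x0;
    const float fy = y - y0;
    const float wx[2] = { 1.f - fx, fx };
    const float wy[2] = { 1.f - fy, fy };

    for (int r = 0; r < 2; ++r) {
        for (int c = 0; c < 2; ++c) {
            const int xi = min(max(x0 + c, 0), width - 1);   // clamp-to-edge
            const int yi = min(max(y0 + r, 0), height - 1);
            // Several samples may target the same pixel, hence the atomic accumulate.
            atomicAdd(&gradInput[yi * width + xi], wy[r] * wx[c] * gOut);
        }
    }
}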
nemo-retriever-ocr/cpp/better_grid_sample/grid_sample.h ADDED
@@ -0,0 +1,67 @@
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ // All rights reserved.
+ // SPDX-License-Identifier: Apache-2.0
+
+ #pragma once
+
+ #include <torch/torch.h>
+
+ inline
+ torch::Tensor region_counts_to_indices(torch::Tensor regionCounts, int64_t numOutputs)
+ {
+     // If there's only one example, we can trivially return idx 0 for all
+     if (regionCounts.size(0) == 1) {
+         return torch::zeros({ numOutputs }, regionCounts.options().dtype(torch::kInt64));
+     }
+
+     // regionCounts will be some tensor like [ 5, 1, 10, 2 ] which means that the first 5 outputs
+     // correspond to the first input, the next output to the second input, 10 to the third, and so on.
+
+     // We want to convert this to instead have an entry for each output which specifies the index of the corresponding input.
+     // To do this, we can count the number of times the output index exceeds the cumulative input counts.
+     // e.g. the cumulative region count for the above tensor is [ 5, 6, 16, 18 ].
+     // The output indices 0-4 are not greater than or equal to any cumulative count, so they get the input index of 0.
+     // The output index 5 is equal to a single count, therefore index 1.
+     // The outputs 6-15 are all greater than or equal to two cumulative counts, therefore index 2.
+     // And so on.
+
+     auto indices = torch::arange(regionCounts.size(0), regionCounts.options().dtype(torch::kInt64));
+
+     auto outputIndices = torch::repeat_interleave(indices, regionCounts, /*dim=*/ 0, /*output_size=*/ numOutputs);
+
+     return outputIndices;
+ }
+
+ torch::Tensor gpu_indirect_grid_sample_forward(torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method);
+ torch::Tensor cpu_indirect_grid_sample_forward(torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method);
+ std::vector<torch::Tensor> gpu_indirect_grad_sample_backward(torch::Tensor gradOutput, torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method);
+
+ inline
+ torch::Tensor indirect_grid_sample_forward(torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method)
+ {
+     if (input.is_cuda() != grid.is_cuda() || input.is_cuda() != inputIndices.is_cuda()) {
+         throw std::runtime_error("Input tensors must all be on the same device!");
+     }
+     if (inputIndices.size(0) != grid.size(0)) {
+         throw std::runtime_error("The batch dimensions must match!");
+     }
+     if (grid.size(-1) != 2) {
+         throw std::runtime_error("The final grid dimension must be 2.");
+     }
+
+     if (input.is_cuda()) {
+         return gpu_indirect_grid_sample_forward(std::move(input), std::move(grid), std::move(inputIndices), method);
+     } else {
+         return cpu_indirect_grid_sample_forward(std::move(input), std::move(grid), std::move(inputIndices), method);
+     }
+ }
+
+ inline
+ std::vector<torch::Tensor> indirect_grad_sample_backward(torch::Tensor gradOutput, torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method)
+ {
+     if (gradOutput.is_cuda()) {
+         return gpu_indirect_grad_sample_backward(std::move(gradOutput), std::move(input), std::move(grid), std::move(inputIndices), method);
+     } else {
+         throw std::runtime_error("Not implemented!");
+     }
+ }
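A quick check of the mapping that region_counts_to_indices describes, using the worked example from its own comment; the main() harness is illustrative only and just needs to link against libtorch.

// Counts [5, 1, 10, 2] expand to 18 per-output input indices.
#include <iostream>
#include <torch/torch.h>

#include "grid_sample.h"

int main()
{
    auto counts = torch::tensor({5, 1, 10, 2}, torch::kInt64);
    auto idx = region_counts_to_indices(counts, /*numOutputs=*/ 18);
    // indices 0..4 map to input 0, index 5 to input 1, 6..15 to input 2, 16..17 to input 3
    std::cout << idx << std::endl;
    return 0;
}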
nemo-retriever-ocr/cpp/common.cpp ADDED
@@ -0,0 +1,13 @@
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ // All rights reserved.
+ // SPDX-License-Identifier: Apache-2.0
+
+ #include "common.h"
+
+ #include <iostream>
+
+ using namespace std;
+
+ void print_tensor(const torch::Tensor &t) {
+     cout << t << endl;
+ }
nemo-retriever-ocr/cpp/common.h ADDED
@@ -0,0 +1,58 @@
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ // All rights reserved.
+ // SPDX-License-Identifier: Apache-2.0
+
+ #pragma once
+
+ #include <ostream>
+ #include <tuple>
+ #include <vector>
+
+ #include <torch/torch.h>
+
+ template<typename T>
+ inline
+ std::ostream &operator<<(std::ostream &os, const std::vector<T> &v) {
+     os << "[";
+     if (! v.empty()) {
+         os << v[0];
+         for (size_t i = 1; i < v.size(); ++i) {
+             os << ", " << v[i];
+         }
+     }
+     os << "]";
+     return os;
+ }
+
+ template<int Counter, typename ...Args>
+ struct _inner_tuple_print
+ {
+     inline
+     static std::ostream &print(std::ostream &os, const std::tuple<Args...> &t) {
+         _inner_tuple_print<Counter - 1, Args...>::print(os, t);
+
+         os << ", " << std::get<Counter>(t);
+         return os;
+     }
+ };
+
+ template<typename ...Args>
+ struct _inner_tuple_print<0, Args...>
+ {
+     inline
+     static std::ostream &print(std::ostream &os, const std::tuple<Args...> &t) {
+         os << std::get<0>(t);
+         return os;
+     }
+ };
+
+
+ template<typename... Args>
+ inline
+ std::ostream &operator<<(std::ostream &os, const std::tuple<Args...> &t) {
+     os << "(";
+     _inner_tuple_print<sizeof...(Args) - 1, Args...>::print(os, t);
+     os << ")";
+     return os;
+ }
+
+ void print_tensor(const torch::Tensor &t);
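Small usage example for the stream helpers above; the values are arbitrary.

// Prints "[1, 2, 3]" followed by "(7, x, 2.5)".
#include <iostream>
#include <string>
#include <tuple>
#include <vector>

#include "common.h"

int main()
{
    std::vector<int> v{ 1, 2, 3 };
    auto t = std::make_tuple(7, std::string("x"), 2.5);
    std::cout << v << " " << t << std::endl;
    return 0;
}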
nemo-retriever-ocr/cpp/cuda_intellisense.cuh ADDED
@@ -0,0 +1,51 @@
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ // All rights reserved.
+ // SPDX-License-Identifier: Apache-2.0
+
+ #pragma once
+
+ #if defined(__INTELLISENSE__) || !defined(__NVCC__)
+ #ifndef KERNEL_ARG2
+ #define KERNEL_ARG2(grid, block)
+ #define KERNEL_ARG3(grid, block, sh_mem)
+ #define KERNEL_ARG4(grid, block, sh_mem, stream)
+ #define __global__
+ #define __device__
+ #define __host__
+ #endif
+ #endif
+
+ #ifdef __INTELLISENSE__
+ #define __CUDACC__
+ #include <cuda_runtime.h>
+
+ void __syncthreads(); // workaround __syncthreads warning
+
+ dim3 threadIdx;
+ dim3 blockIdx;
+ dim3 blockDim;
+ dim3 gridDim;
+
+ #else
+ #ifndef KERNEL_ARG2
+ #define KERNEL_ARG2(grid, block) <<< grid, block >>>
+ #define KERNEL_ARG3(grid, block, sh_mem) <<< grid, block, sh_mem >>>
+ #define KERNEL_ARG4(grid, block, sh_mem, stream) <<< grid, block, sh_mem, stream >>>
+ #endif
+ #endif
+
+ #define __any_device__ __host__ __device__
+
+ #ifdef __NVCC__
+ #define __lib_inline__ __forceinline__
+
+ #else
+ #define __lib_inline__ inline
+ #endif
+
+ template<typename T1, typename T2>
+ __any_device__
+ inline auto div_up(T1 n, T2 d)
+ {
+     return (n + d - 1) / d;
+ }
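Typical use of div_up and the KERNEL_ARG macros above when configuring a launch; the kernel and the block size are placeholders. Under nvcc the macro expands to the usual <<< >>> launch syntax, and to nothing for IntelliSense or host-only parsing.

// Placeholder kernel and launcher in a .cu file; illustrative only.
#include "cuda_intellisense.cuh"

__global__ void fill_ones(float *out, int n)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        out[i] = 1.0f;
    }
}

void launch_fill_ones(float *out, int n)
{
    const dim3 block(256);
    const dim3 grid(div_up(n, block.x));   // round up so every element is covered
    fill_ones KERNEL_ARG2(grid, block) (out, n);
}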
nemo-retriever-ocr/cpp/geometry.h ADDED
@@ -0,0 +1,1101 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #pragma once
6
+
7
+ #include <algorithm>
8
+ #include <cmath>
9
+ #include <iostream>
10
+ #include <type_traits>
11
+
12
+ #ifndef _GEOMETRY_NO_TORCH
13
+ #include <torch/torch.h>
14
+ #endif
15
+
16
+ #include "cuda_intellisense.cuh"
17
+
18
+ #ifndef __NVCC__
19
+ #define SORT_ALGO std::sort
20
+ #define SWAP std::swap
21
+
22
+ template<typename ...Args>
23
+ using tuple_t = std::tuple<Args...>;
24
+
25
+ #else
26
+
27
+ #include <thrust/sort.h>
28
+ #include <thrust/tuple.h>
29
+
30
+ #define SORT_ALGO thrust::sort
31
+ #define SWAP thrust::swap
32
+
33
+ template<typename ...Args>
34
+ using tuple_t = thrust::tuple<Args...>;
35
+ #endif
36
+
37
+ template<typename T>
38
+ struct Point_ {
39
+ typedef T inner_type;
40
+
41
+ T X, Y;
42
+
43
+ Point_() = default;
44
+
45
+ __any_device__
46
+ Point_(T x, T y) : X(x), Y(y) {}
47
+
48
+ __any_device__
49
+ Point_(T *ptr) : X(ptr[0]), Y(ptr[1]) {}
50
+
51
+ #ifndef _GEOMETRY_NO_TORCH
52
+ template<typename T2>
53
+ __any_device__
54
+ Point_(const torch::TensorAccessor<T2, 1> &accessor) : X(accessor[0]), Y(accessor[1]) {}
55
+
56
+ template<typename T2>
57
+ __any_device__
58
+ Point_(const torch::PackedTensorAccessor64<T2, 1> &accessor) : X(accessor[0]), Y(accessor[1]) {}
59
+ #endif
60
+
61
+ __any_device__
62
+ Point_ &operator+=(const Point_ &other);
63
+
64
+ __any_device__
65
+ Point_ &operator-=(const Point_ &other);
66
+
67
+ __any_device__
68
+ Point_ &operator*=(const Point_ &other);
69
+
70
+ __any_device__
71
+ Point_ &operator/=(const Point_ &other);
72
+
73
+ template<typename W>
74
+ __any_device__
75
+ Point_ &operator/=(W w);
76
+
77
+ template<typename W>
78
+ __any_device__
79
+ Point_ &operator*=(W w);
80
+
81
+ __any_device__
82
+ Point_ operator-() {
83
+ return { -X, -Y };
84
+ }
85
+
86
+ __any_device__
87
+ T Sum() const { return X + Y; }
88
+
89
+ __any_device__
90
+ T Angle() const;
91
+
92
+ __any_device__
93
+ void swap(Point_ &other) noexcept {
94
+ SWAP(X, other.X);
95
+ SWAP(Y, other.Y);
96
+ }
97
+ };
98
+
99
+ template<typename T>
100
+ __lib_inline__ __any_device__
101
+ void swap(Point_<T> &a, Point_<T> &b) {
102
+ a.swap(b);
103
+ }
104
+
105
+
106
+ template<typename T>
107
+ __any_device__
108
+ __lib_inline__ T Point_<T>::Angle() const {
109
+ #ifndef __NVCC__
110
+ using std::atan2;
111
+ #endif
112
+ return atan2(Y, X);
113
+ }
114
+
115
+ template<typename T>
116
+ __any_device__
117
+ __lib_inline__ Point_<T> min(const Point_<T> &a, const Point_<T> &b) {
118
+ #ifndef __NVCC__
119
+ using std::min;
120
+ #endif
121
+ return {
122
+ min(a.X, b.X),
123
+ min(a.Y, b.Y)
124
+ };
125
+ }
126
+
127
+ template<typename T>
128
+ __any_device__
129
+ __lib_inline__ Point_<T> max(const Point_<T> &a, const Point_<T> &b) {
130
+ #ifndef __NVCC__
131
+ using std::max;
132
+ #endif
133
+ return {
134
+ max(a.X, b.X),
135
+ max(a.Y, b.Y)
136
+ };
137
+ }
138
+
139
+ template<typename T>
140
+ struct AABB_ {
141
+ typedef T inner_type;
142
+
143
+ T X;
144
+ T Y;
145
+ T MaxX;
146
+ T MaxY;
147
+
148
+ AABB_() = default;
149
+ __any_device__
150
+ AABB_(T x, T y, T maxX, T maxY)
151
+ : X(x), Y(y), MaxX(maxX), MaxY(maxY) {}
152
+
153
+ __any_device__
154
+ bool Contains(const Point_<T> &p) const {
155
+ return p.X >= X && p.X < MaxX &&
156
+ p.Y >= Y && p.Y < MaxY;
157
+ }
158
+
159
+ __any_device__ __lib_inline__
160
+ AABB_ Union(const AABB_ &other) const {
161
+ #ifndef __NVCC__
162
+ using std::min;
163
+ using std::max;
164
+ #endif
165
+ T minX = min(X, other.X);
166
+ T maxX = max(MaxX, other.MaxX);
167
+ T minY = min(Y, other.Y);
168
+ T maxY = max(MaxY, other.MaxY);
169
+
170
+ return { minX, minY, maxX, maxY };
171
+ }
172
+
173
+ __any_device__
174
+ AABB_ &operator-=(const Point_<T> &offset) {
175
+ X -= offset.X;
176
+ MaxX -= offset.X;
177
+ Y -= offset.Y;
178
+ MaxY -= offset.Y;
179
+ return *this;
180
+ }
181
+
182
+ __any_device__
183
+ __lib_inline__ T Width() const { return MaxX - X; }
184
+ __any_device__
185
+ __lib_inline__ T Height() const { return MaxY - Y; }
186
+ __any_device__
187
+ __lib_inline__ T Area() const { return Width() * Height(); }
188
+
189
+ __lib_inline__ T &operator[] (int64_t idx)
190
+ {
191
+ static_assert(std::is_standard_layout<AABB_<T>>::value, "This function is only valid for standard layout");
192
+ return (&X)[idx];
193
+ }
194
+ __lib_inline__ T operator[] (int64_t idx) const
195
+ {
196
+ static_assert(std::is_standard_layout<AABB_<T>>::value, "This function is only valid for standard layout");
197
+ return (&X)[idx];
198
+ }
199
+
200
+ __any_device__ __lib_inline__
201
+ AABB_ Intersection(const AABB_ &other) const {
202
+ #ifndef __NVCC__
203
+ using std::min;
204
+ using std::max;
205
+ #endif
206
+ T minX = max(X, other.X);
207
+ T minY = max(Y, other.Y);
208
+ T maxX = min(MaxX, other.MaxX);
209
+ T maxY = min(MaxY, other.MaxY);
210
+ // Prevent negative area
211
+ minX = min(minX, maxX);
212
+ minY = min(minY, maxY);
213
+ return { minX, minY, maxX, maxY };
214
+ }
215
+
216
+ __any_device__ __lib_inline__
217
+ T IntersectionArea(const AABB_ &other) const { return Intersection(other).Area(); }
218
+ };
219
+
220
+ template<typename T, typename Derived>
221
+ struct QuadBase_ {
222
+ typedef T inner_type;
223
+
224
+ __any_device__
225
+ AABB_<T> Bounds() const;
226
+
227
+ __any_device__
228
+ bool Contains(const Point_<T> &p) const;
229
+
230
+ __any_device__
231
+ T Area() const;
232
+
233
+ __any_device__
234
+ T Height() const;
235
+
236
+ __any_device__
237
+ T Width() const;
238
+
239
+ template<typename Derived2>
240
+ __any_device__
241
+ T IntersectionArea(const QuadBase_<T, Derived2> &other) const;
242
+
243
+ template<typename Derived2>
244
+ __any_device__
245
+ T IOU(const QuadBase_<T, Derived2> &other) const;
246
+
247
+ template<typename Derived2>
248
+ __any_device__
249
+ T IOU_UpperBound(const QuadBase_<T, Derived2> &other) const;
250
+
251
+ __any_device__
252
+ Point_<T> Center() const;
253
+
254
+ template<typename Derived2>
255
+ __any_device__
256
+ /*
257
+ Returns 3 geometric associations between the two quads:
258
+ 0: The percent shared area between this and other relative to this (e.g. if other contains this, then it returns 1)
259
+ 1: The percent shared area between other and this relative to other (e.g. if this contains other, then it return 1)
260
+ 2: The IOU of the two quads
261
+ */
262
+ tuple_t<T, T, T> RegionSizes(const QuadBase_<T, Derived2> &other) const;
263
+
264
+ template<typename Derived2>
265
+ __any_device__
266
+ tuple_t<T, T, T> RegionSizes_UpperBound(const QuadBase_<T, Derived2> &other) const;
267
+
268
+ __any_device__
269
+ Derived &operator/=(T val) {
270
+ auto rcp = 1 / val;
271
+ return *this *= rcp;
272
+ }
273
+
274
+ __any_device__
275
+ Derived &operator*=(T val) {
276
+ auto dThis = static_cast<Derived*>(this);
277
+ #pragma unroll
278
+ for (size_t i = 0; i < 4; ++i) {
279
+ dThis->Vertices[i] *= val;
280
+ }
281
+ return *dThis;
282
+ }
283
+
284
+ friend auto begin(const QuadBase_ &q) { return static_cast<const Derived&>(q).Vertices; }
285
+ friend auto begin(QuadBase_& q) { return static_cast<const Derived&>(q).Vertices; }
286
+ friend auto end(const QuadBase_ &q) { return static_cast<const Derived&>(q).Vertices + 4; }
287
+ friend auto end(QuadBase_ &q) { return static_cast<const Derived&>(q).Vertices + 4; }
288
+ };
289
+
290
+ template<typename T>
291
+ struct Quad_ : QuadBase_<T, Quad_<T>> {
292
+ Point_<T> *Vertices = nullptr;
293
+
294
+ Quad_() = default;
295
+ __any_device__
296
+ Quad_(T *dataPtr)
297
+ : Vertices(reinterpret_cast<Point_<T>*>(dataPtr)) {}
298
+ __any_device__
299
+ Quad_(Point_<T> *dataPtr)
300
+ : Vertices(dataPtr) {}
301
+
302
+ template<typename index_t>
303
+ __any_device__ __lib_inline__
304
+ const Point_<T> &operator[](index_t offset) const { return Vertices[offset]; }
305
+ template<typename index_t>
306
+ __any_device__ __lib_inline__
307
+ Point_<T> &operator[](index_t offset) { return Vertices[offset]; }
308
+ };
309
+
310
+ template<typename T>
311
+ struct InPlaceQuad_ : public QuadBase_<T, InPlaceQuad_<T>> {
312
+ Point_<T> Vertices[4];
313
+
314
+ InPlaceQuad_() = default;
315
+ __any_device__
316
+ InPlaceQuad_(const T *dataPtr)
317
+ {
318
+ #if defined(__NVCC__)
319
+ T *pVals = reinterpret_cast<T*>(Vertices);
320
+ #pragma unroll
321
+ for (uint32_t i = 0; i < 8; ++i) {
322
+ pVals[i] = dataPtr[i];
323
+ }
324
+ #else
325
+ using std::copy;
326
+ copy(dataPtr, dataPtr + 8, reinterpret_cast<T*>(Vertices));
327
+ #endif
328
+ }
329
+ __any_device__
330
+ InPlaceQuad_(const Point_<T> *dataPtr)
331
+ {
332
+ #if defined(__NVCC__)
333
+ #pragma unroll
334
+ for (uint32_t i = 0; i < 4; ++i) {
335
+ Vertices[i] = dataPtr[i];
336
+ }
337
+ #else
338
+ using std::copy;
339
+ copy(dataPtr, dataPtr + 4, Vertices);
340
+ #endif
341
+ }
342
+
343
+ template<typename index_t>
344
+ __any_device__ __lib_inline__
345
+ const Point_<T> &operator[](index_t v) const { return Vertices[v]; }
346
+
347
+ template<typename index_t>
348
+ __any_device__ __lib_inline__
349
+ Point_<T> &operator[](index_t v) { return Vertices[v]; }
350
+ };
351
+
352
+ template<typename T, typename Derived>
353
+ struct PolygonBase_ {
354
+ typedef T inner_type;
355
+
356
+ __any_device__
357
+ AABB_<T> Bounds() const;
358
+
359
+ __any_device__
360
+ bool Contains(const Point_<T> &p) const;
361
+
362
+ __any_device__
363
+ T EdgeLength() const;
364
+
365
+ __any_device__
366
+ Point_<T> Center() const;
367
+
368
+ __any_device__
369
+ T Area() const;
370
+ };
371
+
372
+ template<typename T>
373
+ struct Polygon_ : PolygonBase_<T, Polygon_<T>> {
374
+ Point_<T> *Vertices = nullptr;
375
+ size_t Count = 0;
376
+
377
+ Polygon_() = default;
378
+ __any_device__
379
+ Polygon_(T *dataPtr, size_t vertexCount)
380
+ : Vertices(reinterpret_cast<Point_<T>*>(dataPtr)), Count(vertexCount) {}
381
+ __any_device__
382
+ Polygon_(Point_<T> *dataPtr, size_t vertexCount)
383
+ : Vertices(dataPtr), Count(vertexCount) {}
384
+
385
+ __any_device__
386
+ const Point_<T> &operator[](size_t offset) const { return Vertices[offset]; }
387
+ __any_device__
388
+ Point_<T> &operator[](size_t offset) { return Vertices[offset]; }
389
+ };
390
+
391
+ template<typename T>
392
+ struct Segment_ {
393
+ Point_<T> A, B;
394
+
395
+ Segment_() = default;
396
+ __any_device__
397
+ Segment_(const Point_<T> &a, const Point_<T> &b) : A(a), B(b) {}
398
+
399
+ __any_device__
400
+ T Length() const;
401
+ __any_device__
402
+ T LengthSq() const;
403
+ __any_device__
404
+ bool Intersection(const Segment_<T> &other, Point_<T> &out_ptAlong) const;
405
+ };
406
+
407
+ template<typename T>
408
+ __any_device__
409
+ __lib_inline__ Point_<T> operator+(const Point_<T> &a, const Point_<T> &b) {
410
+ return { a.X + b.X, a.Y + b.Y };
411
+ }
412
+
413
+ template<typename T>
414
+ __any_device__
415
+ __lib_inline__ Point_<T> operator-(const Point_<T> &a, const Point_<T> &b) {
416
+ return { a.X - b.X, a.Y - b.Y };
417
+ }
418
+
419
+ template<typename T, typename W>
420
+ __any_device__
421
+ __lib_inline__ Point_<T> operator*(W scale, const Point_<T> &p) {
422
+ return { scale * p.X, scale * p.Y };
423
+ }
424
+
425
+ template<typename T, typename W>
426
+ __any_device__
427
+ __lib_inline__ Point_<T> operator*(const Point_<T> &p, W scale) {
428
+ return { scale * p.X, scale * p.Y };
429
+ }
430
+
431
+ template<typename T, typename W>
432
+ __any_device__
433
+ __lib_inline__ Point_<T> operator/(const Point_<T> &p, W divisor) {
434
+ return { p.X / divisor, p.Y / divisor };
435
+ }
436
+
437
+ template<typename T>
438
+ __any_device__
439
+ __lib_inline__ Point_<T> operator*(const Point_<T> &a, const Point_<T> &b) {
440
+ return { a.X * b.X, a.Y * b.Y };
441
+ }
442
+
443
+ template<typename T, typename W>
444
+ __any_device__
445
+ __lib_inline__ Point_<T> operator-(const Point_<T> &p, W v) {
446
+ return { p.X - v, p.Y - v };
447
+ }
448
+
449
+ template<typename T>
450
+ __any_device__
451
+ __lib_inline__ Point_<T> &Point_<T>::operator+=(const Point_<T> &p) {
452
+ X = X + p.X;
453
+ Y = Y + p.Y;
454
+ return *this;
455
+ }
456
+
457
+ template<typename T>
458
+ __any_device__
459
+ __lib_inline__ Point_<T> &Point_<T>::operator-=(const Point_<T> &p) {
460
+ X = X - p.X;
461
+ Y = Y - p.Y;
462
+ return *this;
463
+ }
464
+
465
+ template<typename T>
466
+ __any_device__
467
+ __lib_inline__ Point_<T> &Point_<T>::operator*=(const Point_<T> &p) {
468
+ X = X * p.X;
469
+ Y = Y * p.Y;
470
+ return *this;
471
+ }
472
+
473
+ template<typename T>
474
+ __any_device__
475
+ __lib_inline__ Point_<T> &Point_<T>::operator/=(const Point_<T> &p) {
476
+ X = X / p.X;
477
+ Y = Y / p.Y;
478
+ return *this;
479
+ }
480
+
481
+ template<typename T>
482
+ template<typename W>
483
+ __any_device__
484
+ __lib_inline__ Point_<T> &Point_<T>::operator/=(W val) {
485
+ // TODO: This can be more efficient for float types by computing the reciprocal
486
+ X /= val;
487
+ Y /= val;
488
+ return *this;
489
+ }
490
+
491
+ template<typename T>
492
+ template<typename W>
493
+ __any_device__
494
+ __lib_inline__ Point_<T> &Point_<T>::operator*=(W val) {
495
+ X *= val;
496
+ Y *= val;
497
+ return *this;
498
+ }
499
+
500
+ template<typename T>
501
+ __any_device__
502
+ __lib_inline__ T dot(const Point_<T> &a, const Point_<T> &b) {
503
+ return a.X * b.X + a.Y * b.Y;
504
+ }
505
+
506
+ template<typename T>
507
+ __any_device__
508
+ __lib_inline__ T dot(const Point_<T> &p) {
509
+ return dot(p, p);
510
+ }
511
+
512
+ template<typename T>
513
+ __any_device__
514
+ __lib_inline__ T length(const Point_<T> &p) {
515
+ #ifndef __NVCC__
516
+ using std::sqrt;
517
+ #endif
518
+ return sqrt(dot(p));
519
+ }
520
+
521
+ template<typename T>
522
+ __any_device__
523
+ __lib_inline__ Point_<T> normalize(const Point_<T> &p) {
524
+ static constexpr T epsilon = std::numeric_limits<T>::epsilon();
525
+ auto len = length(p) + epsilon;
526
+ return { p.X / len, p.Y / len };
527
+ }
528
+
529
+ template<typename T>
530
+ __any_device__
531
+ __lib_inline__ Point_<T> ortho_2d(const Point_<T> &p) {
532
+ return { -p.Y, p.X };
533
+ }
534
+
535
+ template<typename T>
536
+ __host__
537
+ __lib_inline__ std::ostream &operator<<(std::ostream &os, const Point_<T> &p) {
538
+ return os << "(" << p.X << ", " << p.Y << ")";
539
+ }
540
+
541
+ template<typename T>
542
+ __host__
543
+ __lib_inline__ std::ostream &operator<<(std::ostream &os, const AABB_<T> &b) {
544
+ return os << "[(" << b.X << ", " << b.Y << "), (" << b.MaxX << ", " << b.MaxY << ")]";
545
+ }
546
+
547
+ template<typename T>
548
+ __host__
549
+ __lib_inline__ std::ostream &operator<<(std::ostream &os, const Segment_<T> &s) {
550
+ return os << "[(" << s.A.X << ", " << s.A.Y << "), (" << s.B.X << ", " << s.B.Y << ")]";
551
+ }
552
+
553
+ template<typename T>
554
+ __host__
555
+ __lib_inline__ std::ostream &operator<<(std::ostream &os, const Quad_<T> &q) {
556
+ os << "[" << q.Vertices[0];
557
+ for (size_t i = 1; i < 4; ++i) {
558
+ os << ", " << q.Vertices[i];
559
+ }
560
+ return os << "]";
561
+ }
562
+
563
+ template<typename T>
564
+ __any_device__
565
+ __lib_inline__ int _signum(T val) {
566
+ return (T(0) < val) - (val < T(0));
567
+ }
568
+
569
+ template<typename T>
570
+ __any_device__
571
+ __lib_inline__ T sign(const Point_<T> &p1, const Point_<T> &p2, const Point_<T> &p3) {
572
+ T ret = (p1.X - p3.X) * (p2.Y - p3.Y) - (p2.X - p3.X) * (p1.Y - p3.Y);
573
+ auto sgn = _signum(ret);
574
+ return sgn;
575
+ }
576
+
577
+ template<typename T>
578
+ __any_device__
579
+ __lib_inline__ T Segment_<T>::Length() const
580
+ {
581
+ #ifndef __NVCC__
582
+ using std::sqrt;
583
+ #endif
584
+ return sqrt(LengthSq());
585
+ }
586
+
587
+ template<typename T>
588
+ __any_device__
589
+ __lib_inline__ T Segment_<T>::LengthSq() const
590
+ {
591
+ return dot(B - A);
592
+ }
593
+
594
+ template<typename T>
595
+ __any_device__
596
+ inline bool Segment_<T>::Intersection(const Segment_<T> &other, Point_<T> &out_ptAlong) const
597
+ {
598
+ auto p1 = A, p2 = B, p3 = other.A, p4 = other.B;
599
+
600
+ auto denom = (p4.Y - p3.Y) * (p2.X - p1.X) - (p4.X - p3.X) * (p2.Y - p1.Y);
601
+
602
+ if (abs(denom) < 1e-8) {
603
+ return false;
604
+ }
605
+
606
+ auto numer = (p4.X - p3.X) * (p1.Y - p3.Y) - (p4.Y - p3.Y) * (p1.X - p3.X);
607
+
608
+ auto t = numer / denom;
609
+
610
+ auto Bnumer = (p2.X - p1.X) * (p1.Y - p3.Y) - (p2.Y - p1.Y) * (p1.X - p3.X);
611
+
612
+ auto Bt = Bnumer / denom;
613
+
614
+ if (t < 0 || t > 1 || Bt < 0 || Bt > 1) {
615
+ return false;
616
+ }
617
+
618
+ out_ptAlong = A + t * (B - A);
619
+
620
+ return true;
621
+ }
622
+
623
+ template<typename quad_t>
624
+ __any_device__
625
+ auto quad_center(const quad_t &quad) -> Point_<typename quad_t::inner_type>
626
+ {
627
+ typedef typename quad_t::inner_type T;
628
+
629
+ Point_<T> center = quad[0];
630
+ for (size_t i = 1; i < 4; ++i) {
631
+ center += quad[i];
632
+ }
633
+
634
+ return center / T{ 4 };
635
+ }
636
+
637
+ template<typename T, typename Derived>
638
+ __any_device__
639
+ Point_<T> QuadBase_<T, Derived>::Center() const {
640
+ return quad_center(static_cast<const Derived&>(*this));
641
+ }
642
+
643
+ template<typename quad_t>
644
+ __any_device__
645
+ auto quad_bounds(const quad_t &quad) -> AABB_<typename quad_t::inner_type>
646
+ {
647
+ #ifndef __NVCC__
648
+ using std::min;
649
+ using std::max;
650
+ #endif
651
+ auto minP = quad[0];
652
+ auto maxP = minP;
653
+ for (size_t i = 1; i < 4; ++i) {
654
+ auto qp = quad[i];
655
+ minP = min(minP, qp);
656
+ maxP = max(maxP, qp);
657
+ }
658
+ return { minP.X, minP.Y, maxP.X, maxP.Y };
659
+ }
660
+
661
+ template<typename T, typename Derived>
662
+ __any_device__
663
+ AABB_<T> QuadBase_<T, Derived>::Bounds() const {
664
+ return quad_bounds(static_cast<const Derived&>(*this));
665
+ }
666
+
667
+ template<typename Quad_t, typename point_t>
668
+ __any_device__
669
+ inline bool quad_contains(const Quad_t &quad, const point_t &pt)
670
+ {
671
+ #ifndef __NVCC__
672
+ using std::abs;
673
+ #endif
674
+
675
+ // Checks that the point lies on the interior side of each half plane
676
+ auto d1 = sign(pt, quad[0], quad[1]);
677
+ auto d2 = sign(pt, quad[1], quad[2]);
678
+ auto d3 = sign(pt, quad[2], quad[3]);
679
+ auto d4 = sign(pt, quad[3], quad[0]);
680
+
681
+ // bool has_neg = (d1 < 0) || (d2 < 0) || (d3 < 0) || (d4 < 0);
682
+ // bool has_pos = (d1 > 0) || (d2 > 0) || (d3 > 0) || (d4 > 0);
683
+ int tot = d1 + d2 + d3 + d4;
684
+
685
+ // return !(has_neg && has_pos);
686
+ return abs(tot) == 4;
687
+ }
688
+
689
+ template<typename T, typename Derived>
690
+ __any_device__
691
+ __lib_inline__ bool QuadBase_<T, Derived>::Contains(const Point_<T> &pt) const
692
+ {
693
+ return quad_contains(static_cast<const Derived&>(*this), pt);
694
+ }
695
+
696
+ template<typename PtList>
697
+ __any_device__
698
+ inline auto shoelace_area(const PtList &points, size_t numPts, bool isSigned=false) -> decltype(points[0].X)
699
+ {
700
+ #ifndef __NVCC__
701
+ using std::abs;
702
+ #endif
703
+
704
+ decltype(points[0].X) area = 0;
705
+
706
+ size_t j = numPts - 1;
707
+ for (size_t i = 0; i < numPts; ++i) {
708
+ auto Pi = points[i];
709
+ auto Pj = points[j];
710
+
711
+ area += (Pj.X + Pi.X) * (Pj.Y - Pi.Y);
712
+ j = i;
713
+ }
714
+
715
+ area = area / 2;
716
+
717
+ if (! isSigned) {
718
+ area = abs(area);
719
+ }
720
+
721
+ return area;
722
+ }
723
+
724
+ template<typename T, typename Derived>
725
+ __any_device__
726
+ __lib_inline__ T QuadBase_<T, Derived>::Height() const
727
+ {
728
+ auto &d = static_cast<const Derived&>(*this);
729
+ auto h1 = Segment_<T>(d[1], d[2]).Length();
730
+ auto h2 = Segment_<T>(d[3], d[0]).Length();
731
+ return (h1 + h2) / 2;
732
+ }
733
+
734
+ template<typename T, typename Derived>
735
+ __any_device__
736
+ __lib_inline__ T QuadBase_<T, Derived>::Width() const
737
+ {
738
+ auto &d = static_cast<const Derived&>(*this);
739
+ auto w1 = Segment_<T>(d[0], d[1]).Length();
740
+ auto w2 = Segment_<T>(d[3], d[2]).Length();
741
+ return (w1 + w2) / 2;
742
+ }
743
+
744
+ // A quad can be defined as the sum of the area of two triangles
745
+ template<typename T, typename Derived>
746
+ __any_device__
747
+ inline T QuadBase_<T, Derived>::Area() const
748
+ {
749
+ // auto vertices = static_cast<const Derived *>(this)->Vertices;
750
+ return shoelace_area(static_cast<const Derived&>(*this), 4);
751
+ }
752
+
753
+ template<typename Quad_t1, typename Quad_t2>
754
+ __any_device__
755
+ inline auto intersection_area(const Quad_t1 &quadsA, const Quad_t2 &quadsB) -> typename Quad_t1::inner_type
756
+ {
757
+ #ifndef __NVCC__
758
+ using std::atan2;
759
+ #endif
760
+
761
+ typedef typename Quad_t1::inner_type T;
762
+
763
+ static const size_t MAX_PTS = 32;
764
+
765
+ Point_<T> points[MAX_PTS], sortedPoints[MAX_PTS];
766
+ T angles[MAX_PTS];
767
+ size_t indices[MAX_PTS];
768
+ size_t numPts = 0;
769
+
770
+ auto addPt = [&] (const Point_<T> &p) {
771
+ points[numPts] = p;
772
+ ++numPts;
773
+ };
774
+
775
+ for (size_t i = 0; i < 4; ++i) {
776
+ Point_<T> aPt = quadsA[i];
777
+ Point_<T> bPt = quadsB[i];
778
+
779
+ if (quadsA.Contains(bPt)) {
780
+ addPt(bPt);
781
+ }
782
+ if (quadsB.Contains(aPt)) {
783
+ addPt(aPt);
784
+ }
785
+ }
786
+
787
+ for (size_t i = 0; i < 4; ++i) {
788
+ Segment_<T> segA{ quadsA[i], quadsA[(i + 1) % 4] };
789
+
790
+ for (size_t j = 0; j < 4; ++j) {
791
+ Segment_<T> segB{ quadsB[j], quadsB[(j + 1) % 4] };
792
+
793
+ Point_<T> ptAlong;
794
+ if (segA.Intersection(segB, ptAlong)) {
795
+ addPt(ptAlong);
796
+ }
797
+ }
798
+ }
799
+
800
+ if (numPts == 0) {
801
+ return 0;
802
+ }
803
+
804
+ Point_<T> center{ 0, 0 };
805
+ for (size_t i = 0; i < numPts; ++i) {
806
+ center += points[i];
807
+ }
808
+ center /= numPts;
809
+
810
+ for (size_t i = 0; i < numPts; ++i) {
811
+ points[i] -= center;
812
+
813
+ angles[i] = atan2(points[i].Y, points[i].X);
814
+
815
+ indices[i] = i;
816
+ }
817
+
818
+ // Perform an argsort over the angles
819
+ SORT_ALGO(indices, indices + numPts,
820
+ [&] (size_t a, size_t b) {
821
+ return angles[a] < angles[b];
822
+ }
823
+ );
824
+
825
+ for (size_t i = 0; i < numPts; ++i) {
826
+ sortedPoints[i] = points[indices[i]];
827
+ }
828
+
829
+ // Finally, we can compute the area of this polygon using the shoelace formula
830
+ T area = shoelace_area(sortedPoints, numPts);
831
+
832
+ return area;
833
+ }
834
+
835
+ template<typename T, typename Derived>
836
+ template<typename Derived2>
837
+ __any_device__
838
+ __lib_inline__ T QuadBase_<T, Derived>::IntersectionArea(const QuadBase_<T, Derived2> &other) const
839
+ {
840
+ return intersection_area(
841
+ static_cast<const Derived&>(*this),
842
+ static_cast<const Derived2&>(other)
843
+ );
844
+ }
845
+
846
+ template<typename T1, typename T2>
847
+ __any_device__
848
+ __lib_inline__ auto geometry_iou(const T1 &a, const T2 &b) -> decltype(a.Area())
849
+ {
850
+ auto aArea = a.Area();
851
+ auto bArea = b.Area();
852
+ auto ixArea = a.IntersectionArea(b);
853
+
854
+ auto unionArea = aArea + bArea - ixArea;
855
+
856
+ return ixArea / unionArea;
857
+ }
858
+
859
+ template<typename T, typename Derived>
860
+ template<typename Derived2>
861
+ __any_device__
862
+ __lib_inline__ T QuadBase_<T, Derived>::IOU(const QuadBase_<T, Derived2> &other) const
863
+ {
864
+ return geometry_iou(
865
+ static_cast<const Derived&>(*this),
866
+ static_cast<const Derived2&>(other)
867
+ );
868
+ }
869
+
870
+ template<typename T, typename Derived>
871
+ template<typename Derived2>
872
+ __any_device__
873
+ __lib_inline__ T QuadBase_<T, Derived>::IOU_UpperBound(const QuadBase_<T, Derived2> &other) const
874
+ {
875
+ return geometry_iou(
876
+ Bounds(),
877
+ other.Bounds()
878
+ );
879
+ }
880
+
881
+ template<typename T1, typename T2>
882
+ __any_device__ __lib_inline__
883
+ auto geometry_region_sizes(const T1 &a, const T2 &b) -> tuple_t<decltype(a.Area()), decltype(a.Area()), decltype(a.IntersectionArea(b))>
884
+ {
885
+ auto aArea = a.Area();
886
+ auto bArea = b.Area();
887
+ auto ixArea = a.IntersectionArea(b);
888
+
889
+ auto unionArea = aArea + bArea - ixArea;
890
+ auto iou = ixArea / unionArea;
891
+
892
+ return { ixArea / aArea, ixArea / bArea, iou };
893
+ }
894
+
895
+
896
+ template<typename T, typename Derived>
897
+ template<typename Derived2>
898
+ __any_device__ __lib_inline__
899
+ tuple_t<T, T, T> QuadBase_<T, Derived>::RegionSizes(const QuadBase_<T, Derived2> &other) const
900
+ {
901
+ return geometry_region_sizes(
902
+ static_cast<const Derived&>(*this),
903
+ static_cast<const Derived2&>(other)
904
+ );
905
+ }
906
+
907
+ template<typename T, typename Derived>
908
+ template<typename Derived2>
909
+ __any_device__ __lib_inline__
910
+ tuple_t<T, T, T> QuadBase_<T, Derived>::RegionSizes_UpperBound(const QuadBase_<T, Derived2> &other) const
911
+ {
912
+ return geometry_region_sizes(
913
+ Bounds(),
914
+ other.Bounds()
915
+ );
916
+ }
917
+
918
+ template<typename polygon_t>
919
+ __any_device__
920
+ auto polygon_bounds(const polygon_t &poly) -> AABB_<typename polygon_t::inner_type>
921
+ {
922
+ #ifndef __NVCC__
923
+ using std::min;
924
+ using std::max;
925
+ #endif
926
+ auto minP = poly[0];
927
+ auto maxP = minP;
928
+ for (size_t i = 1; i < poly.Count; ++i) {
929
+ auto qp = poly[i];
930
+ minP = min(minP, qp);
931
+ maxP = max(maxP, qp);
932
+ }
933
+ return { minP.X, minP.Y, maxP.X, maxP.Y };
934
+ }
935
+
936
+ template<typename T, typename Derived>
937
+ __any_device__
938
+ AABB_<T> PolygonBase_<T, Derived>::Bounds() const {
939
+ return polygon_bounds(static_cast<const Derived&>(*this));
940
+ }
941
+
942
+ template<typename polygon_t, typename point_t>
943
+ __any_device__
944
+ bool polygon_contains(const polygon_t &poly, const point_t &pt)
945
+ {
946
+ typedef typename polygon_t::inner_type T;
947
+
948
+ // Some arbitrary segment. Technically this should be a ray, but functionally this will work
949
+ Segment_<T> testSeg{ pt, { -1e6, -2e6 }};
950
+ Point_<T> trash;
951
+
952
+ int32_t ixCount = 0;
953
+ for (size_t i = 0; i < poly.Count; ++i) {
954
+ Segment_<T> polySeg{ poly[i], poly[(i + 1) % poly.Count] };
955
+
956
+ if (testSeg.Intersection(polySeg, trash)) {
957
+ ++ixCount;
958
+ }
959
+ }
960
+
961
+ // If there are an odd number of intersections, then the point is inside
962
+ return (ixCount % 2) == 1;
963
+ }
964
+
965
+ template<typename T, typename Derived>
966
+ __any_device__
967
+ bool PolygonBase_<T, Derived>::Contains(const Point_<T> &pt) const {
968
+ return polygon_contains(static_cast<const Derived&>(*this), pt);
969
+ }
970
+
971
+ template<typename polygon_t>
972
+ __any_device__
973
+ auto polygon_edge_length(const polygon_t &poly) -> typename polygon_t::inner_type
974
+ {
975
+ typedef typename polygon_t::inner_type T;
976
+
977
+ T ret = 0;
978
+
979
+ for (size_t i = 0; i < poly.Count; ++i) {
980
+ Segment_<T> seg{ poly[i], poly[(i + 1) % poly.Count] };
981
+
982
+ ret += seg.Length();
983
+ }
984
+
985
+ return ret;
986
+ }
987
+
988
+ template<typename T, typename Derived>
989
+ __any_device__
990
+ T PolygonBase_<T, Derived>::EdgeLength() const {
991
+ return polygon_edge_length(static_cast<const Derived&>(*this));
992
+ }
993
+
994
+ template<typename polygon_t>
995
+ __any_device__
996
+ auto polygon_center(const polygon_t &poly) -> Point_<typename polygon_t::inner_type>
997
+ {
998
+ typedef typename polygon_t::inner_type T;
999
+
1000
+ T cx = 0, cy = 0, a = 0;
1001
+ size_t j = poly.Count - 1;
1002
+ for (size_t i = 0; i < poly.Count; ++i) {
1003
+ Point_<T> p0 = poly[i];
1004
+ Point_<T> p1 = poly[j];
1005
+
1006
+ T common = (p0.X * p1.Y - p1.X * p0.Y);
1007
+ cx += (p0.X + p1.X) * common;
1008
+ cy += (p0.Y + p1.Y) * common;
1009
+ a += common;
1010
+
1011
+ j = i;
1012
+ }
1013
+
1014
+ a /= 2;
1015
+
1016
+ Point_<T> center{ cx / (6 * a), cy / (6 * a) };
1017
+
1018
+ return center;
1019
+ }
1020
+
1021
+ template<typename T, typename Derived>
1022
+ __any_device__
1023
+ Point_<T> PolygonBase_<T, Derived>::Center() const {
1024
+ return polygon_center(static_cast<const Derived&>(*this));
1025
+ }
1026
+
1027
+ template<typename T, typename Derived>
1028
+ __any_device__
1029
+ T PolygonBase_<T, Derived>::Area() const {
1030
+ const Derived &dThis = static_cast<const Derived&>(*this);
1031
+ return shoelace_area(dThis, dThis.Count);
1032
+ }
1033
+
1034
+
1035
+ template<typename T>
1036
+ __any_device__
1037
+ Point_<T> nearest_point_on_segment(const Point_<T> &pt, const Segment_<T> &seg)
1038
+ {
1039
+ #ifndef __NVCC__
1040
+ using std::max;
1041
+ using std::min;
1042
+ #endif
1043
+
1044
+ const T l2 = seg.LengthSq();
1045
+
1046
+ if (l2 == 0.0) {
1047
+ return seg.A;
1048
+ }
1049
+
1050
+ const auto v = seg.A;
1051
+ const auto w = seg.B;
1052
+ // Consider the line extending the segment, parameterized as v + t*(w-v)
1053
+ // Find projection of point p onto the line
1054
+ auto t = dot(pt - v, w - v) / l2;
1055
+
1056
+ // Clamp between t=0 and t=1
1057
+ t = max(static_cast<T>(0), min(static_cast<T>(1), t));
1058
+
1059
+ const auto projection = v + t * (w - v);
1060
+
1061
+ return projection;
1062
+ }
1063
+
1064
+
1065
+ template<typename T>
1066
+ __any_device__
1067
+ Segment_<T> shortest_line_between_segments(const Segment_<T> &a, const Segment_<T> &b)
1068
+ {
1069
+ Segment_<T> segs[] = {
1070
+ { a.A, nearest_point_on_segment(a.A, b) },
1071
+ { a.B, nearest_point_on_segment(a.B, b) },
1072
+ { nearest_point_on_segment(b.A, a), b.A },
1073
+ { nearest_point_on_segment(b.B, a), b.B }
1074
+ };
1075
+
1076
+ T minDist = std::numeric_limits<T>::max();
1077
+ size_t idx = 0; // default to the first candidate; also avoids an uninitialized-use warning
1078
+
1079
+ #pragma unroll
1080
+ for (size_t i = 0; i < 4; ++i) {
1081
+ T dist = segs[i].LengthSq();
1082
+ if (dist < minDist) {
1083
+ minDist = dist;
1084
+ idx = i;
1085
+ }
1086
+ }
1087
+
1088
+ return segs[idx];
1089
+ }
1090
+
1091
+ // Find the distance between a point and the nearest point along the specified segment
1092
+ template<typename T>
1093
+ __any_device__
1094
+ T distance_to_segment(const Point_<T> &pt, const Segment_<T> &seg)
1095
+ {
1096
+ auto projection = nearest_point_on_segment(pt, seg);
1097
+
1098
+ auto dist = length(pt - projection);
1099
+
1100
+ return dist;
1101
+ }
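The intersection routine above boils down to two steps: order the collected candidate points by angle around their centroid, then measure the resulting polygon with the shoelace formula (the IOU helpers then reduce to ixArea / (aArea + bArea - ixArea)). Below is a minimal standalone sketch of that angle-sort plus shoelace step in plain std C++; the Pt struct and function names are illustrative only and are not the repo's types.

// Standalone sketch of the angle-sort + shoelace step used by intersection_area above.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

struct Pt { double x, y; };

// Shoelace formula over an already ordered (CW or CCW) polygon.
double shoelace_area(const std::vector<Pt> &poly) {
    double acc = 0.0;
    for (size_t i = 0; i < poly.size(); ++i) {
        const Pt &a = poly[i];
        const Pt &b = poly[(i + 1) % poly.size()];
        acc += a.x * b.y - b.x * a.y;
    }
    return std::fabs(acc) / 2.0;
}

// Order an unsorted convex point set by angle around its centroid, then measure its area.
double area_of_unordered_convex_points(std::vector<Pt> pts) {
    if (pts.empty()) return 0.0;
    Pt c{0.0, 0.0};
    for (const Pt &p : pts) { c.x += p.x; c.y += p.y; }
    c.x /= pts.size(); c.y /= pts.size();
    std::sort(pts.begin(), pts.end(), [&](const Pt &a, const Pt &b) {
        return std::atan2(a.y - c.y, a.x - c.x) < std::atan2(b.y - c.y, b.x - c.x);
    });
    return shoelace_area(pts);
}

int main() {
    // Unit square given in scrambled order: the area should come out as 1.
    std::vector<Pt> square{{0, 0}, {1, 1}, {1, 0}, {0, 1}};
    std::printf("area = %f\n", area_of_unordered_convex_points(square));
}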
nemo-retriever-ocr/cpp/geometry_api/calc_poly_min_rrect.cpp ADDED
@@ -0,0 +1,165 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #include "geometry_api.h"
6
+
7
+ #include "../graph_detection/encode_util.h"
8
+
9
+ #include "../geometry.h"
10
+ #include "matrix2x2.h"
11
+
12
+ using namespace std;
13
+
14
+ template<typename T>
15
+ void _calc_poly_min_rrect(const torch::TensorAccessor<T, 2> vertices, torch::TensorAccessor<T, 2> outRRect);
16
+ template<typename T>
17
+ void _calc_quad_min_rrect(const torch::TensorAccessor<T, 2> vertices, torch::TensorAccessor<T, 2> outRRect);
18
+
19
+ torch::Tensor calc_poly_min_rrect(torch::Tensor vertices)
20
+ {
21
+ if (vertices.size(0) < 3) {
22
+ throw runtime_error("Invalid polygon! Expected >= 3 vertices, got " + to_string(vertices.size(0)));
23
+ }
24
+
25
+ auto ret = torch::empty({ 4, 2 }, vertices.options());
26
+
27
+ auto retAcc = ret.accessor<float, 2>();
28
+
29
+ if (vertices.size(0) != 4) {
30
+ // OpenCV requires this to be a contiguous buffer
31
+ vertices = vertices.contiguous();
32
+ _calc_poly_min_rrect(vertices.accessor<float, 2>(), retAcc);
33
+ } else {
34
+ _calc_quad_min_rrect(vertices.accessor<float, 2>(), retAcc);
35
+ }
36
+
37
+ return ret;
38
+ }
39
+
40
+
41
+ template<typename T>
42
+ void _calc_bounds(const torch::TensorAccessor<T, 2> &vertices, torch::TensorAccessor<T, 2> &outRRect,
43
+ const Point_<T> &leftCenter, const Point_<T> &rightCenter)
44
+ {
45
+ typedef Point_<T> Pointf;
46
+
47
+ Pointf vecAlong = rightCenter - leftCenter;
48
+ auto alongMag = length(vecAlong);
49
+
50
+ if (alongMag == 0.0f) {
51
+ throw runtime_error("Invalid polygon!");
52
+ }
53
+
54
+ vecAlong /= alongMag;
55
+
56
+ Pointf dOrtho{ -vecAlong.Y, vecAlong.X };
57
+
58
+ Pointf center = (leftCenter + rightCenter) / 2.0f;
59
+
60
+ Matrix2x2<T> rotMat{ vecAlong, dOrtho };
61
+
62
+ auto get_fn = [&vertices, &center] (int64_t i) {
63
+ return Pointf{ vertices[i] } - center;
64
+ };
65
+
66
+ // All we care about is getting the bounds in the normalized space, so this saves
67
+ // us from having to do any memory allocation
68
+ Pointf minPt{ 0, 0 }, maxPt{ 0, 0 };
69
+ auto tx_fn = [&minPt, &maxPt] (int64_t i, const Pointf &pt) {
70
+ minPt = min(minPt, pt);
71
+ maxPt = max(maxPt, pt);
72
+ };
73
+
74
+ matmul_fn(vertices.size(0), get_fn, rotMat, tx_fn, transpose_tag{});
75
+
76
+ Pointf rotBox[4] = {
77
+ minPt,
78
+ { maxPt.X, minPt.Y },
79
+ maxPt,
80
+ { minPt.X, maxPt.Y }
81
+ };
82
+
83
+ auto get_fn2 = [&rotBox] (int64_t i) {
84
+ return rotBox[i];
85
+ };
86
+
87
+ auto assign_fn = [&center, &outRRect] (int64_t i, const Pointf &pt) {
88
+ outRRect[i][0] = pt.X + center.X;
89
+ outRRect[i][1] = pt.Y + center.Y;
90
+ };
91
+
92
+ matmul_fn(4, get_fn2, rotMat, assign_fn, contiguous_tag{});
93
+ }
94
+
95
+
96
+ template<typename T>
97
+ void _calc_poly_min_rrect(const torch::TensorAccessor<T, 2> vertices, torch::TensorAccessor<T, 2> outRRect)
98
+ {
99
+ typedef Point_<T> Pointf;
100
+ typedef Polygon_<T> Polygonf;
101
+
102
+ Polygonf poly{ vertices.data(), vertices.size(0) };
103
+
104
+ vector<graph_detection::Edge> bottoms = graph_detection::find_bottom(poly, false);
105
+
106
+ if (bottoms.size() != 2) {
107
+ throw runtime_error("Invalid polygon!");
108
+ }
109
+
110
+ vector<graph_detection::Edge> longEdges[2];
111
+ graph_detection::find_long_edges(poly, bottoms.data(), longEdges[0], longEdges[1]);
112
+
113
+ ////
114
+ // Determine which edge is above the other
115
+ Pointf cpts[2];
116
+ for (size_t i = 0; i < 2; ++i) {
117
+ auto &pedge = longEdges[i];
118
+
119
+ cpts[i] = Pointf{0.0f, 0.0f};
120
+ float ct = 0;
121
+ for (size_t z = 0; z < pedge.size(); ++z) {
122
+ auto edge = pedge[z];
123
+ Pointf p1 = poly[edge.A];
124
+ Pointf p2 = poly[edge.B];
125
+ cpts[i] += (p1 + p2) / 2.0f;
126
+ ct += 1.0f;
127
+ }
128
+
129
+ if (ct < 1.0f) {
130
+ throw runtime_error("Edge was empty!");
131
+ }
132
+ cpts[i] /= ct;
133
+ }
134
+
135
+ float vpp = graph_detection::vector_sin(cpts[0] - cpts[1]);
136
+ if (vpp >= 0) {
137
+ swap(bottoms[0], bottoms[1]);
138
+ }
139
+ ////
140
+
141
+ Pointf edge1[2] = { poly[bottoms[0].A], poly[bottoms[0].B] };
142
+ Pointf edge2[2] = { poly[bottoms[1].A], poly[bottoms[1].B] };
143
+
144
+ Pointf c0 = (edge1[0] + edge1[1]) / 2.0f;
145
+ Pointf c1 = (edge2[0] + edge2[1]) / 2.0f;
146
+
147
+ _calc_bounds(vertices, outRRect, c0, c1);
148
+ }
149
+
150
+ template<typename T>
151
+ void _calc_quad_min_rrect(const torch::TensorAccessor<T, 2> vertices, torch::TensorAccessor<T, 2> outRRect)
152
+ {
153
+ typedef Point_<T> Pointf;
154
+
155
+ // Instead of finding an arbitrary rotated box, find a reasonable
156
+ // fit for the quadrangle
157
+ Pointf pts[4] = {
158
+ vertices[0], vertices[1], vertices[2], vertices[3]
159
+ };
160
+
161
+ Pointf c0 = (pts[0] + pts[3]) / 2.0f;
162
+ Pointf c1 = (pts[1] + pts[2]) / 2.0f;
163
+
164
+ _calc_bounds(vertices, outRRect, c0, c1);
165
+ }
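For reference, here is a plain std C++ sketch of the bounding step that _calc_bounds performs: the box axis is the unit vector between the two edge centers, every vertex is projected into that rotated frame, the axis-aligned min/max is taken there, and the four corners are rotated back out. The names below are illustrative only; this is not the repo's Matrix2x2 path, just the same arithmetic spelled out.

// Illustrative sketch of the rotated-bounds step used by _calc_bounds above.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

struct Pt { float x, y; };

std::vector<Pt> rotated_bounds(const std::vector<Pt> &verts, Pt leftCenter, Pt rightCenter) {
    // Unit vector along the box, and its orthogonal
    float ax = rightCenter.x - leftCenter.x, ay = rightCenter.y - leftCenter.y;
    const float mag = std::sqrt(ax * ax + ay * ay);
    ax /= mag; ay /= mag;
    const float ox = -ay, oy = ax;
    const Pt center{(leftCenter.x + rightCenter.x) / 2.0f, (leftCenter.y + rightCenter.y) / 2.0f};

    // Project every vertex into the (along, ortho) frame and track the bounds.
    // Starting from 0 implicitly assumes the frame origin lies inside the box, as the code above also does.
    float minX = 0, minY = 0, maxX = 0, maxY = 0;
    for (const Pt &v : verts) {
        const float dx = v.x - center.x, dy = v.y - center.y;
        const float px = dx * ax + dy * ay;   // coordinate along the box axis
        const float py = dx * ox + dy * oy;   // coordinate across the box axis
        minX = std::min(minX, px); maxX = std::max(maxX, px);
        minY = std::min(minY, py); maxY = std::max(maxY, py);
    }

    // Rotate the axis-aligned corners back into image space
    auto back = [&](float px, float py) {
        return Pt{center.x + px * ax + py * ox, center.y + px * ay + py * oy};
    };
    return { back(minX, minY), back(maxX, minY), back(maxX, maxY), back(minX, maxY) };
}

int main() {
    // A 4x2 axis-aligned box: left/right edge centers at (0,1) and (4,1); output is the box itself
    std::vector<Pt> quad{ {0, 0}, {4, 0}, {4, 2}, {0, 2} };
    for (const Pt &p : rotated_bounds(quad, {0, 1}, {4, 1}))
        std::printf("(%g, %g)\n", p.x, p.y);
}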
nemo-retriever-ocr/cpp/geometry_api/geometry_api.cpp ADDED
@@ -0,0 +1,101 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #include "geometry_api.h"
6
+
7
+ #include "geometry_api_common.h"
8
+
9
+ using namespace std;
10
+
11
+ torch::Tensor rrect_to_quads_gpu(torch::Tensor rrects, float cellSize);
12
+
13
+ template<typename T>
14
+ torch::Tensor rrect_to_quads_impl(torch::Tensor rrects, T cellSize)
15
+ {
16
+ // BHW(5)
17
+ auto rrectAccess = rrects.accessor<T, 4>();
18
+
19
+ T cellOff = cellSize / 2;
20
+
21
+ auto quads = torch::empty({ rrects.size(0), rrects.size(1), rrects.size(2), 4, 2 }, rrects.options());
22
+
23
+ auto quadsAccess = quads.accessor<T, 5>();
24
+
25
+ for (long b = 0; b < rrects.size(0); ++b) {
26
+ for (long y = 0; y < rrects.size(1); ++y) {
27
+ for (long x = 0; x < rrects.size(2); ++x) {
28
+ auto rrect = rrectAccess[b][y][x];
29
+
30
+ auto quad = quadsAccess[b][y][x];
31
+
32
+ assign_rrect_to_quad(rrect, quad, cellSize, cellOff,
33
+ static_cast<T>(x),
34
+ static_cast<T>(y));
35
+ }
36
+ }
37
+ }
38
+
39
+ return quads;
40
+ }
41
+
42
+ torch::Tensor rrect_to_quads(torch::Tensor rrects, float cellSize)
43
+ {
44
+ if (rrects.is_cuda()) {
45
+ return rrect_to_quads_gpu(rrects, cellSize);
46
+ }
47
+
48
+ torch::Tensor quads;
49
+ AT_DISPATCH_FLOATING_TYPES(
50
+ rrects.scalar_type(),
51
+ "rrect_to_quads_impl",
52
+ ([&] {
53
+ quads = rrect_to_quads_impl<scalar_t>(rrects, scalar_t(cellSize));
54
+ })
55
+ );
56
+
57
+ return quads;
58
+ }
59
+
60
+
61
+ template<typename T>
62
+ torch::Tensor rrect_to_quads_backward_impl(torch::Tensor rrects, torch::Tensor gradOutput)
63
+ {
64
+ // BHW(5)
65
+ auto gradInput = torch::empty_like(rrects);
66
+
67
+ auto rrectAccess = rrects.accessor<T, 4>();
68
+ // BHW42
69
+ auto gradOutputAccess = gradOutput.accessor<T, 5>();
70
+ auto gradInputAccess = gradInput.accessor<T, 4>();
71
+
72
+ for (long b = 0; b < rrects.size(0); ++b) {
73
+ for (long y = 0; y < rrects.size(1); ++y) {
74
+ for (long x = 0; x < rrects.size(2); ++x) {
75
+ assign_grad_rrect_to_quad<T>(rrectAccess[b][y][x], gradOutputAccess[b][y][x], gradInputAccess[b][y][x]);
76
+ }
77
+ }
78
+ }
79
+
80
+ return gradInput;
81
+ }
82
+
83
+ torch::Tensor rrect_to_quads_backward_gpu(torch::Tensor rrects, torch::Tensor gradOutput);
84
+
85
+ torch::Tensor rrect_to_quads_backward(torch::Tensor rrects, torch::Tensor gradOutput)
86
+ {
87
+ if (rrects.is_cuda()) {
88
+ return rrect_to_quads_backward_gpu(rrects, gradOutput);
89
+ }
90
+
91
+ torch::Tensor gradInput;
92
+ AT_DISPATCH_FLOATING_TYPES(
93
+ rrects.scalar_type(),
94
+ "rrect_to_quads_backward_impl",
95
+ ([&] {
96
+ gradInput = rrect_to_quads_backward_impl<scalar_t>(rrects, gradOutput);
97
+ })
98
+ );
99
+
100
+ return gradInput;
101
+ }
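The CPU path above follows a common libtorch extension pattern: AT_DISPATCH_FLOATING_TYPES picks scalar_t at runtime and a typed TensorAccessor walks the tensor without raw pointer arithmetic. A minimal sketch of that pattern, assuming libtorch is available; scale_points is a made-up example op used only to illustrate the dispatch, not part of this extension.

// Minimal sketch of the dispatch + accessor pattern used by rrect_to_quads above (assumes libtorch).
#include <torch/torch.h>

template<typename T>
torch::Tensor scale_points_impl(torch::Tensor pts, T factor) {
    auto out = torch::empty_like(pts);
    auto in = pts.accessor<T, 2>();   // (N, 2)
    auto o = out.accessor<T, 2>();
    for (int64_t i = 0; i < pts.size(0); ++i) {
        o[i][0] = in[i][0] * factor;
        o[i][1] = in[i][1] * factor;
    }
    return out;
}

torch::Tensor scale_points(torch::Tensor pts, double factor) {
    torch::Tensor out;
    AT_DISPATCH_FLOATING_TYPES(pts.scalar_type(), "scale_points_impl", ([&] {
        out = scale_points_impl<scalar_t>(pts, static_cast<scalar_t>(factor));
    }));
    return out;
}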
nemo-retriever-ocr/cpp/geometry_api/geometry_api.h ADDED
@@ -0,0 +1,16 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #pragma once
6
+
7
+ #include <torch/torch.h>
8
+
9
+ torch::Tensor rrect_to_quads(torch::Tensor rrects, float cellSize);
10
+ torch::Tensor rrect_to_quads_backward(torch::Tensor rrects, torch::Tensor gradOutput);
11
+
12
+ torch::Tensor calc_poly_min_rrect(torch::Tensor vertices);
13
+
14
+ float get_rel_continuation_cos(torch::Tensor rrectA, torch::Tensor rrectB);
15
+
16
+ torch::Tensor get_poly_bounds_quad(torch::Tensor poly);
nemo-retriever-ocr/cpp/geometry_api/geometry_api_common.h ADDED
@@ -0,0 +1,121 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #pragma once
6
+
7
+ #include <torch/torch.h>
8
+
9
+ #include "../cuda_intellisense.cuh"
10
+ #include "../geometry.h"
11
+
12
+ #if defined(__NVCC__)
13
+ #include <math_constants.h>
14
+ #define GEO_PI CUDART_PI_F
15
+ #else
16
+ #include <math.h>
17
+ #define GEO_PI M_PI
18
+ #endif
19
+
20
+
21
+ template<typename access_t, typename point_t>
22
+ __device__
23
+ inline
24
+ void pt_assign(access_t acc, const point_t &p) {
25
+ acc[0] = p.X;
26
+ acc[1] = p.Y;
27
+ }
28
+
29
+ template<typename T, typename rrect_access_t>
30
+ __device__ __lib_inline__
31
+ InPlaceQuad_<T> cvt_rrect_to_quad(const rrect_access_t &rrect, T cellSize, T cellOff, T x, T y)
32
+ {
33
+ typedef Point_<T> Pointf;
34
+
35
+ Pointf prior{
36
+ x * cellSize + cellOff,
37
+ y * cellSize + cellOff
38
+ };
39
+
40
+ T dTop = rrect[0];
41
+ T dRight = rrect[1];
42
+ T dBottom = rrect[2];
43
+ T dLeft = rrect[3];
44
+ T theta = rrect[4];
45
+
46
+ T piOver2{GEO_PI / 2.0f};
47
+ Pointf vX{ cos(theta), sin(theta) };
48
+ Pointf vY{ cos(theta - piOver2), sin(theta - piOver2) };
49
+
50
+ InPlaceQuad_<T> ret;
51
+
52
+ ret[0] = prior - vX * dLeft + vY * dTop;
53
+ ret[1] = prior + vX * dRight + vY * dTop;
54
+ ret[2] = prior + vX * dRight - vY * dBottom;
55
+ ret[3] = prior - vX * dLeft - vY * dBottom;
56
+
57
+ return ret;
58
+ }
59
+
60
+ template<typename rrect_access_t, typename quad_access_t, typename T>
61
+ __device__ __lib_inline__
62
+ void assign_rrect_to_quad(const rrect_access_t &rrect, quad_access_t &quad,
63
+ T cellSize, T cellOff, T x, T y)
64
+ {
65
+ const InPlaceQuad_<T> cvQuad = cvt_rrect_to_quad<T>(rrect, cellSize, cellOff, x, y);
66
+
67
+ const T *pInQuad = reinterpret_cast<const T*>(&cvQuad);
68
+ T *pOutQuad = reinterpret_cast<T*>(quad.data());
69
+
70
+ #pragma unroll
71
+ for (uint32_t i = 0; i < 8; ++i) {
72
+ pOutQuad[i] = pInQuad[i];
73
+ }
74
+ }
75
+
76
+ template<typename T, typename rrect_access_t, typename quad_access_t>
77
+ __device__
78
+ inline
79
+ void assign_grad_rrect_to_quad(const rrect_access_t &rrect,
80
+ const quad_access_t &gradOutput,
81
+ rrect_access_t gradInput)
82
+ {
83
+ typedef Point_<T> Pointf;
84
+
85
+ T Top = rrect[0];
86
+ T Right = rrect[1];
87
+ T Bottom = rrect[2];
88
+ T Left = rrect[3];
89
+ T theta = rrect[4];
90
+
91
+ T piOver2{GEO_PI / 2.0f};
92
+ Pointf vX{ cos(theta), sin(theta) };
93
+ Pointf vY{ cos(theta - piOver2), sin(theta - piOver2) };
94
+
95
+ Pointf dVX{ -vX.Y, vX.X };
96
+ Pointf dVY{ -vY.Y, vY.X };
97
+
98
+ Pointf gP0 = gradOutput[0],
99
+ gP1 = gradOutput[1],
100
+ gP2 = gradOutput[2],
101
+ gP3 = gradOutput[3];
102
+
103
+ // Top
104
+ gradInput[0] = (gP0 * vY + gP1 * vY).Sum();
105
+ // Right
106
+ gradInput[1] = (gP1 * vX + gP2 * vX).Sum();
107
+ // Bottom
108
+ gradInput[2] = -(gP2 * vY + gP3 * vY).Sum();
109
+ // Left
110
+ gradInput[3] = -(gP0 * vX + gP3 * vX).Sum();
111
+
112
+ // Theta
113
+ gradInput[4] = (
114
+ gP0 * (-Left * dVX + Top * dVY) +
115
+ gP1 * (Right * dVX + Top * dVY) +
116
+ gP2 * (Right * dVX - Bottom * dVY) +
117
+ gP3 * (-Left * dVX - Bottom * dVY)
118
+ ).Sum();
119
+ }
120
+
121
+ #undef GEO_PI
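To make the rrect encoding concrete: each cell predicts distances (top, right, bottom, left) from a fixed per-cell prior point plus an angle theta, and cvt_rrect_to_quad walks those distances along the rotated axes to recover the four corners. The sketch below restates that arithmetic in plain std C++; the struct and function names are illustrative only.

// Plain C++ restatement of the rrect -> quad arithmetic from cvt_rrect_to_quad above.
#include <cmath>
#include <cstdio>

struct P { float x, y; };

void rrect_to_quad(const float rrect[5], float cellSize, int col, int row, P out[4]) {
    const float kPiOver2 = 1.57079632679f;
    const float cellOff = cellSize / 2.0f;
    const P prior{ col * cellSize + cellOff, row * cellSize + cellOff };   // fixed per-cell anchor

    const float t = rrect[0], r = rrect[1], b = rrect[2], l = rrect[3], theta = rrect[4];
    const P vx{ std::cos(theta), std::sin(theta) };                        // rotated x axis
    const P vy{ std::cos(theta - kPiOver2), std::sin(theta - kPiOver2) };  // rotated y axis

    // Same corner ordering as cvt_rrect_to_quad above
    out[0] = { prior.x - vx.x * l + vy.x * t, prior.y - vx.y * l + vy.y * t };
    out[1] = { prior.x + vx.x * r + vy.x * t, prior.y + vx.y * r + vy.y * t };
    out[2] = { prior.x + vx.x * r - vy.x * b, prior.y + vx.y * r - vy.y * b };
    out[3] = { prior.x - vx.x * l - vy.x * b, prior.y - vx.y * l - vy.y * b };
}

int main() {
    const float rrect[5] = { 1.0f, 2.0f, 1.0f, 2.0f, 0.0f };  // 4x2 box, no rotation
    P quad[4];
    rrect_to_quad(rrect, 4.0f, 0, 0, quad);                   // cell (0, 0) with a 4-pixel cell size
    for (const P &p : quad) std::printf("(%g, %g)\n", p.x, p.y);
}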
nemo-retriever-ocr/cpp/geometry_api/geometry_api_gpu.cu ADDED
@@ -0,0 +1,142 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #include "geometry_api.h"
6
+
7
+ #include "../geometry.h"
8
+ #include "../cuda_intellisense.cuh"
9
+ #include "geometry_api_common.h"
10
+
11
+ #include <trove/ptr.h>
12
+
13
+ using namespace std;
14
+
15
+
16
+ template<typename T>
17
+ struct RRect_ {
18
+ T Data[5];
19
+
20
+ template<typename index_t>
21
+ __device__
22
+ const T &operator[](index_t i) const { return Data[i]; }
23
+ template<typename index_t>
24
+ __device__
25
+ T &operator[](index_t i) { return Data[i]; }
26
+ };
27
+
28
+ template<typename T>
29
+ __global__
30
+ void device_rrect_to_quads_gpu(torch::PackedTensorAccessor64<T, 2> rrectAccess,
31
+ torch::PackedTensorAccessor64<T, 3> quadsAccess,
32
+ int64_t numRows, int64_t numCols,
33
+ T cellSize)
34
+ {
35
+ typedef Point_<T> Pointf;
36
+ typedef RRect_<T> RRectf;
37
+ typedef InPlaceQuad_<T> Quadf;
38
+ constexpr T TWO = 2;
39
+
40
+ const int64_t jobIdx = blockIdx.x * blockDim.x + threadIdx.x;
41
+
42
+ if (jobIdx >= rrectAccess.size(0)) {
43
+ return;
44
+ }
45
+
46
+ int64_t row = jobIdx / numCols;
47
+ const int64_t col = jobIdx - (row * numCols);
48
+ row = row % numRows;
49
+
50
+ auto rawRRect = reinterpret_cast<RRectf*>(rrectAccess.data());
51
+ auto rawQuad = reinterpret_cast<Quadf*>(quadsAccess.data());
52
+ #if defined(NDEBUG)
53
+ trove::coalesced_ptr<RRectf> pRRect(rawRRect);
54
+ trove::coalesced_ptr<Quadf> pQuad(rawQuad);
55
+ #else
56
+ auto pRRect = rawRRect;
57
+ auto pQuad = rawQuad;
58
+ #endif
59
+
60
+ RRectf rrect = pRRect[jobIdx];
61
+
62
+ T cellOff = cellSize / TWO;
63
+ Quadf cvQuad = cvt_rrect_to_quad<T>(rrect, cellSize, cellOff, col, row);
64
+
65
+ pQuad[jobIdx] = cvQuad;
66
+ }
67
+
68
+ torch::Tensor rrect_to_quads_gpu(torch::Tensor rrects, float cellSize)
69
+ {
70
+ if (!rrects.is_contiguous()) {
71
+ throw std::runtime_error("Expected the rrects to be contiguous!");
72
+ }
73
+
74
+ torch::Tensor quads = torch::empty({ rrects.size(0), rrects.size(1), rrects.size(2), 4, 2 }, rrects.options());
75
+
76
+ auto rrFlat = rrects.flatten(0, 2);
77
+ auto qFlat = quads.flatten(0, 2);
78
+
79
+ dim3 blockSize(96);
80
+ dim3 gridSize(div_up(qFlat.size(0), blockSize.x));
81
+
82
+ if (quads.numel() > 0) {
83
+ AT_DISPATCH_FLOATING_TYPES(
84
+ quads.scalar_type(),
85
+ "cuda_rrect_to_quads",
86
+ ([&] {
87
+
88
+ device_rrect_to_quads_gpu<scalar_t> KERNEL_ARG2(gridSize, blockSize) (
89
+ rrFlat.packed_accessor64<scalar_t, 2>(),
90
+ qFlat.packed_accessor64<scalar_t, 3>(),
91
+ rrects.size(1), rrects.size(2),
92
+ cellSize
93
+ );
94
+
95
+ })
96
+ );
97
+ }
98
+
99
+ return quads;
100
+ }
101
+
102
+ template<typename scalar_t>
103
+ __global__
104
+ void device_rrect_to_quads_backward_gpu(torch::PackedTensorAccessor64<scalar_t, 2> rrect,
105
+ torch::PackedTensorAccessor64<scalar_t, 3> gradOutput,
106
+ torch::PackedTensorAccessor64<scalar_t, 2> gradInput)
107
+ {
108
+ const int64_t jobIdx = blockIdx.x * blockDim.x + threadIdx.x;
109
+
110
+ if (jobIdx >= rrect.size(0)) return;
111
+
112
+ assign_grad_rrect_to_quad<scalar_t>(rrect[jobIdx], gradOutput[jobIdx], gradInput[jobIdx]);
113
+ }
114
+
115
+
116
+ torch::Tensor rrect_to_quads_backward_gpu(torch::Tensor rrects, torch::Tensor gradOutput)
117
+ {
118
+ auto gradInput = torch::empty_like(rrects);
119
+
120
+ auto flatRRects = rrects.reshape({ -1, 5 });
121
+ auto flatGradOutput = gradOutput.reshape({ -1, 4, 2 });
122
+ auto flatGradInput = gradInput.reshape({ -1, 5 });
123
+
124
+ dim3 blockSize(32);
125
+ dim3 gridSize(div_up(rrects.size(0) * rrects.size(1) * rrects.size(2), blockSize.x));
126
+
127
+ if (rrects.numel() > 0) {
128
+ AT_DISPATCH_FLOATING_TYPES(
129
+ rrects.scalar_type(),
130
+ "cuda_rrect_to_quads_backward",
131
+ ([&] {
132
+ device_rrect_to_quads_backward_gpu KERNEL_ARG2(gridSize, blockSize) (
133
+ flatRRects.packed_accessor64<scalar_t, 2>(),
134
+ flatGradOutput.packed_accessor64<scalar_t, 3>(),
135
+ flatGradInput.packed_accessor64<scalar_t, 2>()
136
+ );
137
+ })
138
+ );
139
+ }
140
+
141
+ return gradInput;
142
+ }
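A quick host-side check of the indexing scheme used by device_rrect_to_quads_gpu: the (B, H, W) grid of rrects is flattened into one job per cell, each job recovers its row and column from the flat index, and the launch is sized with div_up so every job is covered. The div_up below is a local stand-in that mirrors what the repo's helper of the same name is assumed to do.

// Host-side check of the flat-index arithmetic and launch sizing used by the kernel above.
#include <cassert>
#include <cstdint>

static int64_t div_up(int64_t n, int64_t d) { return (n + d - 1) / d; }

int main() {
    const int64_t B = 2, H = 3, W = 5;
    const int64_t numJobs = B * H * W;

    // Launch sizing: enough blocks of `blockSize` threads to cover every job
    const int64_t blockSize = 96;
    const int64_t gridSize = div_up(numJobs, blockSize);
    assert(gridSize * blockSize >= numJobs);

    // Per-job row/col recovery, mirroring the kernel's arithmetic
    for (int64_t jobIdx = 0; jobIdx < numJobs; ++jobIdx) {
        int64_t row = jobIdx / W;          // row within the flattened (B*H) rows
        const int64_t col = jobIdx - row * W;
        row = row % H;                     // wrap back into the image's row range
        assert(col >= 0 && col < W && row >= 0 && row < H);
    }
    return 0;
}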
nemo-retriever-ocr/cpp/geometry_api/get_rel_continuation_cos.cpp ADDED
@@ -0,0 +1,60 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #include "geometry_api.h"
6
+
7
+ #include "../geometry.h"
8
+
9
+ using namespace std;
10
+
11
+
12
+ float get_rel_continuation_cos(torch::Tensor rrectATensor, torch::Tensor rrectBTensor)
13
+ {
14
+ typedef Point_<float> Pointf;
15
+
16
+ if (rrectATensor.size(0) != 4 || rrectBTensor.size(0) != 4) {
17
+ throw runtime_error("Invalid rrect arguments. Both must have 4 vertices! A=" +
18
+ to_string(rrectATensor.size(0)) + ", B=" + to_string(rrectBTensor.size(0)));
19
+ }
20
+
21
+ auto rrectA = rrectATensor.accessor<float, 2>();
22
+ auto rrectB = rrectBTensor.accessor<float, 2>();
23
+
24
+ Pointf aPts[4] = {
25
+ rrectA[0], rrectA[1], rrectA[2], rrectA[3]
26
+ };
27
+
28
+ auto c1 = (aPts[0] + aPts[3]) / 2.0f;
29
+ auto c2 = (aPts[1] + aPts[2]) / 2.0f;
30
+
31
+ auto aDir = c2 - c1;
32
+ auto aLen = length(aDir);
33
+
34
+ if (aLen > 0) {
35
+ aDir /= aLen;
36
+ } else {
37
+ aDir = Pointf{ 1, 0 };
38
+ }
39
+
40
+ auto centerA = (c1 + c2) / 2.0f;
41
+
42
+ Pointf bPts[4] = {
43
+ rrectB[0], rrectB[1], rrectB[2], rrectB[3]
44
+ };
45
+
46
+ auto centerB = (bPts[0] + bPts[1] + bPts[2] + bPts[3]) / 4.0f;
47
+
48
+ auto connDir = centerB - centerA;
49
+ auto connLen = length(connDir);
50
+
51
+ if (connLen == 0.0f) {
52
+ return 1.0f;
53
+ }
54
+
55
+ connDir /= connLen;
56
+
57
+ auto cosT = dot(aDir, connDir);
58
+
59
+ return cosT;
60
+ }
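The continuation test above compares region A's left-to-right reading direction (midpoint of its left edge to midpoint of its right edge) with the vector from A's center to B's center: a cosine near 1 means B sits directly "after" A along that direction. A standalone restatement in plain std C++ follows; the helper names are illustrative only.

// Standalone restatement of get_rel_continuation_cos above.
#include <cmath>
#include <cstdio>

struct P { float x, y; };
static P sub(P a, P b) { return { a.x - b.x, a.y - b.y }; }
static float dotp(P a, P b) { return a.x * b.x + a.y * b.y; }
static float len(P a) { return std::sqrt(dotp(a, a)); }

float continuation_cos(const P a[4], const P b[4]) {
    const P leftMid { (a[0].x + a[3].x) / 2, (a[0].y + a[3].y) / 2 };
    const P rightMid{ (a[1].x + a[2].x) / 2, (a[1].y + a[2].y) / 2 };

    P dir = sub(rightMid, leftMid);
    const float dirLen = len(dir);
    if (dirLen > 0) { dir.x /= dirLen; dir.y /= dirLen; } else { dir = { 1, 0 }; }

    const P centerA{ (leftMid.x + rightMid.x) / 2, (leftMid.y + rightMid.y) / 2 };
    const P centerB{ (b[0].x + b[1].x + b[2].x + b[3].x) / 4,
                     (b[0].y + b[1].y + b[2].y + b[3].y) / 4 };

    P conn = sub(centerB, centerA);
    const float connLen = len(conn);
    if (connLen == 0.0f) return 1.0f;   // coincident centers count as a continuation
    conn.x /= connLen; conn.y /= connLen;

    return dotp(dir, conn);
}

int main() {
    const P a[4] = { {0, 0}, {4, 0}, {4, 2}, {0, 2} };
    const P b[4] = { {5, 0}, {9, 0}, {9, 2}, {5, 2} };   // directly to the right of a
    std::printf("cos = %f\n", continuation_cos(a, b));   // ~1.0
}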
nemo-retriever-ocr/cpp/geometry_api/matrix2x2.h ADDED
@@ -0,0 +1,93 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #pragma once
6
+
7
+ #include "../geometry.h"
8
+
9
+
10
+ struct contiguous_tag{};
11
+
12
+ struct transpose_tag{};
13
+
14
+ template<typename layout_t, uint32_t R, uint32_t C>
15
+ struct Matrix2x2_Offset;
16
+
17
+ template<uint32_t R, uint32_t C>
18
+ struct Matrix2x2_Offset<contiguous_tag, R, C>
19
+ {
20
+ static const uint32_t OFFSET = R * 2 + C;
21
+ };
22
+
23
+ template<uint32_t R, uint32_t C>
24
+ struct Matrix2x2_Offset<transpose_tag, R, C>
25
+ {
26
+ static const uint32_t OFFSET = C * 2 + R;
27
+ };
28
+
29
+
30
+ template<typename T, typename layout_t, uint32_t R, uint32_t C>
31
+ struct Matrix2x2_Indexor
32
+ {
33
+ static const uint32_t OFFSET = Matrix2x2_Offset<layout_t, R, C>::OFFSET;
34
+
35
+ static T &get(T *data) { return data[OFFSET]; }
36
+ static const T get(const T *data) { return data[OFFSET]; }
37
+ };
38
+
39
+
40
+ template<typename T>
41
+ struct Matrix2x2
42
+ {
43
+ Matrix2x2() = default;
44
+ Matrix2x2(T r0c0, T r0c1, T r1c0, T r1c1)
45
+ : m_data{ r0c0, r0c1, r1c0, r1c1 }
46
+ {
47
+ }
48
+ Matrix2x2(const Point_<T> &r0, const Point_<T> &r1)
49
+ : m_data{ r0.X, r0.Y, r1.X, r1.Y }
50
+ {
51
+ }
52
+ Matrix2x2(const Point_<T> &r0, const Point_<T> &r1, transpose_tag)
53
+ : m_data{ r0.X, r1.X, r0.Y, r1.Y }
54
+ {
55
+ }
56
+
57
+ inline T &operator[](uint32_t i) { return m_data[i]; }
58
+ inline const T operator[](uint32_t i) const { return m_data[i]; }
59
+
60
+ T m_data[4];
61
+ };
62
+
63
+ template<typename T, typename layout_t>
64
+ struct Matrix2x2_View
65
+ {
66
+ Matrix2x2_View(const Matrix2x2<T> &m) : m_data(m.m_data) {}
67
+
68
+ const T *m_data;
69
+ };
70
+
71
+ template<uint32_t R, uint32_t C, typename T, typename layout_t>
72
+ const T get(const Matrix2x2_View<T, layout_t> &m)
73
+ {
74
+ return Matrix2x2_Indexor<T, layout_t, R, C>::get(m.m_data);
75
+ }
76
+
77
+ template<typename T, typename get_pt_t, typename callback_t, typename layout_t = contiguous_tag>
78
+ inline
79
+ void matmul_fn(int64_t N, const get_pt_t &get_fn, const Matrix2x2<T> &mat, const callback_t &callback,
80
+ layout_t lt = layout_t{})
81
+ {
82
+ Matrix2x2_View<T, layout_t> m{ mat };
83
+
84
+ #pragma omp simd
85
+ for (int64_t i = 0; i < N; ++i) {
86
+ Point_<T> pt = get_fn(i);
87
+
88
+ T x = pt.X * get<0, 0>(m) + pt.Y * get<1, 0>(m);
89
+ T y = pt.X * get<0, 1>(m) + pt.Y * get<1, 1>(m);
90
+
91
+ callback(i, Point_<T>{ x, y });
92
+ }
93
+ }
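The point of the layout tags above is that the same 2x2 multiply template can read the matrix either as stored or transposed, with the choice made entirely at compile time, so there is no runtime branch and no copied matrix. The sketch below shows the same trick in plain std C++; the tag and function names here are illustrative, not the repo's.

// Illustrative sketch of compile-time layout selection for a 2x2 row-vector multiply.
#include <cstdint>
#include <cstdio>

struct row_major_tag {};
struct transposed_tag {};

template<typename tag, uint32_t R, uint32_t C> struct offset_of;
template<uint32_t R, uint32_t C> struct offset_of<row_major_tag, R, C>  { static const uint32_t value = R * 2 + C; };
template<uint32_t R, uint32_t C> struct offset_of<transposed_tag, R, C> { static const uint32_t value = C * 2 + R; };

template<typename tag>
void transform_point(const float m[4], float x, float y, float &ox, float &oy) {
    // Row-vector times matrix: [x y] * M, with M read through the chosen layout tag
    ox = x * m[offset_of<tag, 0, 0>::value] + y * m[offset_of<tag, 1, 0>::value];
    oy = x * m[offset_of<tag, 0, 1>::value] + y * m[offset_of<tag, 1, 1>::value];
}

int main() {
    // 90-degree rotation, stored row-major
    const float rot[4] = { 0, 1, -1, 0 };
    float x, y;
    transform_point<row_major_tag>(rot, 1, 0, x, y);
    std::printf("forward: (%g, %g)\n", x, y);    // (0, 1)
    transform_point<transposed_tag>(rot, 0, 1, x, y);
    std::printf("inverse: (%g, %g)\n", x, y);    // (1, 0): transpose of a rotation is its inverse
}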
nemo-retriever-ocr/cpp/geometry_api/poly_bounds_quad.cpp ADDED
@@ -0,0 +1,61 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #include "geometry_api.h"
6
+
7
+ using namespace std;
8
+
9
+
10
+ template<typename T>
11
+ void pt_assign(torch::TensorAccessor<T, 1> acc, T x, T y)
12
+ {
13
+ acc[0] = x;
14
+ acc[1] = y;
15
+ }
16
+
17
+
18
+ template<typename T>
19
+ void poly_bounds_quad_impl(torch::TensorAccessor<T, 2> poly, torch::TensorAccessor<T, 2> outBounds)
20
+ {
21
+ T minX = poly[0][0],
22
+ minY = poly[0][1],
23
+ maxX = poly[0][0],
24
+ maxY = poly[0][1];
25
+
26
+ const int64_t numVertices = poly.size(0);
27
+
28
+ for (int64_t i = 0; i < numVertices; ++i) {
29
+ auto vert = poly[i];
30
+
31
+ minX = min(minX, vert[0]);
32
+ maxX = max(maxX, vert[0]);
33
+
34
+ minY = min(minY, vert[1]);
35
+ maxY = max(maxY, vert[1]);
36
+ }
37
+
38
+ pt_assign(outBounds[0], minX, minY);
39
+ pt_assign(outBounds[1], maxX, minY);
40
+ pt_assign(outBounds[2], maxX, maxY);
41
+ pt_assign(outBounds[3], minX, maxY);
42
+ }
43
+
44
+
45
+ torch::Tensor get_poly_bounds_quad(torch::Tensor poly)
46
+ {
47
+ auto ret = torch::empty({ 4, 2 }, poly.options());
48
+
49
+ AT_DISPATCH_FLOATING_TYPES(
50
+ poly.scalar_type(),
51
+ "poly_bounds_quad_impl",
52
+ ([&] {
53
+ poly_bounds_quad_impl(
54
+ poly.accessor<scalar_t, 2>(),
55
+ ret.accessor<scalar_t, 2>()
56
+ );
57
+ })
58
+ );
59
+
60
+ return ret;
61
+ }
nemo-retriever-ocr/cpp/graph_detection/encode_util.cpp ADDED
@@ -0,0 +1,272 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #include "encode_util.h"
6
+
7
+ #include <algorithm>
8
+ #include <numeric>
9
+ #include <sstream>
10
+
11
+ #include "../third_party/clipper/clipper.hpp"
12
+
13
+ using namespace std;
14
+
15
+ namespace graph_detection {
16
+
17
+ template<typename T>
18
+ struct Candidate : Edge {
19
+ T C;
20
+
21
+ Candidate() = default;
22
+ Candidate(int32_t a, int32_t b, T c) : Edge(a, b), C(c) {}
23
+ };
24
+
25
+ struct DistStruct {
26
+ Candidate<Pointf> A;
27
+ Candidate<Pointf> B;
28
+ float Dist;
29
+
30
+ DistStruct() = default;
31
+ DistStruct(Candidate<Pointf> a, Candidate<Pointf> b, float dist) : A(a), B(b), Dist(dist) {}
32
+ };
33
+
34
+ template<typename T>
35
+ float vec_cos(const Point_<T> &a, const Point_<T> &b)
36
+ {
37
+ return dot(a, b) / (length(a) * length(b) + 1e-8);
38
+ }
39
+
40
+ template<typename T, typename Fn = std::less<T>>
41
+ vector<size_t> arg_sort(const vector<T> &vec, Fn comp = Fn())
42
+ {
43
+ vector<size_t> ret;
44
+ ret.reserve(vec.size());
45
+ for (size_t i = 0; i < vec.size(); ++i) {
46
+ ret.push_back(i);
47
+ }
48
+
49
+ sort(begin(ret), end(ret),
50
+ [&vec, &comp] (size_t idxA, size_t idxB) {
51
+ return comp(vec[idxA], vec[idxB]);
52
+ }
53
+ );
54
+
55
+ return ret;
56
+ }
57
+
58
+
59
+ float edge_length(const Polygon_<float> &poly, const vector<Edge> &edges);
60
+
61
+ vector<Edge> find_bottom(const Polygon_<float> &poly, bool useVertexOrder)
62
+ {
63
+ if (poly.Count < 4) {
64
+ throw runtime_error("Invalid polygon. Fewer than 4 vertices!");
65
+ }
66
+
67
+ // If we trust the source of the geometries, this both saves us computation
68
+ // and can be more reliable, since we won't reorder the vertices
69
+ if (useVertexOrder) {
70
+ if ((poly.Count % 2) == 1) {
71
+ throw runtime_error("Can't use trusted vertex order when the vertex count is odd!");
72
+ }
73
+ int32_t halfCt = poly.Count / 2;
74
+ return { { halfCt - 1, halfCt },
75
+ { static_cast<int32_t>(poly.Count) - 1, 0 } };
76
+ }
77
+
78
+ if (poly.Count == 4) {
79
+ float d1 = length(poly[1] - poly[0]) + length(poly[2] - poly[3]);
80
+ float d2 = length(poly[2] - poly[1]) + length(poly[0] - poly[3]);
81
+
82
+ if (4 * d1 < d2) {
83
+ return { { 0, 1 }, { 2, 3 } };
84
+ } else {
85
+ return { { 1, 2 }, { 3, 0 } };
86
+ }
87
+ }
88
+
89
+ auto idx_wrap = [&poly] (size_t idx) {
90
+ return poly[idx % poly.Count];
91
+ };
92
+
93
+ vector<Candidate<float>> candidates;
94
+ for (size_t i = 1; i < (poly.Count + 1); ++i) {
95
+ auto vPrev = idx_wrap(i) - idx_wrap(i - 1);
96
+ auto vNext = idx_wrap(i + 2) - idx_wrap(i + 1);
97
+
98
+ // We're looking for the segment where the preceding and following segments
99
+ // essentially travel in opposite directions
100
+ if (vec_cos(vPrev, vNext) < -0.875f) {
101
+ auto currSeg = idx_wrap(i) - idx_wrap(i + 1);
102
+ candidates.emplace_back(i % poly.Count, (i + 1) % poly.Count, length(currSeg));
103
+ }
104
+ }
105
+
106
+ if (candidates.size() != 2 || candidates[0].A == candidates[1].B || candidates[0].B == candidates[1].A) {
107
+ // If fewer than two candidates were found, or the two bottom edges are joined, select the two farthest edges
108
+ vector<Candidate<Pointf>> midList;
109
+ for (size_t i = 0; i < poly.Count; ++i) {
110
+ Pointf midPoint = (idx_wrap(i) + idx_wrap(i + 1)) / 2.0f;
111
+ midList.emplace_back(i, (i + 1) % poly.Count, midPoint);
112
+ }
113
+
114
+ vector<DistStruct> distList;
115
+
116
+ // Only found one good candidate, so search for the edge that's the furthest from this candidate
117
+ if (candidates.size() == 1) {
118
+ auto idx1a = candidates.back().A;
119
+ auto idx1b = candidates.back().B;
120
+ Candidate<Pointf> cand1{ idx1a, idx1b, (idx_wrap(idx1a) + idx_wrap(idx1b)) / 2.0f };
121
+ for (size_t j = 0; j < poly.Count; ++j) {
122
+ auto &cand2 = midList[j];
123
+
124
+ if (cand1.Touches(cand2)) continue;
125
+
126
+ float dist = length(cand1.C - cand2.C);
127
+ distList.emplace_back(cand1, cand2, dist);
128
+ }
129
+ } else {
130
+ for (size_t i = 0; i < poly.Count; ++i) {
131
+ for (size_t j = i + 1; j < poly.Count; ++j) {
132
+ auto &cand1 = midList[i];
133
+ auto &cand2 = midList[j];
134
+
135
+ if (cand1.Touches(cand2)) continue;
136
+
137
+ float dist = length(cand1.C - cand2.C);
138
+ distList.emplace_back(cand1, cand2, dist);
139
+ }
140
+ }
141
+ }
142
+ sort(begin(distList), end(distList), [] (auto a, auto b) { return a.Dist < b.Dist; });
143
+
144
+ if (distList.empty()) {
145
+ throw runtime_error("No valid bottom candidates found for this polygon!");
146
+ }
147
+
148
+ auto &bEdge = distList.back();
149
+ return vector<Edge>{ bEdge.A, bEdge.B };
150
+
151
+ } else {
152
+ return vector<Edge>{ candidates[0], candidates[1] };
153
+ }
154
+ }
155
+
156
+ void find_long_edges(const Polygon_<float> &poly, Edge *bottoms, vector<Edge> &outLongEdge1, vector<Edge> &outLongEdge2)
157
+ {
158
+ int32_t b1End = bottoms[0].B;
159
+ int32_t b2End = bottoms[1].B;
160
+
161
+ int32_t nPoints = poly.Count;
162
+
163
+ auto accum_into = [nPoints] (int32_t end1, int32_t end2, vector<Edge> &outEdge) {
164
+ int32_t i = (end1 + 1) % nPoints;
165
+ while ((i % nPoints) != end2) {
166
+ int32_t start = i > 0 ? i - 1 : nPoints - 1;
167
+ int32_t end = i % nPoints;
168
+ outEdge.emplace_back(start, end);
169
+ i = (i + 1) % nPoints;
170
+ }
171
+ };
172
+
173
+ accum_into(b1End, b2End, outLongEdge1);
174
+ accum_into(b2End, b1End, outLongEdge2);
175
+ }
176
+
177
+ float edge_length(const Polygon_<float> &poly, const vector<Edge> &edges)
178
+ {
179
+ float ret = 0.0f;
180
+ for (const Edge &e : edges) {
181
+ ret += length(poly[e.B] - poly[e.A]);
182
+ }
183
+ return ret;
184
+ }
185
+
186
+ vector<float> edge_lengths(const Polygon_<float> &poly, const vector<Edge> &edges)
187
+ {
188
+ if (edges.empty()) {
189
+ throw runtime_error("Found an empty edge!");
190
+ }
191
+
192
+ vector<float> ret;
193
+ ret.reserve(edges.size());
194
+
195
+ for (const Edge &e : edges) {
196
+ ret.push_back(length(poly[e.B] - poly[e.A]));
197
+ }
198
+
199
+ return ret;
200
+ }
201
+
202
+ void split_edge_sequence(const Polygon_<float> &poly, const vector<Edge> &edges,
203
+ const vector<float> &edgeLengths, float nParts,
204
+ vector<Pointf> &outPts);
205
+
206
+ void split_edge_sequence_by_step(const Polygon_<float> &poly, const vector<Edge> &longEdge1, const vector<Edge> &longEdge2,
207
+ float step, vector<Pointf> &outInnerPoints1, vector<Pointf> &outInnerPoints2)
208
+ {
209
+ auto edgeLengths1 = edge_lengths(poly, longEdge1);
210
+ auto edgeLengths2 = edge_lengths(poly, longEdge2);
211
+
212
+ float totalLength = (accumulate(begin(edgeLengths1), end(edgeLengths1), 0.0f) + accumulate(begin(edgeLengths2), end(edgeLengths2), 0.0f)) / 2;
213
+
214
+ float nParts = max<float>(ceil(totalLength / step), 2);
215
+
216
+ split_edge_sequence(poly, longEdge1, edgeLengths1, nParts, outInnerPoints1);
217
+ split_edge_sequence(poly, longEdge2, edgeLengths2, nParts, outInnerPoints2);
218
+ }
219
+
220
+ void split_edge_sequence(const Polygon_<float> &poly, const vector<Edge> &edges,
221
+ const vector<float> &edgeLengths, float nParts,
222
+ vector<Pointf> &outPts)
223
+ {
224
+ vector<float> elCumSum = vec_cumsum(edgeLengths);
225
+
226
+ float totalLength = elCumSum.back();
227
+ float lengthPerPart = totalLength / (nParts - 1);
228
+
229
+ size_t iNumParts = nParts;
230
+ size_t currNode = 0;
231
+ size_t ctr = 0;
232
+ for (float i = 0.0f; ctr < iNumParts; i += 1.0f, ++ctr) {
233
+ float t = min(i * lengthPerPart, totalLength);
234
+
235
+ while (t > elCumSum[currNode + 1]) {
236
+ ++currNode;
237
+ }
238
+
239
+ Edge currEdge = edges[currNode];
240
+ Pointf e1 = poly[currEdge.A];
241
+ Pointf e2 = poly[currEdge.B];
242
+
243
+ float currLen = edgeLengths[currNode];
244
+
245
+ Pointf sampledPt;
246
+
247
+ if (currLen > 0) {
248
+ float deltaT = t - elCumSum[currNode];
249
+ float ratio = deltaT / currLen;
250
+ sampledPt = e1 + ratio * (e2 - e1);
251
+ } else {
252
+ sampledPt = e1;
253
+ }
254
+
255
+ outPts.push_back(sampledPt);
256
+ }
257
+ }
258
+
259
+ string print_poly(const Polyf &poly) {
260
+ ostringstream oss;
261
+ oss << "[";
262
+ for (size_t i = 0; i < poly.Count; ++i) {
263
+ if (i > 0) {
264
+ oss << ", ";
265
+ }
266
+ oss << "(" << poly[i].X << ", " << poly[i].Y << ")";
267
+ }
268
+ oss << "]";
269
+ return oss.str();
270
+ }
271
+
272
+ } // namespace graph_detection
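split_edge_sequence above resamples a chain of edges at evenly spaced arc-length positions: it builds the cumulative edge lengths, steps a target length along them, and linearly interpolates inside whichever edge contains each target position. The sketch below restates that idea over a simple polyline in plain std C++; the names are illustrative only.

// Standalone sketch of arc-length resampling, mirroring split_edge_sequence above.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

struct P { float x, y; };

std::vector<P> resample_polyline(const std::vector<P> &pts, int nParts) {
    // Cumulative arc length, starting at 0
    std::vector<float> cum{ 0.0f };
    for (size_t i = 1; i < pts.size(); ++i) {
        const float dx = pts[i].x - pts[i - 1].x, dy = pts[i].y - pts[i - 1].y;
        cum.push_back(cum.back() + std::sqrt(dx * dx + dy * dy));
    }

    const float total = cum.back();
    const float step = total / (nParts - 1);

    std::vector<P> out;
    size_t seg = 0;
    for (int i = 0; i < nParts; ++i) {
        const float t = std::min(i * step, total);
        while (t > cum[seg + 1]) ++seg;           // advance to the edge containing t

        const float segLen = cum[seg + 1] - cum[seg];
        const float ratio = segLen > 0 ? (t - cum[seg]) / segLen : 0.0f;
        out.push_back({ pts[seg].x + ratio * (pts[seg + 1].x - pts[seg].x),
                        pts[seg].y + ratio * (pts[seg + 1].y - pts[seg].y) });
    }
    return out;
}

int main() {
    // An L-shaped polyline of total length 2, resampled into 5 evenly spaced points
    const std::vector<P> line{ {0, 0}, {1, 0}, {1, 1} };
    for (const P &p : resample_polyline(line, 5)) std::printf("(%g, %g)\n", p.x, p.y);
}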
nemo-retriever-ocr/cpp/graph_detection/encode_util.h ADDED
@@ -0,0 +1,184 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #pragma once
6
+
7
+ #include <vector>
8
+ #include <random>
9
+ #include <algorithm>
+ #include <numeric> // std::accumulate, used by avg_point below
10
+
11
+ #include "../geometry.h"
12
+
13
+ namespace graph_detection {
14
+
15
+
16
+
17
+ struct Edge {
18
+ int32_t A;
19
+ int32_t B;
20
+
21
+ Edge() = default;
22
+ Edge(int32_t a, int32_t b) : A(a), B(b) {}
23
+
24
+ bool Touches(int32_t idx) const { return A == idx || B == idx; }
25
+ bool Touches(const Edge &other) const;
26
+ };
27
+
28
+ inline
29
+ bool edge_touches(const Edge &edge, int32_t vertex) {
30
+ return edge.A == vertex || edge.B == vertex;
31
+ }
32
+
33
+ inline
34
+ bool Edge::Touches(const Edge &other) const {
35
+ return edge_touches(other, A) || edge_touches(other, B);
36
+ }
37
+
38
+ typedef Point_<float> Pointf;
39
+ typedef AABB_<float> AABBf;
40
+ typedef Polygon_<float> Polyf;
41
+ typedef std::vector<Pointf> Polyline;
42
+
43
+ std::vector<Edge> find_bottom(const Polygon_<float> &poly, bool useVertexOrder);
44
+
45
+ void find_long_edges(const Polygon_<float> &poly, Edge *bottoms, std::vector<Edge> &outLongEdge1, std::vector<Edge> &outLongEdge2);
46
+
47
+ void split_edge_sequence_by_step(const Polygon_<float> &poly, const std::vector<Edge> &longEdge1, const std::vector<Edge> &longEdge2,
48
+ float step, std::vector<Pointf> &outInnerPoints1, std::vector<Pointf> &outInnerPoints2);
49
+
50
+ std::string print_poly(const Polyf &poly);
51
+
52
+ template<typename T>
53
+ inline
54
+ std::vector<T> vec_cumsum(const std::vector<T> &v)
55
+ {
56
+ std::vector<T> ret;
57
+ ret.reserve(v.size() + 1);
58
+ ret.push_back(0);
59
+ for (T val : v) {
60
+ ret.push_back(ret.back() + val);
61
+ }
62
+ return ret;
63
+ }
64
+
65
+ template<typename RandEng, typename Fn>
66
+ inline
67
+ void n_choose_k(size_t n, size_t k, RandEng &randEng, Fn fn)
68
+ {
69
+ if (k == 0) return;
70
+
71
+ // TODO(mranzinger): This algorithm can be replaced with sampling from a geometric
72
+ // distribution, which drastically reduces the runtime complexity
73
+ for (size_t i = 0; i < n; ++i) {
74
+ size_t leftover = n - i;
75
+ if (leftover <= k) {
76
+ fn(i);
77
+ --k;
78
+ } else {
79
+ float p = std::uniform_real_distribution<float>(0.0f, 1.0f)(randEng);
80
+ float probSample = float{k} / float{leftover};
81
+ if (p < probSample) {
82
+ fn(i);
83
+ --k;
84
+ }
85
+ }
86
+ }
87
+ }
88
+
89
+ template<typename T>
90
+ inline T clamp(T val, T minVal, T maxVal) {
91
+ return std::max(std::min(val, maxVal), minVal);
92
+ }
93
+
94
+ inline
95
+ Pointf avg_point(const std::vector<Pointf> &points)
96
+ {
97
+ return std::accumulate(std::begin(points), std::end(points), Pointf(0,0)) / float(points.size());
98
+ }
99
+
100
+ inline
101
+ float vector_sin(const Pointf &pt)
102
+ {
103
+ // sin = y / len(pt)
104
+ return pt.Y / (length(pt) + 1e-8);
105
+ }
106
+
107
+ inline
108
+ float vector_cos(const Pointf &pt)
109
+ {
110
+ // cos = x / len(pt)
111
+ return pt.X / (length(pt) + 1e-8);
112
+ }
113
+
114
+ inline
115
+ void vector_cos_sin(const Pointf & pt, float &outCos, float &outSin)
116
+ {
117
+ float len = length(pt) + 1e-8;
118
+ outCos = pt.X / len;
119
+ outSin = pt.Y / len;
120
+ }
121
+
122
+ inline
123
+ float point_dist_to_line(const Pointf &l1, const Pointf &l2, const Pointf &pt)
124
+ {
125
+ auto d = l2 - l1;
126
+
127
+ auto lineLen = length(d);
128
+
129
+ if (lineLen > 0) {
130
+ float distance = abs(
131
+ d.Y * pt.X
132
+ - d.X * pt.Y
133
+ + l2.X * l1.Y
134
+ - l2.Y * l1.X
135
+ ) / lineLen;
136
+ return distance;
137
+ } else {
138
+ return length(pt - l1);
139
+ }
140
+ }
141
+
142
+ template<typename T>
143
+ T find_mode(std::vector<T> &inputs) {
144
+ using std::sort;
145
+ using std::begin;
146
+ using std::end;
147
+
148
+ if (inputs.empty()) {
149
+ throw std::runtime_error("Cannot find mode of empty distribution!");
150
+ }
151
+
152
+ sort(begin(inputs), end(inputs));
153
+
154
+ T currVal = inputs[0];
155
+ size_t currCount = 1;
156
+
157
+ T modeVal = inputs[0];
158
+ size_t modeCount = 1;
159
+
160
+ auto commitCurr = [&] () {
161
+ if (currCount > modeCount) {
162
+ modeCount = currCount;
163
+ modeVal = currVal;
164
+ }
165
+ };
166
+
167
+ for (size_t i = 1; i < inputs.size(); ++i) {
168
+ if (inputs[i] == currVal) {
169
+ ++currCount;
170
+ } else {
171
+ // Start of a new value
172
+ commitCurr();
173
+
174
+ currCount = 1;
175
+ currVal = inputs[i];
176
+ }
177
+ }
178
+
179
+ commitCurr();
180
+
181
+ return modeVal;
182
+ }
183
+
184
+ } // namespace graph_detection
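n_choose_k above performs sequential k-of-n sampling (Knuth's "selection sampling"): walk the n items once and accept item i with probability (items still needed) / (items still remaining), which yields a uniform random k-subset in increasing index order. A host-side sketch of that scheme follows; the names are illustrative only.

// Host-side sketch of the sequential k-of-n sampling used by n_choose_k above.
#include <cstdio>
#include <random>
#include <vector>

template<typename Fn>
void choose_k_of_n(size_t n, size_t k, std::mt19937 &rng, Fn accept) {
    std::uniform_real_distribution<float> unit(0.0f, 1.0f);
    for (size_t i = 0; i < n && k > 0; ++i) {
        const size_t remaining = n - i;
        // Accept with probability k / remaining (always accept once remaining <= k)
        if (remaining <= k || unit(rng) < static_cast<float>(k) / static_cast<float>(remaining)) {
            accept(i);
            --k;
        }
    }
}

int main() {
    std::mt19937 rng(1234);
    std::vector<size_t> picked;
    choose_k_of_n(10, 3, rng, [&](size_t i) { picked.push_back(i); });
    for (size_t i : picked) std::printf("%zu ", i);   // 3 distinct indices in increasing order
    std::printf("\n");
}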
nemo-retriever-ocr/cpp/half_ops.cu ADDED
@@ -0,0 +1,5 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #include "half_ops.cuh"
nemo-retriever-ocr/cpp/half_ops.cuh ADDED
@@ -0,0 +1,149 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #pragma once
6
+
7
+ #include <torch/torch.h>
8
+
9
+ #include "cuda_intellisense.cuh"
10
+
11
+ #ifndef __CUDACC__
12
+ #pragma message("__CUDACC__ not defined!")
13
+ #else
14
+ #pragma message("__CUDACC__ defined!")
15
+ #endif
16
+
17
+ #ifdef __NVCC__
18
+ #define __qr_device__ __device__
19
+ #define __qr_host__ __host__
20
+ #define __qr_inline__ __forceinline__
21
+ #else
22
+ #define __qr_device__
23
+ #define __qr_host__
24
+ #define __qr_inline__ inline
25
+ #endif
26
+
27
+ #ifdef __CUDACC__
28
+ #include <cuda.h>
29
+ #include <cuda_runtime.h>
30
+ #include <cuda_fp16.h>
31
+
32
+
33
+ __qr_inline__ __device__ __half operator-(__half v) {
34
+ return __hneg(v);
35
+ }
36
+
37
+ __qr_inline__ __device__ __half operator+(__half a, __half b) {
38
+ return __hadd(a, b);
39
+ }
40
+
41
+ __qr_inline__ __device__ __half operator-(__half a, __half b) {
42
+ return __hsub(a, b);
43
+ }
44
+
45
+ __qr_inline__ __device__ __half operator*(__half a, __half b) {
46
+ return __hmul(a, b);
47
+ }
48
+
49
+ __qr_inline__ __device__ __half operator/(__half a, __half b) {
50
+ return __hdiv(a, b);
51
+ }
52
+
53
+ __qr_inline__ __device__ bool operator==(__half a, __half b) {
54
+ return __heq(a, b);
55
+ }
56
+
57
+ __qr_inline__ __device__ bool operator<(__half a, __half b) {
58
+ return __hlt(a, b);
59
+ }
60
+
61
+ __qr_inline__ __device__ bool operator>(__half a, __half b) {
62
+ return __hgt(a, b);
63
+ }
64
+
65
+ __qr_inline__ __device__ __half sqrt(__half v) {
66
+ return hsqrt(v);
67
+ }
68
+
69
+ __qr_inline__ __device__ __half floor(__half v) {
70
+ return hfloor(v);
71
+ }
72
+
73
+ __qr_inline__ __device__ __half ceil(__half v) {
74
+ return hceil(v);
75
+ }
76
+
77
+ __qr_inline__ __device__ __half max(__half a, __half b) {
78
+ return a > b ? a : b;
79
+ }
80
+ #endif //__CUDACC__
81
+
82
+ template<typename Src, typename Dest>
83
+ struct Convert {
84
+ __qr_inline__ static __qr_host__ __qr_device__ constexpr Dest From(Src value) { return static_cast<Dest>(value); }
85
+ __qr_inline__ static __qr_host__ __qr_device__ constexpr Src To(Dest value) { return static_cast<Src>(value); }
86
+ __qr_inline__ static __qr_host__ __qr_device__ constexpr Dest LeftToRight(Src value) { return static_cast<Dest>(value); }
87
+ __qr_inline__ static __qr_host__ __qr_device__ constexpr Src RightToLeft(Dest value) { return static_cast<Src>(value); }
88
+ };
89
+
90
+ #ifdef __CUDACC__
91
+ template<>
92
+ struct Convert<__half, float> {
93
+ __qr_inline__ static __host__ __device__ float From(__half value) { return __half2float(value); }
94
+ __qr_inline__ static __host__ __device__ __half To(float value) { return __float2half(value); }
95
+ __qr_inline__ static __host__ __device__ float LeftToRight(__half value) { return __half2float(value); }
96
+ __qr_inline__ static __host__ __device__ __half RightToLeft(float value) { return __float2half(value); }
97
+ };
98
+
99
+ template<typename Dest>
100
+ struct Convert<__half, Dest> : Convert<__half, float> {
101
+
102
+ };
103
+
104
+ namespace at {
105
+
106
+ template<>
107
+ inline __half* TensorBase::mutable_data_ptr() const {
108
+ TORCH_CHECK(scalar_type() == ScalarType::Half,
109
+ "expected scalar type Half but found ",
110
+ c10::toString(scalar_type()));
111
+ return static_cast<__half*>(this->unsafeGetTensorImpl()->mutable_data());
112
+ }
113
+
114
+ template<>
115
+ inline __half* TensorBase::data_ptr() const {
116
+ TORCH_CHECK(scalar_type() == ScalarType::Half,
117
+ "expected scalar type Half but found ",
118
+ c10::toString(scalar_type()));
119
+ return static_cast<__half*>(this->unsafeGetTensorImpl()->mutable_data());
120
+ }
121
+
122
+ }
123
+
124
+ template<typename T>
125
+ struct remap_half {
126
+ typedef T type;
127
+ };
128
+
129
+ template<>
130
+ struct remap_half<at::Half> {
131
+ typedef __half type;
132
+ };
133
+
134
+ template<typename T>
135
+ __half to_half(T val) {
136
+ return Convert<__half, T>::RightToLeft(val);
137
+ }
138
+
139
+ template<typename T>
140
+ struct fp_promote {
141
+ typedef T type;
142
+ };
143
+
144
+ template<>
145
+ struct fp_promote<__half> {
146
+ typedef float type;
147
+ };
148
+
149
+ #endif //__CUDACC__
nemo-retriever-ocr/cpp/local_ips/local_ips.h ADDED
@@ -0,0 +1,11 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #pragma once
6
+
7
+ #include <torch/torch.h>
8
+
9
+ torch::Tensor ragged_quad_all_2_all_distance_v2(torch::Tensor embedQuads, torch::Tensor quadsPerExample,
10
+ float xFactor, float yFactor,
11
+ bool allowSelfDistance);
nemo-retriever-ocr/cpp/local_ips/quad_all_2_all_dist_v2.cu ADDED
@@ -0,0 +1,162 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ #include <iostream>
7
+
8
+ #include <cooperative_groups.h>
9
+ #include <cooperative_groups/reduce.h>
10
+
11
+ #include <thrust/binary_search.h>
12
+ #include <thrust/execution_policy.h>
13
+
14
+ #include "local_ips.h"
15
+ #include "../cuda_intellisense.cuh"
16
+ #include "../common.h"
17
+ #include "../geometry.h"
18
+
19
+ using namespace std;
20
+ namespace cg = cooperative_groups;
21
+
22
+ typedef Point_<float> Pointf;
23
+
24
+ __device__ inline
25
+ float square(float val) { return val * val; }
26
+
27
+ __global__
28
+ void device_quad_all_2_all_distance_v2(torch::PackedTensorAccessor64<float, 4> allEmbedQuads,
29
+ torch::PackedTensorAccessor64<int64_t, 1> allRegionCounts,
30
+ torch::PackedTensorAccessor64<int64_t, 1> csWorkPerExample,
31
+ torch::PackedTensorAccessor64<float, 3> outDistances,
32
+ float xFactor, float yFactor,
33
+ bool allowSelfDistance)
34
+ {
35
+ // Note that the blockIdx.x is on purpose here
36
+ int64_t workIdx = blockIdx.x * blockDim.y + threadIdx.y;
37
+
38
+ if (workIdx >= csWorkPerExample[csWorkPerExample.size(0) - 1]) return;
39
+
40
+ auto exIter = thrust::upper_bound(thrust::seq,
41
+ csWorkPerExample.data(), csWorkPerExample.data() + csWorkPerExample.size(0),
42
+ workIdx);
43
+
44
+ const int64_t exIdx = exIter - csWorkPerExample.data();
45
+
46
+ const int64_t workStart = exIdx == 0 ? 0 : csWorkPerExample[exIdx - 1];
47
+ const int64_t workOff = workIdx - workStart;
48
+
49
+ const int64_t row = workOff / allRegionCounts[exIdx];
50
+ const int64_t col = workOff % allRegionCounts[exIdx];
51
+
52
+ auto taRowQuad = allEmbedQuads[exIdx][row];
53
+ auto taColQuad = allEmbedQuads[exIdx][col];
54
+
55
+ Quad_<float> rowQuad(taRowQuad.data()),
56
+ colQuad(taColQuad.data());
57
+
58
+ auto p1 = (rowQuad[0] + rowQuad[3]) / 2.0f;
59
+ auto p2 = (rowQuad[1] + rowQuad[2]) / 2.0f;
60
+
61
+ auto vX = p2 - p1;
62
+ auto lenVX = length(vX);
63
+ if (lenVX > 0) {
64
+ vX = vX / max(lenVX, 1e-8f);
65
+ } else {
66
+ vX = { 1, 0 };
67
+ }
68
+
69
+ Pointf vY{ -vX.Y, vX.X };
70
+
71
+ auto reproj = [&vX, &vY, xFactor, yFactor] (const Pointf &pt) {
72
+ auto dX = dot(pt, vX);
73
+ if (dX >= 0) {
74
+ dX *= xFactor;
75
+ }
76
+ auto dY = dot(pt, vY);
77
+ if (dY >= 0) {
78
+ dY *= yFactor;
79
+ }
80
+
81
+ return Pointf{ dX, dY };
82
+ };
83
+
84
+ auto tile16 = cg::tiled_partition<16>(cg::this_thread_block());
85
+
86
+ // Figure out which vertices this thread is processing
87
+ const int64_t rowVertexIdx = tile16.thread_rank() / 4;
88
+ const int64_t colVertexIdx = tile16.thread_rank() % 4;
89
+
90
+ float dist;
91
+ if (row != col) {
92
+ Segment_<float> rowSeg{ rowQuad[rowVertexIdx], rowQuad[(rowVertexIdx + 1) % 4] };
93
+ Segment_<float> colSeg{ colQuad[colVertexIdx], colQuad[(colVertexIdx + 1) % 4] };
94
+
95
+ Segment_<float> minSeg = shortest_line_between_segments(rowSeg, colSeg);
96
+
97
+ Point_<float> vSeg = minSeg.B - minSeg.A;
98
+
99
+ vSeg = reproj(vSeg);
100
+
101
+ dist = length(vSeg);
102
+ } else if (allowSelfDistance) {
103
+ dist = 0;
104
+ } else {
105
+ dist = numeric_limits<float>::infinity();
106
+ }
107
+
108
+ // Now find the minimum distance across the group
109
+ int lane = tile16.thread_rank();
110
+ // Each iteration halves the number of active threads
111
+ // Each thread gets the partial min[i] to min[lane+i]
112
+ #pragma unroll
113
+ for (uint32_t i = 1; i < 16; i <<= 1) {
114
+ auto otherDist = tile16.shfl_down(dist, i);
115
+ dist = min(dist, otherDist);
116
+ }
117
+
118
+ #ifndef NDEBUG
119
+ float lowestDist = tile16.shfl(dist, 0);
120
+ assert(dist >= lowestDist);
121
+ #endif
122
+
123
+ if (lane == 0) {
124
+ outDistances[exIdx][row][col] = dist;
125
+ }
126
+ }
127
+
128
+ torch::Tensor ragged_quad_all_2_all_distance_v2(torch::Tensor embedQuads, torch::Tensor regionCounts,
129
+ float xFactor, float yFactor,
130
+ bool allowSelfDistance)
131
+ {
132
+ if (!embedQuads.is_contiguous()) {
133
+ throw std::runtime_error("Expected `embedQuads` to be contiguous!");
134
+ }
135
+
136
+ auto outDistances = torch::zeros({ embedQuads.size(0), embedQuads.size(1), embedQuads.size(1) },
137
+ embedQuads.options());
138
+
139
+ if (embedQuads.numel() == 0) {
140
+ return outDistances;
141
+ }
142
+
143
+ auto workPerExample = regionCounts * regionCounts;
144
+
145
+ auto csWorkPerExample = torch::cumsum(workPerExample, 0);
146
+
147
+ int64_t totalWork = csWorkPerExample[-1].item<int64_t>();
148
+
149
+ dim3 blockSize(16, 2);
150
+ dim3 gridSize(div_up(totalWork, blockSize.y), 1);
151
+
152
+ device_quad_all_2_all_distance_v2 KERNEL_ARG2(gridSize, blockSize) (
153
+ embedQuads.packed_accessor64<float, 4>(),
154
+ regionCounts.packed_accessor64<int64_t, 1>(),
155
+ csWorkPerExample.packed_accessor64<int64_t, 1>(),
156
+ outDistances.packed_accessor64<float, 3>(),
157
+ xFactor, yFactor,
158
+ allowSelfDistance
159
+ );
160
+
161
+ return outDistances;
162
+ }
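The reduction at the end of the kernel above uses tile16.shfl_down so that, after offsets 1, 2, 4, 8, lane 0 holds the minimum of all 16 per-lane distances. Below is a host-side emulation of that pattern in plain std C++, for illustration only; on the device the array reads are replaced by the shuffle, and out-of-range source lanes are treated as returning the caller's own value.

// Host-side emulation of the 16-lane shfl_down min reduction used in the kernel above.
#include <algorithm>
#include <cstdio>

int main() {
    float lanes[16] = { 9, 3, 7, 5, 11, 2, 8, 6, 10, 4, 12, 1, 14, 13, 15, 0.5f };

    for (int off = 1; off < 16; off <<= 1) {
        float next[16];
        for (int lane = 0; lane < 16; ++lane) {
            // shfl_down(v, off): lane i reads lane (i + off); out-of-range lanes keep their own
            // value, which never changes the running minimum they already hold.
            const float other = (lane + off < 16) ? lanes[lane + off] : lanes[lane];
            next[lane] = std::min(lanes[lane], other);
        }
        std::copy(next, next + 16, lanes);
    }

    std::printf("tile minimum at lane 0: %g\n", lanes[0]);   // 0.5
}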
nemo-retriever-ocr/cpp/module.cpp ADDED
@@ -0,0 +1,125 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ #include "quad_rectify/quad_rectify.h"
7
+ #include "non_maximal_suppression/non_maximal_suppression.h"
8
+ #include "geometry_api/geometry_api.h"
9
+ #include "beam_decode/beam_decode.h"
10
+ #include "better_grid_sample/grid_sample.h"
11
+ #include "sparse_select/sparse_select.h"
12
+ #include "text_region_grouping/text_region_grouping.h"
13
+ #include "local_ips/local_ips.h"
14
+
15
+ #include <torch/extension.h>
16
+ #include <pybind11/stl.h>
17
+
18
+
19
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
20
+ m.def("quad_rectify_calc_quad_width", &quad_rectify_calc_quad_width,
21
+ "Quad Rectify Calc Quad Width C++",
22
+ py::arg("quads"),
23
+ py::arg("output_height"),
24
+ py::arg("round_factor") = 16,
25
+ py::arg("max_width") = 0
26
+ );
27
+ m.def("quad_rectify_forward", &quad_rectify_forward, "Quad Rectify Forward C++",
28
+ py::arg("quads"),
29
+ py::arg("image_height"), py::arg("image_width"),
30
+ py::arg("output_height"), py::arg("output_width"),
31
+ py::arg("isotropic") = true
32
+ );
33
+ m.def("quad_rectify_backward", &quad_rectify_backward, "Quad Rectify Backward C++",
34
+ py::arg("quads"), py::arg("grad_output"),
35
+ py::arg("image_height"), py::arg("image_width"),
36
+ py::arg("isotropic") = true
37
+ );
38
+ m.def("quad_non_maximal_suppression", &quad_non_maximal_suppression, "Quad Non-Maximal Suppression C++",
39
+ py::arg("quads"), py::arg("probs"),
40
+ py::arg("prob_threshold"), py::arg("iou_threshold"),
41
+ py::arg("kernel_height"), py::arg("kernel_width"),
42
+ py::arg("max_regions"),
43
+ py::arg("verbose") = false
44
+ );
45
+
46
+ py::class_<LanguageModel>(m, "LanguageModel");
47
+
48
+ m.def("beam_decode", &beam_decode, "beam_decode c++",
49
+ py::arg("probs"),
50
+ py::arg("beam_size") = 100,
51
+ py::arg("blank") = 0,
52
+ py::arg("min_prob") = 0.001,
53
+ py::arg("lang_model") = static_cast<LanguageModel*>(nullptr),
54
+ py::arg("lm_weight") = 1,
55
+ py::arg("combine_duplicates") = true
56
+ );
57
+
58
+ py::class_<TokenMappingWrapper, TokenMappingWrapper::Ptr>(m, "TokenMapping");
59
+
60
+ m.def("create_token_mapping", &create_token_mapping, "create token mapping c++",
61
+ py::arg("token_mapping")
62
+ );
63
+
64
+ m.def("decode_sequences", &decode_sequences, "decode_sequences c++",
65
+ py::arg("tokens"), py::arg("language_model"),
66
+ py::arg("probs") = nullptr
67
+ );
68
+
69
+ m.def("create_sbo_lm", &create_sbo_lm, "create_sbo_lm c++",
70
+ py::arg("data_file_path"),
71
+ py::arg("token_mapping"),
72
+ py::arg("backoff") = 0.4
73
+ );
74
+
75
+ m.def("indirect_grid_sample_forward", &indirect_grid_sample_forward, "indirect_grid_sample::forward c++",
76
+ py::arg("input"), py::arg("grid"), py::arg("input_indices"), py::arg("method")
77
+ );
78
+ m.def("indirect_grad_sample_backward", &indirect_grad_sample_backward, "indirect_grid_sample::backward c++",
79
+ py::arg("grad_output"), py::arg("input"), py::arg("grid"), py::arg("input_indices"), py::arg("method")
80
+ );
81
+ m.def("region_counts_to_indices", &region_counts_to_indices, "region counts to indices",
82
+ py::arg("region_counts"), py::arg("num_outputs")
83
+ );
84
+
85
+ m.def("rrect_to_quads", &rrect_to_quads, "convert rotated rectangle to quadrangles",
86
+ py::arg("rrects"), py::arg("cell_size")
87
+ );
88
+ m.def("rrect_to_quads_backward", &rrect_to_quads_backward, "gradient of rrect_to_quads",
89
+ py::arg("rrects"), py::arg("grad_output")
90
+ );
91
+
92
+ m.def("sparse_select", &sparse_select, "Select sparse tensor(s) given a set of indices",
93
+ py::arg("sparse_counts"), py::arg("sparse_tensors"), py::arg("select_indices")
94
+ );
95
+
96
+ m.def("text_region_grouping", &text_region_grouping, "Clusters all of the text into lines and phrases",
97
+ py::arg("quads"), py::arg("counts"),
98
+ py::arg("horizontal_tolerance") = 2.0f,
99
+ py::arg("vertical_tolerance") = 0.5f,
100
+ py::arg("verbose") = false
101
+ );
102
+
103
+ m.def("dense_relations_to_graph", &dense_relations_to_graph, "Converts a dense relational tensor to a graph",
104
+ py::arg("relations")
105
+ );
106
+
107
+ m.def("ragged_quad_all_2_all_distance_v2", &ragged_quad_all_2_all_distance_v2, "get the all-to-all distances in ragged-batch quad mode",
108
+ py::arg("embed_quads"), py::arg("region_counts"),
109
+ py::arg("x_factor") = 1.0f,
110
+ py::arg("y_factor") = 1.0f,
111
+ py::arg("allow_self_distance") = true
112
+ );
113
+
114
+ m.def("calc_poly_min_rrect", &calc_poly_min_rrect, "calculate a reasonable bounding rectangle for a given text polygon",
115
+ py::arg("vertices")
116
+ );
117
+
118
+ m.def("get_rel_continuation_cos", &get_rel_continuation_cos, "c++ get relation cosine between 2 regions",
119
+ py::arg("rrect_a"), py::arg("rrect_b")
120
+ );
121
+
122
+ m.def("get_poly_bounds_quad", &get_poly_bounds_quad, "c++ get polygon bounds",
123
+ py::arg("poly")
124
+ );
125
+ }
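For reference, a minimal, self-contained sketch of two binding patterns this module leans on: exposing an opaque C++ type purely as a handle (no constructor), and a raw-pointer argument defaulted to nullptr so Python callers can pass None. The `Counter` type and functions below are hypothetical; only the pybind11 mechanics mirror the module above.

#include <torch/extension.h>
#include <memory>

// Hypothetical type used only to illustrate the binding style; not part of this library.
struct Counter { int value = 0; };

std::shared_ptr<Counter> make_counter(int start) {
    auto c = std::make_shared<Counter>();
    c->value = start;
    return c;
}

int read_counter(const Counter *c) { return c ? c->value : -1; }

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    // Exposed without an __init__, so Python can only hold and pass the handle around,
    // the same way LanguageModel and TokenMapping are exposed above.
    py::class_<Counter, std::shared_ptr<Counter>>(m, "Counter");

    m.def("make_counter", &make_counter, py::arg("start") = 0);
    // A raw-pointer default of nullptr lets Python pass None, like `lang_model` above.
    m.def("read_counter", &read_counter, py::arg("counter") = static_cast<Counter*>(nullptr));
}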
nemo-retriever-ocr/cpp/non_maximal_suppression/cpu_non_maximal_suppression.cpp ADDED
@@ -0,0 +1,209 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #include "non_maximal_suppression.h"
6
+
7
+ #include <algorithm>
8
+ #include "../geometry.h"
9
+
10
+ using namespace std;
11
+
12
+
13
+ template<typename scalar_t>
14
+ void visit_node(
15
+ const torch::TensorAccessor<scalar_t, 4> &quads,
16
+ const torch::TensorAccessor<scalar_t, 2> &probs,
17
+ const torch::TensorAccessor<int32_t, 3> &adjacency,
18
+ MergeQuad_<scalar_t> &mQuad,
19
+ unordered_set<int32_t> &visited,
20
+ int64_t r, int64_t c, int32_t vIdx)
21
+ {
22
+ if (visited.count(vIdx)) {
23
+ return;
24
+ }
25
+ visited.insert(vIdx);
26
+
27
+ int32_t *pAdj = adjacency[r][c].data();
28
+
29
+ int32_t adjCt = pAdj[0];
30
+ assert(adjCt > 0);
31
+
32
+ mQuad.Append(Quad_<scalar_t>(quads[r][c].data()), probs[r][c]);
33
+
34
+ int32_t *pOff = pAdj + 2;
35
+ int32_t *pEnd = pAdj + adjCt + 1;
36
+
37
+ const int32_t W = quads.size(1);
38
+
39
+ for (; pOff != pEnd; ++pOff) {
40
+ int32_t vIdx2 = *pOff;
41
+ int32_t r2 = vIdx2 / W;
42
+ int32_t c2 = vIdx2 % W;
43
+
44
+ visit_node(quads, probs, adjacency, mQuad, visited, r2, c2, vIdx2);
45
+ }
46
+ }
47
+
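A standalone sketch of the adjacency-row layout that `visit_node` consumes, inferred from the pointer arithmetic above: slot 0 holds the entry count and the loop starts reading at slot 2, i.e. it skips the first entry (apparently the cell's own index). The concrete values below are made up for illustration.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // A row for one grid cell: [count, self index, neighbor, neighbor]
    std::vector<int32_t> adjRow = { 3, 17, 42, 58 };

    const int32_t *pAdj = adjRow.data();
    const int32_t adjCt = pAdj[0];
    assert(adjCt > 0);

    const int32_t *pOff = pAdj + 2;          // skip the count and the first (self) entry
    const int32_t *pEnd = pAdj + adjCt + 1;  // one past the last entry

    for (; pOff != pEnd; ++pOff) {
        std::cout << "visit neighbor " << *pOff << "\n";  // prints 42, then 58
    }
    return 0;
}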
48
+ template<typename scalar_t>
49
+ std::vector<torch::Tensor> quad_nms_from_adjacency_impl(
50
+ const torch::TensorAccessor<scalar_t, 5> &quads,
51
+ const torch::TensorAccessor<scalar_t, 3> &probs,
52
+ const torch::TensorAccessor<int32_t, 4> &adjacency,
53
+ scalar_t probThreshold, scalar_t iouThreshold,
54
+ int64_t maxRegions)
55
+ {
56
+ const uint64_t B = quads.size((int)0);
57
+ const int64_t H = quads.size((int)1);
58
+ const int64_t W = quads.size((int)2);
59
+
60
+ typedef MergeQuad_<scalar_t> MQuad;
61
+ typedef EmbedQuad_<scalar_t> EFQuad;
62
+
63
+ vector<vector<EFQuad>> batchQuads{ static_cast< const unsigned int >( B ) };
64
+ vector<vector<EFQuad>> allQuads{ static_cast< const unsigned int >( B ) };
65
+ vector<vector<vector<size_t>>> batchAdjIdxs{ static_cast< const unsigned int >( B ) };
66
+
67
+ #pragma omp parallel num_threads (8)
68
+ {
69
+ #pragma omp for
70
+ for (int64_t b = 0; b < B; ++b) {
71
+ unordered_set<int32_t> visited;
72
+
73
+ for (int64_t r = 0; r < H; ++r) {
74
+ for (int64_t c = 0; c < W; ++c) {
75
+ auto currProb = probs[b][r][c];
76
+
77
+ if (currProb < probThreshold) {
78
+ continue;
79
+ }
80
+
81
+ int32_t vIdx = r * W + c;
82
+
83
+ // Ensure that this quad hasn't already been merged
84
+ if (visited.count(vIdx)) {
85
+ continue;
86
+ }
87
+
88
+ MQuad mQuad{ZeroInitTag{}};
89
+ visit_node(quads[b], probs[b], adjacency[b], mQuad, visited, r, c, vIdx);
90
+
91
+ batchQuads[b].push_back(mQuad.Commit());
92
+ }
93
+ }
94
+ }
95
+
96
+ #pragma omp single
97
+ {
98
+ for (size_t b = 0; b < B; ++b) {
99
+ size_t numQuads = batchQuads[b].size();
100
+ batchAdjIdxs[b].resize(numQuads);
101
+ for (int64_t n = 0; n < numQuads; ++n) {
102
+ #pragma omp task default(none) shared(batchAdjIdxs, batchQuads, iouThreshold) firstprivate(b, numQuads, n)
103
+ {
104
+ for (int64_t m = n + 1; m < numQuads; ++m) {
105
+ vector<size_t> &adjIdxs = batchAdjIdxs[b][n];
106
+ vector<EFQuad> &quads = batchQuads[b];
107
+ auto iou = quads[n].IOU(quads[m]);
108
+
109
+ if (iou > iouThreshold) {
110
+ adjIdxs.push_back(m);
111
+ }
112
+ }
113
+ }
114
+ }
115
+ }
116
+
117
+ #pragma omp taskwait
118
+ }
119
+
120
+ #pragma omp for
121
+ for (int64_t batchIdx = 0; batchIdx < B; ++batchIdx) {
122
+ vector<vector<size_t>> &adjIdxs = batchAdjIdxs[batchIdx];
123
+ vector<EFQuad> &quads = batchQuads[batchIdx];
124
+ vector<EFQuad> &finalQuads = allQuads[batchIdx];
125
+
126
+ // Step 3: Using depth first search, merge the regions
127
+ unordered_set<size_t> visited;
128
+ for (int64_t n = 0; n < quads.size(); ++n) {
129
+ EFQuad currQuad;
130
+ visit_node(quads, n, adjIdxs, currQuad, visited);
131
+
132
+ if (currQuad.NumQuads > 0) {
133
+ currQuad.Prepare();
134
+
135
+ finalQuads.push_back(currQuad);
136
+ }
137
+ }
138
+
139
+ // Only sort the part that we want to keep
140
+ partial_sort(begin(finalQuads),
141
+ begin(finalQuads) + std::min<int64_t>(finalQuads.size(), maxRegions),
142
+ end(finalQuads),
143
+ [] (auto a, auto b) {
144
+ return a.Confidence > b.Confidence;
145
+ }
146
+ );
147
+
148
+ // Truncate the low confidence regions
149
+ if (finalQuads.size() > maxRegions) {
150
+ finalQuads.resize(maxRegions);
151
+ }
152
+
153
+ //cout << "Ex " << batchIdx << " quads:" << endl << finalQuads << endl << endl;
154
+ }
155
+
156
+ } // End parallel
157
+
158
+ int64_t numOutQuads = 0;
159
+ for (int64_t batchIdx = 0; batchIdx < B; ++batchIdx) {
160
+ numOutQuads += allQuads[batchIdx].size();
161
+ }
162
+
163
+ // Step 4: Convert the quads into tensor representation
164
+ auto outQuadTensor = torch::empty({ numOutQuads, 4, 2 }, torch::kFloat32);
165
+ auto outConfTensor = torch::empty({ numOutQuads }, torch::kFloat32);
166
+ torch::Tensor outCountTensor = torch::empty({ static_cast<int64_t>( allQuads.size() ) }, torch::kInt64);
167
+
168
+ auto outQuadAccess = outQuadTensor.accessor<float, 3>();
169
+ auto outConfAccess = outConfTensor.accessor<float, 1>();
170
+ auto outCountAccess = outCountTensor.accessor<int64_t, 1>();
171
+
172
+ int64_t offset = 0;
173
+ for (int64_t batchIdx = 0; batchIdx < allQuads.size(); ++batchIdx) {
174
+ vector<EFQuad> &exQuads = allQuads[batchIdx];
175
+
176
+ outCountAccess[batchIdx] = exQuads.size();
177
+
178
+ for (int64_t qIdx = 0; qIdx < exQuads.size(); ++qIdx, ++offset) {
179
+ copy_quad(exQuads[qIdx], outQuadAccess[offset].data());
180
+ outConfAccess[offset] = exQuads[qIdx].Confidence;
181
+ }
182
+ }
183
+
184
+ return { outQuadTensor, outConfTensor, outCountTensor };
185
+ }
186
+
187
+ std::vector<torch::Tensor> quad_nms_from_adjacency(
188
+ torch::Tensor quads, torch::Tensor probs, torch::Tensor adjacency,
189
+ float probThreshold, float iouThreshold,
190
+ int64_t maxRegions)
191
+ {
192
+ std::vector<torch::Tensor> ret;
193
+
194
+ AT_DISPATCH_FLOATING_TYPES(
195
+ quads.scalar_type(),
196
+ "quad_nms_from_adjacency",
197
+ ([&] {
198
+ ret = quad_nms_from_adjacency_impl<scalar_t>(
199
+ quads.accessor<scalar_t, 5>(),
200
+ probs.accessor<scalar_t, 3>(),
201
+ adjacency.accessor<int32_t, 4>(),
202
+ probThreshold, iouThreshold,
203
+ maxRegions
204
+ );
205
+ })
206
+ );
207
+
208
+ return ret;
209
+ }
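The function returns a ragged result: a flat quad tensor, a flat confidence tensor, and a per-example count tensor. A small illustrative helper for slicing that flat output back into per-example views might look like the following (the helper name is an assumption, not part of this library):

#include <torch/torch.h>
#include <utility>
#include <vector>

// Slice the flat (quads, confidences) output back into per-example views using the counts
// tensor. Assumes `counts` lives on CPU with dtype int64, matching the output above.
std::vector<std::pair<torch::Tensor, torch::Tensor>>
split_by_counts(torch::Tensor quads, torch::Tensor confs, torch::Tensor counts)
{
    std::vector<std::pair<torch::Tensor, torch::Tensor>> perExample;
    auto countAcc = counts.accessor<int64_t, 1>();

    int64_t offset = 0;
    for (int64_t b = 0; b < counts.size(0); ++b) {
        const int64_t n = countAcc[b];
        perExample.emplace_back(quads.narrow(0, offset, n), confs.narrow(0, offset, n));
        offset += n;
    }
    return perExample;
}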
nemo-retriever-ocr/cpp/non_maximal_suppression/cuda_non_maximal_suppression.cu ADDED
@@ -0,0 +1,1720 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #include "non_maximal_suppression.h"
6
+
7
+ #include <cooperative_groups.h>
8
+ #include <cooperative_groups/reduce.h>
9
+
10
+ #include <thrust/binary_search.h>
11
+ #include <thrust/device_vector.h>
12
+ #include <thrust/execution_policy.h>
13
+
14
+ #include <c10/cuda/CUDACachingAllocator.h>
15
+
16
+ #include <trove/ptr.h>
17
+
18
+ #include "../cuda_intellisense.cuh"
19
+ #include "../geometry.h"
20
+ #include "../common.h"
21
+ #include "../scope_timer.h"
22
+ #include "strided_quad.h"
23
+
24
+ // If this flag is turned on, then a bunch of checks will be inserted to ensure that the same results are produced by
25
+ // successive calls to NMS. Note that enabling it makes the library unusable outside of a debug context, so beware!
26
+ //#define NMS_VERIFY_CORRECTNESS
27
+
28
+ namespace cg = cooperative_groups;
29
+ namespace ix = torch::indexing;
30
+
31
+ inline
32
+ void print_tensor_stats2(const std::string &msg, const torch::Tensor& tensor) {
33
+
34
+ auto fTensor = tensor.to(torch::kDouble).cpu();
35
+
36
+ std::stringstream ss;
37
+ if (tensor.numel() > 1) {
38
+ ss << msg << " Size: " << tensor.sizes() << " Type: " << tensor.dtype() << " Device: " << tensor.device() << " Max: " << fTensor.max().item<double>() << " Min: " << fTensor.min().item<double>() << " Mean: " << fTensor.mean().item<double>() << " Std: " << fTensor.std().item<double>();
39
+ }
40
+ else if (tensor.numel() == 1) {
41
+ ss << msg << " Size: " << tensor.sizes() << " Type: " << tensor.dtype() << " Device: " << tensor.device() << " Value: " << fTensor.item<double>() << std::endl;
42
+ }
43
+ else {
44
+ ss << msg << " Size: " << tensor.sizes() << " Type: " << tensor.dtype() << " Device: " << tensor.device() << std::endl;
45
+ }
46
+ std::cout << ss.str() << std::endl;
47
+ }
48
+
49
+ inline
50
+ void print_tensor_vec_stats2(std::string msg, const std::vector<torch::Tensor>& tensorVec) {
51
+ std::cout << msg << " Size: " << tensorVec.size() << std::endl;
52
+ std::stringstream ss;
53
+ msg = " - ";
54
+ for (int i = 0; i < tensorVec.size(); ++i) {
55
+ ss << msg << "[" << i << "]:";
56
+ auto tensor = tensorVec[i];
57
+ print_tensor_stats2(ss.str(), tensor);
58
+ ss.str("");
59
+ }
60
+ }
61
+
62
+ std::ostream &operator<<(std::ostream &os, dim3 d)
63
+ {
64
+ return os << "(" << d.x << ", " << d.y << ", " << d.z << ")";
65
+ }
66
+
67
+ #define ADD_OP2(vector2_t) __device__ \
68
+ vector2_t operator+(const vector2_t &a, const vector2_t &b) { \
69
+ return { a.x + b.x, a.y + b.y }; \
70
+ }
71
+ ADD_OP2(float2);
72
+ ADD_OP2(double2);
73
+ #undef ADD_OP2
74
+
75
+ #define ADD_OP4(vector4_t) __device__ \
76
+ vector4_t operator+(const vector4_t &a, const vector4_t &b) { \
77
+ return { a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w }; \
78
+ }
79
+ ADD_OP4(float4);
80
+ ADD_OP4(double4);
81
+ #undef ADD_OP4
82
+
83
+ template<typename T, size_t Size>
84
+ __device__
85
+ std::array<T, Size> operator+(const std::array<T, Size> &a, const std::array<T, Size> &b) {
86
+ std::array<T, Size> ret;
87
+ #pragma unroll
88
+ for (size_t i = 0; i < Size; ++i) {
89
+ ret._Elems[i] = a._Elems[i] + b._Elems[i];
90
+ }
91
+ return ret;
92
+ }
93
+
94
+ #if __CUDA_ARCH__ >= 800
95
+ #define __reduce_add_full_warp(val) __reduce_add_sync(0xFFFFFFFF, val)
96
+ #define __reduce_max_full_warp(val) __reduce_max_sync(0xFFFFFFFF, val)
97
+ #define __reduce_min_full_warp(val) __reduce_min_sync(0xFFFFFFFF, val)
98
+ #else
99
+ #define __reduce_add_full_warp(val) cg::reduce(cg::tiled_partition<32>(cg::this_thread_block()), val, cg::plus<decltype(val)>())
100
+ #define __reduce_max_full_warp(val) cg::reduce(cg::tiled_partition<32>(cg::this_thread_block()), val, cg::greater<decltype(val)>())
101
+ #define __reduce_min_full_warp(val) cg::reduce(cg::tiled_partition<32>(cg::this_thread_block()), val, cg::less<decltype(val)>())
102
+ #endif
103
+
104
+ template<typename T>
105
+ struct TToVec;
106
+ template<>
107
+ struct TToVec<float> { typedef float2 type2; typedef float4 type4; };
108
+ template<>
109
+ struct TToVec<double> { typedef double2 type2; typedef double4 type4; };
110
+
111
+ template<typename T, typename accessor_t>
112
+ __device__
113
+ void write_embed_quad(accessor_t &acc, const MergeQuad_<T> &quad, int64_t storeOff)
114
+ {
115
+ constexpr auto EMBED_QUAD_SIZE = sizeof(EmbedQuad_<T>) / sizeof(T);
116
+ static_assert(EMBED_QUAD_SIZE == 10, "Unsupported embed quad size!");
117
+
118
+ const T *mergeBuff = reinterpret_cast<const T*>(&quad);
119
+
120
+ const T confidence = quad.Confidence;
121
+ const auto i = threadIdx.x;
122
+
123
+ if (i >= 10) {
124
+ return;
125
+ }
126
+
127
+ T outVal;
128
+ // Coordinates
129
+ if (i < 8) {
130
+ outVal = mergeBuff[i] / confidence;
131
+ // Confidence
132
+ } else if (i == 8) {
133
+ outVal = confidence / mergeBuff[9];
134
+ // NumQuads
135
+ } else {
136
+ outVal = mergeBuff[9];
137
+ }
138
+
139
+ acc[i][storeOff] = outVal;
140
+ }
141
+
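The normalization above implies the accumulator layout: eight confidence-weighted coordinate sums, then the summed confidence, then the number of merged quads. A host-side sketch of that accumulate/finalize pattern (an assumption inferred from the code, not the actual MergeQuad_ definition):

#include <array>
#include <cstddef>

struct MergeQuadSketch {
    std::array<float, 8> weightedCoords{};  // sum of conf * (x, y) over the merged corners
    float sumConf = 0.f;                    // sum of the merged confidences
    float numQuads = 0.f;                   // how many quads were merged in

    void append(const std::array<float, 8> &coords, float conf) {
        for (std::size_t i = 0; i < 8; ++i) weightedCoords[i] += conf * coords[i];
        sumConf += conf;
        numQuads += 1.f;
    }

    // Mirrors the normalization in write_embed_quad: coordinates become a confidence-weighted
    // mean, and the reported confidence is the mean confidence over the merged quads.
    std::array<float, 10> finalize() const {
        std::array<float, 10> out{};
        for (std::size_t i = 0; i < 8; ++i) out[i] = weightedCoords[i] / sumConf;
        out[8] = sumConf / numQuads;
        out[9] = numQuads;
        return out;
    }
};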
142
+
143
+ template<typename group_t, typename ...Args>
144
+ __device__
145
+ void ordered_print(group_t &group, const char *const fmt, const Args& ...args)
146
+ {
147
+ for (uint32_t i = 0; i < group.size(); ++i) {
148
+ if (group.thread_rank() == i) {
149
+ printf(fmt, args...);
150
+ }
151
+ group.sync();
152
+ }
153
+ }
154
+
155
+ template<typename T>
156
+ __global__
157
+ void device_row_collapse(torch::PackedTensorAccessor64<T, 5> allQuads,
158
+ torch::PackedTensorAccessor64<T, 3> allConfs,
159
+ T confThreshold, T iouThreshold,
160
+ torch::PackedTensorAccessor64<int32_t, 1> allOutCounts,
161
+ torch::PackedTensorAccessor64<T, 3> allOutEmbedQuads
162
+ #ifdef NMS_VERIFY_CORRECTNESS
163
+ , torch::PackedTensorAccessor64<int32_t, 2> allOutIds
164
+ #endif
165
+ )
166
+ {
167
+ typedef InPlaceQuad_<T> Quadf;
168
+ static_assert(sizeof(Quadf) == sizeof(T) * 8, "Invalid QuadMem size!");
169
+
170
+ constexpr uint32_t ALL_MASK = 0xFFFFFFFF;
171
+ constexpr uint32_t WARP_SIZE = 32;
172
+ constexpr T MIN_VALID_AREA = 8;
173
+
174
+ const uint32_t B = allQuads.size(0);
175
+ const uint32_t H = allQuads.size(1);
176
+
177
+ const uint32_t b = blockIdx.z;
178
+ const uint32_t r = blockIdx.y * blockDim.y + threadIdx.y;
179
+
180
+ if (r >= H) {
181
+ return;
182
+ }
183
+
184
+ #define threadRank threadIdx.x
185
+
186
+ auto rawQuads = reinterpret_cast<Quadf*>(allQuads[b][r].data());
187
+ #if defined(NDEBUG)
188
+ trove::coalesced_ptr<Quadf> quads(rawQuads);
189
+ #else
190
+ auto quads = rawQuads;
191
+ #endif
192
+
193
+ auto confs = allConfs[b][r];
194
+
195
+ T conf = confs[threadRank];
196
+
197
+ bool quadValid = conf >= confThreshold;
198
+ uint32_t ballot = __ballot_sync(ALL_MASK, quadValid);
199
+
200
+ // No valid quads in this window, so we're done!
201
+ if (ballot == 0) {
202
+ return;
203
+ }
204
+
205
+ const Quadf currQuad = quads[threadRank];
206
+
207
+ const T qArea = currQuad.Area();
208
+
209
+ quadValid = quadValid && qArea > MIN_VALID_AREA;
210
+ ballot = __ballot_sync(ALL_MASK, quadValid);
211
+ if (ballot == 0) {
212
+ return;
213
+ }
214
+ if (! quadValid) {
215
+ conf = 0;
216
+ }
217
+
218
+ MergeQuad_<T> qAccum{ZeroInitTag{}};
219
+
220
+ Quadf prevQuad;
221
+ auto pCurrQuad = reinterpret_cast<const T*>(&currQuad);
222
+ auto pPrevQuad = reinterpret_cast<T*>(&prevQuad);
223
+ #pragma unroll
224
+ for (uint32_t i = 0; i < 8; ++i) {
225
+ pPrevQuad[i] = __shfl_up_sync(ALL_MASK, pCurrQuad[i], 1);
226
+ }
227
+ T prevConf = __shfl_up_sync(ALL_MASK, conf, 1);
228
+
229
+ if (threadRank == 0) {
230
+ prevConf = 0;
231
+ }
232
+
233
+ bool iouValid = false;
234
+ T iou = 0;
235
+ if (quadValid) {
236
+ qAccum.Append(currQuad, conf);
237
+
238
+ if (prevConf >= confThreshold) {
239
+ iou = prevQuad.IOU_UpperBound(currQuad);
240
+ if (iou >= iouThreshold) {
241
+ iouValid = true;
242
+ }
243
+ }
244
+ }
245
+
246
+ // This is the start of a span if the current confidence is above threshold, but the quad to the left is either below threshold,
247
+ // or the IOU between the quads is below threshold
248
+ const bool isStartOfSpan = quadValid && !iouValid;
249
+
250
+ uint32_t label = isStartOfSpan;
251
+ // All labels start out as 0 or 1, and we'll then do a cumsum over the warp, which gives each thread an assigned label
252
+ // We also know that the final thread also contains the number of labels.
253
+ #pragma unroll
254
+ for (uint32_t offset = 1; offset < 32; offset <<= 1) {
255
+ auto inc = __shfl_up_sync(ALL_MASK, label, offset);
256
+ if (threadRank >= offset) {
257
+ label += inc;
258
+ }
259
+ }
260
+
261
+ // Before we zero out invalid labels, get the total number of labels
262
+ const uint32_t numLabels = __shfl_sync(ALL_MASK, label, WARP_SIZE - 1);
263
+
264
+ // Zero out the label if the current quad isn't valid
265
+ label = quadValid ? label : 0;
266
+
267
+ T* accumPtr = reinterpret_cast<T*>(&qAccum);
268
+ // Reduce all of the quads s.t. the left-most position in the span contains the full quad.
269
+ // We use `label` to decide whether to do the accumulation
270
+ #pragma unroll
271
+ for (uint32_t offset = 1; offset < 32; offset <<= 1) {
272
+ const auto otherLabel = __shfl_down_sync(ALL_MASK, label, offset);
273
+
274
+ // Regardless of whether the labels match, all threads in the warp must make the shfl_down
275
+ // call. So we use factor to modulate whether the given merge is valid
276
+ const T factor = otherLabel == label && offset + threadRank < WARP_SIZE ? 1.0f : 0.0f;
277
+
278
+ #pragma unroll
279
+ for (uint32_t i = 0; i < 10; ++i) {
280
+ accumPtr[i] += factor * __shfl_down_sync(ALL_MASK, accumPtr[i], offset);
281
+ }
282
+ }
283
+
284
+ // Elect thread-0 to figure out where to store the results
285
+ uint32_t storeOff = 0;
286
+ if (threadRank == 0) {
287
+ storeOff = atomicAdd(&allOutCounts[b], numLabels);
288
+ }
289
+ // Broadcast that offset to the whole warp
290
+ storeOff = __shfl_sync(ALL_MASK, storeOff, 0);
291
+
292
+ auto outEmbedQuads = allOutEmbedQuads[b];
293
+ // Now write out each quad, but collectively
294
+ for (uint32_t procLabel = 1; procLabel <= numLabels; ++procLabel) {
295
+ // Discover the index of the start of each label span
296
+ ballot = __ballot_sync(ALL_MASK, procLabel == label);
297
+ // ffs will find the (1-based) index of the least significant bit in ballot.
298
+ // This just so happens to be the start of the span for the current label
299
+ uint32_t startIdx = __ffs(ballot) - 1;
300
+
301
+ const T* inT = reinterpret_cast<T*>(&qAccum);
302
+ MergeQuad_<T> outQuad;
303
+ T* outT = reinterpret_cast<T*>(&outQuad);
304
+ #pragma unroll
305
+ for (uint32_t i = 0; i < 10; ++i) {
306
+ outT[i] = __shfl_sync(ALL_MASK, inT[i], startIdx);
307
+ }
308
+
309
+ write_embed_quad(outEmbedQuads, outQuad, storeOff + procLabel - 1);
310
+ #ifdef NMS_VERIFY_CORRECTNESS
311
+ if (threadRank == 0) {
312
+ allOutIds[b][storeOff + procLabel - 1] = r * 32 + startIdx;
313
+ }
314
+ #endif
315
+ }
316
+
317
+ if (threadRank == 0) {
318
+ // Increment the total number of quads by the number encountered on this row
319
+ atomicAdd(&allOutCounts[B], numLabels);
320
+ }
321
+
322
+ #undef threadRank
323
+ }
324
+
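The label assignment above is a textbook warp-level inclusive scan built from `__shfl_up_sync`. A standalone sketch of just that primitive, stripped of the NMS context:

#include <cstdint>

// One warp, one block: turn per-lane 0/1 flags into inclusive span labels,
// exactly as done with `label` above. Lane 31 ends up holding the total span count.
__global__ void warp_inclusive_scan_demo(const uint32_t *flags, uint32_t *labels)
{
    const uint32_t lane = threadIdx.x;   // assumes blockDim.x == 32
    uint32_t label = flags[lane];

    #pragma unroll
    for (uint32_t offset = 1; offset < 32; offset <<= 1) {
        const uint32_t inc = __shfl_up_sync(0xFFFFFFFF, label, offset);
        if (lane >= offset) {
            label += inc;
        }
    }

    labels[lane] = label;
}

// Example launch: warp_inclusive_scan_demo<<<1, 32>>>(d_flags, d_labels);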
325
+ template<bool IsSingleExample, typename T>
326
+ __global__
327
+ void device_a2a_adjacency_sparse(const uint64_t punCounts,
328
+ T iouThreshold,
329
+ torch::PackedTensorAccessor64<T, 3> embedQuads,
330
+ torch::PackedTensorAccessor64<bool, 2> outIsStart,
331
+ torch::PackedTensorAccessor64<int32_t, 2> outAdjCounts,
332
+ torch::PackedTensorAccessor64<int32_t, 3> outSparseAdj)
333
+ {
334
+ const uint32_t b = blockIdx.y;
335
+
336
+ const int32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
337
+
338
+ const int32_t jobIdx = blockIdx.x * blockDim.x + threadIdx.x;
339
+ const int32_t row = jobIdx / quadCt;
340
+ const int32_t col = jobIdx % quadCt;
341
+
342
+ // Only compute the upper triangular portion of the matrix
343
+ if (row >= quadCt || col < row) {
344
+ return;
345
+ }
346
+
347
+ T* exData = IsSingleExample ? embedQuads.data() : embedQuads[b].data();
348
+
349
+ const auto qRow = StridedEmbedQuad_<T>{ exData + row * embedQuads.stride(2), embedQuads.stride(1) }.Bounds(),
350
+ qCol = StridedEmbedQuad_<T>{ exData + col * embedQuads.stride(2), embedQuads.stride(1) }.Bounds();
351
+
352
+ T pctRow, pctCol, iou;
353
+ thrust::tie(pctRow, pctCol, iou) = geometry_region_sizes(qRow, qCol);
354
+
355
+ auto warpGroup = cg::tiled_partition<32>(cg::this_thread_block());
356
+
357
+ auto rowGroup = cg::labeled_partition(warpGroup, row);
358
+
359
+ const bool isValid = iou >= iouThreshold;
360
+
361
+ const uint32_t ballot = rowGroup.ballot(isValid);
362
+ const uint32_t numValid = __popc(ballot);
363
+
364
+ auto exAdjCounts = outAdjCounts[b].data();
365
+
366
+ int32_t storeOff = 0;
367
+ if (numValid > 0 && rowGroup.thread_rank() == 0) {
368
+ storeOff = atomicAdd(exAdjCounts + row, numValid);
369
+ }
370
+ storeOff = rowGroup.shfl(storeOff, 0);
371
+
372
+ if (isValid) {
373
+ // This will set all of the bits below this one (the less significant bits) to 1, otherwise 0.
374
+ // We can use this to count the number of bits that are set, and are less significant than this one,
375
+ // to get the local storage offset
376
+ uint32_t lowerMask = (1 << rowGroup.thread_rank()) - 1;
377
+
378
+ storeOff += __popc(ballot & lowerMask);
379
+
380
+ outSparseAdj[b][row][storeOff] = col;
381
+ if (row != col) {
382
+ // Because `col` gets merged into `row`, we mark it as inactive for reduction purposes.
383
+ // All of the quads that `col` is adjacent to will be absorbed by `row`.
384
+ outIsStart[b][col] = false;
385
+
386
+ // Also store the transposed relation
387
+ storeOff = atomicAdd(exAdjCounts + col, 1);
388
+ outSparseAdj[b][col][storeOff] = row;
389
+ }
390
+ } else if (pctRow > 0.8f || pctCol > 0.8f) {
391
+ T anchorHeight = qRow.Height();
392
+ T otherHeight = qCol.Height();
393
+
394
+ T ratio = anchorHeight > otherHeight ?
395
+ otherHeight / anchorHeight :
396
+ anchorHeight / otherHeight;
397
+ if (ratio > 0.9f) {
398
+ if (pctRow > 0.8f) {
399
+ // Other envelops anchor
400
+ outIsStart[b][row] = false;
401
+ }
402
+ else {
403
+ outIsStart[b][col] = false;
404
+ }
405
+ }
406
+ }
407
+ }
408
+
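The write path above uses the ballot/popcount idiom: one atomicAdd reserves space for every valid lane, and each lane derives its slot by counting the set ballot bits below its own position. A whole-warp sketch of that idiom (the kernel itself applies it per labeled partition):

#include <cstdint>

// Each lane with a valid result writes into a shared output array: lane 0 reserves a block
// with one atomicAdd, and every valid lane computes its slot from the ballot bits below it.
__device__ void compact_write(bool isValid, int32_t value, int32_t *outCount, int32_t *outValues)
{
    const uint32_t FULL_MASK = 0xFFFFFFFF;
    const uint32_t lane = threadIdx.x & 0x1F;

    const uint32_t ballot = __ballot_sync(FULL_MASK, isValid);
    const uint32_t numValid = __popc(ballot);

    int32_t base = 0;
    if (lane == 0 && numValid > 0) {
        base = atomicAdd(outCount, static_cast<int32_t>(numValid));
    }
    base = __shfl_sync(FULL_MASK, base, 0);

    if (isValid) {
        const uint32_t lowerMask = (1u << lane) - 1u;          // bits strictly below this lane
        outValues[base + __popc(ballot & lowerMask)] = value;  // contiguous, duplicate-free slots
    }
}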
409
+ template<uint32_t NumWarps, bool IsSingleExample, typename T, int32_t I_CELL_SIZE>
410
+ __global__
411
+ void device_a2a_adjacency_build_grid(const uint64_t punCounts,
412
+ torch::PackedTensorAccessor64<T, 3> embedQuads,
413
+ torch::PackedTensorAccessor64<int32_t, 4> outGridCells,
414
+ torch::PackedTensorAccessor64<int32_t, 3> outQuadCells)
415
+ {
416
+ constexpr T MIN_T = std::numeric_limits<T>::min();
417
+ constexpr T MAX_T = std::numeric_limits<T>::max();
418
+ constexpr uint32_t WARP_SIZE = 32;
419
+ constexpr uint32_t BLOCK_SIZE = NumWarps * WARP_SIZE;
420
+ constexpr uint32_t FULL_WARP = 0xFFFFFFFF;
421
+ constexpr uint32_t FIRST_16_THREADS = 0x0FFFF;
422
+ constexpr T CELL_SIZE = I_CELL_SIZE;
423
+ constexpr T INV_CELL_SIZE = 1 / CELL_SIZE;
424
+
425
+ const uint32_t b = blockIdx.z;
426
+
427
+ const uint32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
428
+ const uint32_t quadIdx = blockIdx.y;
429
+
430
+ if (!IsSingleExample && quadIdx >= quadCt) {
431
+ return;
432
+ }
433
+
434
+ const uint32_t threadRank = threadIdx.x;
435
+ const uint32_t localThreadRank = threadRank & 0x1F;
436
+
437
+ auto exQuads = embedQuads[b];
438
+
439
+ const uint32_t numCells[2] = { outGridCells.size(2), outGridCells.size(1) };
440
+
441
+ const uint32_t numRows = outGridCells.size(1);
442
+ const uint32_t numCols = outGridCells.size(2);
443
+
444
+ // We use flip so that we can compute min and max simultaneously.
445
+ // The first 8 threads compute the min, the next 8 compute the max
446
+ T sign = localThreadRank < 8 ? 1.0f : -1.0f;
447
+ T myVal = sign * (localThreadRank < 16 ? exQuads[localThreadRank & 0x7][quadIdx] : MIN_T);
448
+ #pragma unroll
449
+ for (uint32_t offset = 2; offset < 8; offset <<= 1) {
450
+ T nextVal = __shfl_down_sync(FIRST_16_THREADS, myVal, offset);
451
+ myVal = min(myVal, nextVal);
452
+ }
453
+ const uint32_t cellVal = max(0.0f, sign * INV_CELL_SIZE * myVal);
454
+
455
+ uint32_t minCell[2] = { __shfl_sync(FULL_WARP, cellVal, 0), __shfl_sync(FULL_WARP, cellVal, 1) },
456
+ maxCell[2] = { __shfl_sync(FULL_WARP, cellVal, 8), __shfl_sync(FULL_WARP, cellVal, 9) };
457
+
458
+ #pragma unroll
459
+ for (uint32_t i = 0; i < 2; ++i) {
460
+ maxCell[i] = min(numCells[i] - 1, maxCell[i]);
461
+ }
462
+
463
+ const uint32_t sizes[2] = { maxCell[0] - minCell[0] + 1, maxCell[1] - minCell[1] + 1 };
464
+
465
+ const uint32_t totalCells = sizes[0] * sizes[1];
466
+
467
+ auto exGridCells = outGridCells[b];
468
+
469
+ for (uint32_t i = threadRank; i < totalCells; i += BLOCK_SIZE) {
470
+ uint32_t row = minCell[1] + i / sizes[0];
471
+ uint32_t col = minCell[0] + i % sizes[0];
472
+
473
+ int32_t *pCell = exGridCells[row][col].data();
474
+
475
+ // The first value in the array is the count, and the rest are the quad indices
476
+ int32_t storeOff = atomicAdd(pCell, 1) + 1;
477
+ pCell[storeOff] = quadIdx;
478
+ }
479
+
480
+ if (threadRank < 2) {
481
+ outQuadCells[b][quadIdx][threadRank] = minCell[threadRank];
482
+ } else if (threadRank < 4) {
483
+ outQuadCells[b][quadIdx][threadRank] = maxCell[threadRank - 2];
484
+ }
485
+ }
486
+
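A host-side sketch of the spatial binning this kernel performs: a quad's bounding box is mapped to the range of fixed-size grid cells it overlaps, and the quad index is appended to every such cell. The grid shape, cell size, and flat cell storage below are illustrative assumptions; the kernel stores each cell as a [count | entries...] array and appends with atomicAdd.

#include <algorithm>
#include <cstdint>
#include <vector>

// Append `quadIdx` to every grid cell its axis-aligned bounding box overlaps.
void bin_quad(int32_t quadIdx, float minX, float minY, float maxX, float maxY,
              float cellSize, int32_t numCols, int32_t numRows,
              std::vector<std::vector<int32_t>> &cells)   // numRows * numCols cell lists
{
    auto toCell = [&](float v, int32_t numCells) {
        return std::min(numCells - 1, std::max(0, static_cast<int32_t>(v / cellSize)));
    };
    const int32_t c0 = toCell(minX, numCols), c1 = toCell(maxX, numCols);
    const int32_t r0 = toCell(minY, numRows), r1 = toCell(maxY, numRows);

    for (int32_t r = r0; r <= r1; ++r) {
        for (int32_t c = c0; c <= c1; ++c) {
            cells[r * numCols + c].push_back(quadIdx);
        }
    }
}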
487
+ typedef uint8_t visit_mask_t;
488
+
489
+ template<uint32_t NumWarps, bool IsSingleExample, typename T>
490
+ __global__
491
+ void device_a2a_adjacency_with_grid(const uint64_t punCounts,
492
+ T iouThreshold,
493
+ torch::PackedTensorAccessor64<T, 3> allEmbedQuads,
494
+ torch::PackedTensorAccessor64<int32_t, 4> allCells,
495
+ torch::PackedTensorAccessor64<int32_t, 3> allQuadExtents,
496
+ torch::PackedTensorAccessor64<bool, 2> outIsStart,
497
+ torch::PackedTensorAccessor64<int32_t, 2> outAdjCounts,
498
+ torch::PackedTensorAccessor64<int32_t, 3> outSparseAdj)
499
+ {
500
+ constexpr T MIN_T = std::numeric_limits<T>::min();
501
+ constexpr T MAX_T = std::numeric_limits<T>::max();
502
+ constexpr uint32_t WARP_SIZE = 32;
503
+ constexpr uint32_t BLOCK_SIZE = NumWarps * WARP_SIZE;
504
+
505
+ const uint32_t b = blockIdx.z;
506
+
507
+ const uint32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
508
+ const uint32_t quadIdx = blockIdx.y;
509
+
510
+ if (!IsSingleExample && quadIdx >= quadCt) {
511
+ return;
512
+ }
513
+
514
+ const uint32_t threadRank = threadIdx.x;
515
+
516
+ auto exQuads = allEmbedQuads[b];
517
+
518
+ __shared__ T s_quadVerts[8];
519
+ __shared__ uint32_t s_quadExtent[4];
520
+ extern __shared__ uint32_t s_alreadyVisited[];
521
+
522
+ if (threadRank < 8) {
523
+ s_quadVerts[threadRank] = exQuads[threadRank][quadIdx];
524
+ } else if (threadRank < 12) {
525
+ s_quadExtent[threadRank - 8] = reinterpret_cast<uint32_t*>(allQuadExtents[b][quadIdx].data())[threadRank - 8];
526
+ }
527
+
528
+ uint32_t zeroTerm = (quadCt + 31u) >> 5u; // Fast version of div_up(quadCt, 32)
529
+ for (uint32_t col = threadRank; col < zeroTerm; col += BLOCK_SIZE) {
530
+ s_alreadyVisited[col] = 0;
531
+ }
532
+
533
+ __syncthreads();
534
+
535
+ auto exCells = allCells[b];
536
+ auto exAdjCounts = reinterpret_cast<uint32_t*>(outAdjCounts[b].data());
537
+ auto exAdjValues = outSparseAdj[b][quadIdx].data();
538
+
539
+ T *exData = IsSingleExample ? allEmbedQuads.data() : allEmbedQuads[b].data();
540
+
541
+ const auto bdsAnchor = Quad_<T>{ s_quadVerts }.Bounds();
542
+
543
+ const uint32_t startCol = s_quadExtent[0],
544
+ endCol = s_quadExtent[2];
545
+ for (uint32_t row = s_quadExtent[1], endRow = s_quadExtent[3]; row <= endRow; ++row) {
546
+ auto rowCells = exCells[row];
547
+
548
+ for (uint32_t col = startCol; col <= endCol; ++col) {
549
+ auto colCells = reinterpret_cast<const uint32_t*>(rowCells[col].data());
550
+
551
+ const uint32_t ct = colCells[0];
552
+
553
+ for (uint32_t i = threadRank + 1; i <= ct; i += BLOCK_SIZE) {
554
+ const uint32_t otherIdx = colCells[i];
555
+
556
+ const uint32_t maskIdx = otherIdx >> 5; // Divide by 32, since there are 32 bits per mask slot
557
+ const uint32_t maskBit = 1 << (otherIdx & 0x1F); // Set the relevant bit for this mask ID
558
+
559
+ const bool alreadyVisited = atomicOr(s_alreadyVisited + maskIdx, maskBit) & maskBit;
560
+
561
+ if (!alreadyVisited) {
562
+ const auto bdsOther = StridedEmbedQuad_<T>{ exData + otherIdx * allEmbedQuads.stride(2), allEmbedQuads.stride(1) }.Bounds();
563
+
564
+ T pctAnchor, pctOther, iou;
565
+ thrust::tie(pctAnchor, pctOther, iou) = geometry_region_sizes(bdsAnchor, bdsOther);
566
+
567
+ if (iou >= iouThreshold) {
568
+ auto validGroup = cg::coalesced_threads();
569
+
570
+ uint32_t storeOff = 0;
571
+ if (validGroup.thread_rank() == 0) {
572
+ storeOff = atomicAdd(exAdjCounts + quadIdx, validGroup.size());
573
+ }
574
+ storeOff = validGroup.shfl(storeOff, 0) + validGroup.thread_rank();
575
+
576
+ exAdjValues[storeOff] = otherIdx;
577
+
578
+ if (otherIdx > quadIdx) {
579
+ outIsStart[b][otherIdx] = false;
580
+ }
581
+ } else if (pctAnchor > 0.8f || pctOther > 0.8f) {
582
+ T anchorHeight = bdsAnchor.Height();
583
+ T otherHeight = bdsOther.Height();
584
+
585
+ T ratio = anchorHeight > otherHeight ?
586
+ otherHeight / anchorHeight :
587
+ anchorHeight / otherHeight;
588
+ if (ratio > 0.9f) {
589
+ if (pctAnchor > 0.8f) {
590
+ // Other envelops anchor
591
+ outIsStart[b][quadIdx] = false;
592
+ } else {
593
+ outIsStart[b][otherIdx] = false;
594
+ }
595
+ }
596
+ }
597
+ }
598
+ }
599
+ }
600
+ }
601
+ }
602
+
603
+ template<bool IsSingleExample>
604
+ __global__
605
+ void device_flatten_graph_iterative(const uint64_t punCounts,
606
+ torch::PackedTensorAccessor64<bool, 2> allIsStart,
607
+ volatile uint32_t *allAdjCounts,
608
+ volatile uint32_t *allAdjValues
609
+ #ifdef NMS_VERIFY_CORRECTNESS
610
+ , int32_t *maxDepth
611
+ #endif
612
+ )
613
+ {
614
+ constexpr uint32_t WARP_SIZE = 32;
615
+ constexpr uint32_t VISIT_STACK_SIZE = 9;
616
+ constexpr uint32_t TERM_VALUE = std::numeric_limits<uint32_t>::max();
617
+
618
+ constexpr visit_mask_t VISITED_MASK = 0b001;
619
+ constexpr visit_mask_t ADDED_MASK = 0b010;
620
+ constexpr visit_mask_t QUEUED_MASK = 0b100;
621
+ constexpr visit_mask_t QUEUED_OR_VISITED_MASK = VISITED_MASK | QUEUED_MASK;
622
+
623
+ const uint32_t b = blockIdx.z;
624
+ const uint32_t anchorRow = blockIdx.y;
625
+
626
+ const uint32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
627
+
628
+ // Only need to check this if there are multiple examples, since in the case of a single example,
629
+ // the grid is precisely sized to that quadCt
630
+ if constexpr (!IsSingleExample) {
631
+ if (anchorRow >= quadCt) {
632
+ return;
633
+ }
634
+ }
635
+
636
+ auto isStart = allIsStart[b].data();
637
+
638
+ const uint32_t threadRank = threadIdx.x;
639
+
640
+ extern __shared__ visit_mask_t s_visitedMask[];
641
+
642
+ #ifndef NMS_VERIFY_CORRECTNESS
643
+ // Only need to process the anchor rows, since they're the only ones
644
+ // that will make it through the full NMS operation.
645
+ // NOTE: There's a race condition where some rows may be marked as anchor,
646
+ // but they'll later be marked non-anchor over the course of this kernel.
647
+ // That's fine. It's a bit of extra work, but there's no real way around it.
648
+ const bool anchorIsStart = isStart[anchorRow];
649
+ if (!anchorIsStart) {
650
+ return;
651
+ }
652
+ #endif
653
+
654
+ uint32_t *pIntVisitedMask = reinterpret_cast<uint32_t*>(s_visitedMask);
655
+ uint32_t zeroTerm = (quadCt + 3) >> 2; // Fast version of div_up(quadCt, 4)
656
+ for (uint32_t col = threadRank; col < zeroTerm; col += blockDim.x) {
657
+ pIntVisitedMask[col] = 0;
658
+ }
659
+
660
+ __syncthreads();
661
+
662
+ const uint32_t maxExCount = allIsStart.size(1);
663
+ auto adjCounts = allAdjCounts + (b * maxExCount);
664
+ auto adjValues = allAdjValues + (b * maxExCount * maxExCount);
665
+
666
+ auto adjAnchorValues = adjValues + (anchorRow * maxExCount);
667
+ // For the anchor row, set the visited mask to 0b10, which will signify that we haven't visited it yet,
668
+ // but that the value is already in the adjacency vector.
669
+ // 0bx1 signifies that the value has been visited
670
+ for (uint32_t i = threadRank, ct = adjCounts[anchorRow]; i < ct; i += blockDim.x) {
671
+ const auto adjCol = adjAnchorValues[i];
672
+ s_visitedMask[adjCol] = ADDED_MASK;
673
+ }
674
+
675
+ __syncthreads();
676
+
677
+ if (threadRank == 0) {
678
+ s_visitedMask[anchorRow] |= QUEUED_MASK;
679
+ }
680
+
681
+ __syncthreads();
682
+
683
+ // TODO(mranzinger): Is it worth incorporating these other threads?
684
+ // It seems like the vast majority of adjacency counts are <32
685
+ if (threadRank >= WARP_SIZE) {
686
+ return;
687
+ }
688
+
689
+ uint32_t visitStack[VISIT_STACK_SIZE];
690
+ visitStack[0] = TERM_VALUE;
691
+ visitStack[1] = anchorRow;
692
+ #ifndef NDEBUG
693
+ for (uint32_t i = 2; i < VISIT_STACK_SIZE; ++i) {
694
+ visitStack[i] = -2;
695
+ }
696
+ #endif
697
+ int32_t visitPtr = 1;
698
+
699
+ while (true) {
700
+ #ifdef NMS_VERIFY_CORRECTNESS
701
+ assert(visitPtr >= 0 && visitPtr < VISIT_STACK_SIZE);
702
+ #endif
703
+ const uint32_t threadNextCol = visitStack[visitPtr];
704
+ const uint32_t warpNextCol = __reduce_min_full_warp(threadNextCol);
705
+
706
+ // Check to see if this thread got chosen.
707
+ // If so, decrement the stack counter
708
+ if (threadNextCol == warpNextCol) {
709
+ #ifndef NDEBUG
710
+ // This makes it easier to debug where the pointer is
711
+ visitStack[visitPtr] = -2;
712
+ #endif
713
+ --visitPtr;
714
+ }
715
+
716
+ // If the maximum value encountered is -1, that means that none of the threads
717
+ // had another value to process
718
+ if (warpNextCol == TERM_VALUE) {
719
+ break;
720
+ }
721
+
722
+ const uint32_t procRow = warpNextCol;
723
+
724
+ __syncthreads();
725
+
726
+ bool isAlreadyVisited = s_visitedMask[procRow] & VISITED_MASK;
727
+
728
+ if (isAlreadyVisited) {
729
+ continue;
730
+ }
731
+
732
+ const uint32_t procAdjCount = adjCounts[procRow];
733
+ auto procAdjValues = adjValues + (procRow * maxExCount);
734
+
735
+ // Offsetting by the iteration number will help balance out the maximum depth of any stack in the warp.
736
+ // The reason is that otherwise, warp-0 will always get a new element, warp-1 iff the adj graph
737
+ // has more than one element, warp-2 iff the adj graph has more than two elements, and so on. Basically,
738
+ // the warps have decreasing pressure. The rotation mechanism helps to balance out stack usage.
739
+ for (uint32_t i = threadRank; i < procAdjCount; i += WARP_SIZE) {
740
+ const uint32_t adjCol = procAdjValues[i];
741
+
742
+ // This will set the queued flag for this column, if it's not already set.
743
+ // It also returns the old state. In our case, we only want to add this value to the
744
+ // stack iff it hasn't already been visited, and hasn't been queued elsewhere
745
+ // NOTE: CUDA doesn't support atomicOr on uint8_t :(, but it's not necessary that
746
+ // the operation be absolutely atomic, so the poor man's version is probably okay
747
+ const auto oldMask = s_visitedMask[adjCol];
748
+ auto newMask = oldMask;
749
+
750
+ bool alreadyAdded = oldMask & ADDED_MASK;
751
+
752
+ auto group = cg::coalesced_threads();
753
+ const uint32_t gThreadRank = group.thread_rank();
754
+ uint32_t notAddedBallot = group.ballot(!alreadyAdded);
755
+ if (notAddedBallot) {
756
+ // Only one warp will ever be adding values to a given row, which means
757
+ // that we don't need atomics. However, other warps may be reading data
758
+ // from anchorRow, which means that we need to add the values first,
759
+ // followed by incrementing the count. This order makes things
760
+ // concurrency safe.
761
+ const uint32_t globalStoreOff = adjCounts[anchorRow];
762
+ // Gets the count of the set bits below this thread's bit
763
+ const uint32_t localStoreOff = __popc(notAddedBallot & ((1 << gThreadRank) - 1));
764
+
765
+ if (!alreadyAdded) {
766
+ adjAnchorValues[globalStoreOff + localStoreOff] = adjCol;
767
+ if (adjCol > anchorRow) {
768
+ // Also, ensure that this quad is no longer marked as a starting quad
769
+ isStart[adjCol] = false;
770
+ }
771
+ newMask |= ADDED_MASK;
772
+ }
773
+
774
+ // Finally, commit the change by incrementing the counter
775
+ if (gThreadRank == 0) {
776
+ adjCounts[anchorRow] += __popc(notAddedBallot);
777
+ }
778
+ }
779
+
780
+ bool alreadyHandled = oldMask & QUEUED_OR_VISITED_MASK;
781
+
782
+ if (!alreadyHandled) {
783
+ #ifdef NMS_VERIFY_CORRECTNESS
784
+ newMask |= QUEUED_MASK;
785
+ ++visitPtr;
786
+ assert(visitPtr < VISIT_STACK_SIZE);
787
+ atomicMax(maxDepth, visitPtr);
788
+ visitStack[visitPtr] = adjCol;
789
+ #else
790
+ // Prefer potentially inconsistent results over buffer overflow
791
+ if (visitPtr < VISIT_STACK_SIZE - 1) {
792
+ newMask |= QUEUED_MASK;
793
+ ++visitPtr;
794
+ visitStack[visitPtr] = adjCol;
795
+ }
796
+ #endif
797
+ }
798
+
799
+ if (newMask != oldMask) {
800
+ s_visitedMask[adjCol] = newMask;
801
+ }
802
+ }
803
+
804
+ // We actually rely on the `pop_next` function largely to handle recursing down into the next row
805
+ __syncthreads();
806
+ }
807
+ }
808
+
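Conceptually, this kernel flattens the adjacency graph per anchor: every quad transitively reachable from the anchor gets appended to the anchor's own adjacency list, and absorbed quads lose their lead-row flag. A single-threaded sketch of that logic (the real kernel keeps separate counts over fixed-capacity buffers and bounds its visit stack):

#include <cstddef>
#include <cstdint>
#include <vector>

// Walk the adjacency graph from `anchor` and append every transitively reachable quad to the
// anchor's own list, clearing the lead-row flag of anything it absorbs.
void flatten_anchor(int32_t anchor,
                    std::vector<std::vector<int32_t>> &adjValues,
                    std::vector<bool> &isStart)
{
    std::vector<uint8_t> added(adjValues.size(), 0), visited(adjValues.size(), 0);
    added[anchor] = 1;
    for (int32_t v : adjValues[anchor]) added[v] = 1;

    std::vector<int32_t> stack{ anchor };
    while (!stack.empty()) {
        const int32_t row = stack.back();
        stack.pop_back();
        if (visited[row]) continue;
        visited[row] = 1;

        // Index-based loop: adjValues[anchor] may grow while we are iterating it.
        for (std::size_t i = 0; i < adjValues[row].size(); ++i) {
            const int32_t col = adjValues[row][i];
            if (!added[col]) {
                added[col] = 1;
                adjValues[anchor].push_back(col);
                if (col > anchor) isStart[col] = false;  // absorbed into the anchor
            }
            if (!visited[col]) stack.push_back(col);
        }
    }
}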
809
+ void add_to_set(const torch::TensorAccessor<int32_t, 1>& adjCounts,
810
+ const torch::TensorAccessor<int32_t, 2>& adjValues,
811
+ int32_t row,
812
+ std::unordered_set<int32_t>& possible)
813
+ {
814
+ if (possible.count(row)) {
815
+ return;
816
+ }
817
+
818
+ possible.insert(row);
819
+
820
+ const int32_t adjCount = adjCounts[row];
821
+ auto values = adjValues[row].data();
822
+
823
+ for (int32_t i = 0; i < adjCount; ++i) {
824
+ const int32_t col = values[i];
825
+ add_to_set(adjCounts, adjValues, col, possible);
826
+ }
827
+ }
828
+
829
+ template<bool IsSingleExample>
830
+ void cpu_flatten_graph(const uint64_t punCounts,
831
+ torch::Tensor isStartTensorGPU,
832
+ torch::Tensor adjCountsTensorGPU,
833
+ torch::Tensor adjValuesTensorGPU)
834
+ {
835
+ auto isStartTensor = isStartTensorGPU.cpu();
836
+ auto adjCountsTensor = adjCountsTensorGPU.cpu();
837
+ auto adjValuesTensor = adjValuesTensorGPU.cpu();
838
+
839
+ auto allIsStart = isStartTensor.accessor<bool, 2>();
840
+ auto allAdjCounts = adjCountsTensor.accessor<int32_t, 2>();
841
+ auto allAdjValues = adjValuesTensor.accessor<int32_t, 3>();
842
+
843
+ for (int32_t b = 0; b < allAdjCounts.size(0); ++b) {
844
+ const int32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
845
+
846
+ for (int32_t row = 0; row < quadCt; ++row) {
847
+ std::unordered_set<int32_t> fullAdjSet;
848
+ add_to_set(allAdjCounts[b], allAdjValues[b], row, fullAdjSet);
849
+
850
+ int32_t &currCt = allAdjCounts[b][row];
851
+ int32_t *currValues = allAdjValues[b][row].data();
852
+ std::unordered_set<int32_t> existingSet{ currValues, currValues + currCt };
853
+
854
+ for (int32_t adjCol : fullAdjSet) {
855
+ if (existingSet.count(adjCol)) {
856
+ continue;
857
+ }
858
+
859
+ currValues[currCt] = adjCol;
860
+ ++currCt;
861
+
862
+ if (adjCol > row) {
863
+ allIsStart[b][adjCol] = false;
864
+ }
865
+ }
866
+ }
867
+ }
868
+
869
+ isStartTensorGPU.copy_(isStartTensor);
870
+ adjCountsTensorGPU.copy_(adjCountsTensor);
871
+ adjValuesTensorGPU.copy_(adjValuesTensor);
872
+ }
873
+
874
+
875
+ __global__
876
+ void device_a2a_adj_cleanup(const int32_t *counts,
877
+ torch::PackedTensorAccessor64<uint8_t, 3> inOutAdjacency)
878
+ {
879
+ const uint32_t b = blockIdx.y;
880
+ const uint32_t jobIdx = blockIdx.x * blockDim.x + threadIdx.x;
881
+ const uint32_t numQuads = counts[b];
882
+ const uint32_t row = jobIdx / numQuads;
883
+ const uint32_t col = jobIdx % numQuads;
884
+
885
+ if (row >= numQuads) {
886
+ return;
887
+ }
888
+
889
+ auto adjacency = inOutAdjacency[b];
890
+
891
+ bool rowPivot = adjacency[row][row] > 0;
892
+ bool colPivot = adjacency[col][col] > 0;
893
+
894
+ if (!rowPivot || !colPivot) {
895
+ adjacency[row][col] = 0;
896
+ }
897
+ }
898
+
899
+ template<uint32_t NumWarps, typename T, bool IsSingleExample>
900
+ __global__
901
+ void device_a2a_collapse(const uint64_t punCounts,
902
+ torch::PackedTensorAccessor64<T, 3> allEmbedQuads,
903
+ torch::PackedTensorAccessor64<bool, 2> allIsLeadRow,
904
+ const int64_t *regionCounts,
905
+ torch::PackedTensorAccessor64<int32_t, 2> allAdjCounts,
906
+ torch::PackedTensorAccessor64<int32_t, 3> allAdjValues,
907
+ //torch::PackedTensorAccessor64<int32_t, 2> allOutPositions,
908
+ torch::PackedTensorAccessor64<T, 3> outQuads,
909
+ T *outConf)
910
+ {
911
+ constexpr uint32_t WARP_SIZE = 32;
912
+ constexpr uint32_t FULL_WARP = 0xFFFFFFFF;
913
+ constexpr uint32_t BLOCK_WIDTH = NumWarps * WARP_SIZE;
914
+ constexpr size_t MERGE_QUAD_SIZE = sizeof(MergeQuad_<T>) / sizeof(T);
915
+
916
+ static_assert(NumWarps < WARP_SIZE, "Only a single warp currently supported!");
917
+
918
+ const uint32_t b = blockIdx.z;
919
+ const uint32_t row = blockIdx.y;
920
+
921
+ const int32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
922
+
923
+ if constexpr (!IsSingleExample) {
924
+ if (row >= quadCt) {
925
+ return;
926
+ }
927
+ }
928
+
929
+ // Only process the lead rows
930
+ const auto isLeadRow = IsSingleExample ? allIsLeadRow.data() : allIsLeadRow[b].data();
931
+ if (!isLeadRow[row]) {
932
+ return;
933
+ }
934
+
935
+ const uint32_t threadRank = threadIdx.x;
936
+ const uint32_t localThreadRank = threadRank & 0x1F;
937
+ const uint32_t warpIdx = threadRank >> 5;
938
+
939
+ __shared__ T s_mergeQuad[MERGE_QUAD_SIZE];
940
+
941
+ if constexpr (NumWarps > 1) {
942
+ if (threadRank < MERGE_QUAD_SIZE) {
943
+ s_mergeQuad[threadRank] = 0.0f;
944
+ }
945
+
946
+ __syncthreads();
947
+ }
948
+
949
+ T *exData = IsSingleExample ? allEmbedQuads.data() : allEmbedQuads[b].data();
950
+
951
+ const int32_t adjCount = allAdjCounts[b][row];
952
+ const int32_t *adjIdxs = allAdjValues[b][row].data();
953
+
954
+ MergeQuad_<T> localMerge{ZeroInitTag{}};
955
+
956
+ for (int32_t i = threadRank; i < adjCount; i += BLOCK_WIDTH) {
957
+ const int32_t currQuadIdx = adjIdxs[i];
958
+ const StridedEmbedQuad_<T> qCurr{ exData + currQuadIdx * allEmbedQuads.stride(2), allEmbedQuads.stride(1) };
959
+
960
+ localMerge.Append(qCurr);
961
+ }
962
+
963
+ T *mqV = reinterpret_cast<T*>(&localMerge);
964
+ #pragma unroll
965
+ for (uint32_t offset = 1; offset < WARP_SIZE; offset <<= 1) {
966
+ T mergeFactor = offset + localThreadRank < 32;
967
+ #pragma unroll
968
+ for (uint32_t i = 0; i < MERGE_QUAD_SIZE; ++i) {
969
+ mqV[i] += mergeFactor * __shfl_down_sync(FULL_WARP, mqV[i], offset);
970
+ }
971
+ }
972
+ #pragma unroll
973
+ for (uint32_t i = 0; i < MERGE_QUAD_SIZE; ++i) {
974
+ mqV[i] = __shfl_sync(FULL_WARP, mqV[i], 0);
975
+ }
976
+
977
+ // Only need to do a multi-warp merge if there are enough quads to justify it
978
+ if (NumWarps > 1 && adjCount > WARP_SIZE) {
979
+ if (localThreadRank < MERGE_QUAD_SIZE) {
980
+ atomicAdd(s_mergeQuad + localThreadRank, mqV[localThreadRank]);
981
+ }
982
+
983
+ __syncthreads();
984
+
985
+ mqV = s_mergeQuad;
986
+ }
987
+
988
+ // Figure out the output position
989
+ uint32_t writePosition = 0;
990
+ if constexpr (!IsSingleExample) {
991
+ for (int32_t i = threadRank; i < b; i += BLOCK_WIDTH) {
992
+ writePosition += regionCounts[i];
993
+ }
994
+ }
995
+
996
+ const int32_t numLongs = row >> 3; // Divide by 8
997
+ const uint8_t *pCurrIsLeadRow = reinterpret_cast<const uint8_t*>(isLeadRow);
998
+ const uint64_t *lpCurrIsLeadRow = reinterpret_cast<const uint64_t*>(pCurrIsLeadRow);
999
+
1000
+ for (int32_t i = threadRank; i < numLongs; i += BLOCK_WIDTH) {
1001
+ writePosition += __popcll(lpCurrIsLeadRow[i]);
1002
+ }
1003
+ for (int32_t i = (numLongs * 8) + threadRank; i < row; i += BLOCK_WIDTH) {
1004
+ if (pCurrIsLeadRow[i]) {
1005
+ ++writePosition;
1006
+ }
1007
+ }
1008
+ // Sum all of the individual offsets over the warp
1009
+ writePosition = __reduce_add_full_warp(writePosition);
1010
+ // Reduce across warps, if applicable
1011
+ if constexpr (NumWarps > 1) {
1012
+ __shared__ uint32_t s_threadWritePositions[NumWarps];
1013
+ if (localThreadRank == 0) {
1014
+ s_threadWritePositions[warpIdx] = writePosition;
1015
+ }
1016
+ __syncthreads();
1017
+ writePosition = threadRank < NumWarps ? s_threadWritePositions[threadRank] : 0;
1018
+ writePosition = __reduce_add_full_warp(writePosition);
1019
+ }
1020
+
1021
+ if (threadRank >= 9) {
1022
+ return;
1023
+ }
1024
+
1025
+ const T sumConfidence = mqV[8];
1026
+ const T numQuads = mqV[9];
1027
+ const T divisor = threadRank < 8 ? sumConfidence : numQuads;
1028
+
1029
+ const T myVal = mqV[threadRank] / divisor;
1030
+
1031
+ auto writeVerts = outQuads[writePosition].data();
1032
+
1033
+ if (threadRank < 8) {
1034
+ writeVerts[threadRank] = myVal;
1035
+ } else {
1036
+ outConf[writePosition] = myVal;
1037
+ }
1038
+ }
1039
+
1040
+ struct CollapseRowsResult {
1041
+ torch::Tensor ExCounts;
1042
+ torch::Tensor StridedMergeQuads;
1043
+ int32_t TotalNumQuads;
1044
+ // NOTE: This will only be available in Debug builds
1045
+ torch::Tensor QuadIds;
1046
+ int32_t ImageWidth;
1047
+ int32_t ImageHeight;
1048
+ };
1049
+
1050
+ template<typename scalar_t>
1051
+ CollapseRowsResult collapse_rows(
1052
+ torch::Tensor quads, torch::Tensor probs, scalar_t probThreshold, scalar_t iouThreshold
1053
+ )
1054
+ {
1055
+ if (! quads.is_contiguous()) {
1056
+ throw std::runtime_error("Expected `quads` to be contiguous!");
1057
+ }
1058
+
1059
+ if ((quads.size(2) % 32) != 0) {
1060
+ throw std::runtime_error("Expected the width of the `quads` buffer to be a multiple of 32!");
1061
+ }
1062
+
1063
+ int32_t imageWidth = quads.size(2) * 4;
1064
+ int32_t imageHeight = quads.size(1) * 4;
1065
+
1066
+ quads = quads.reshape({ quads.size(0), -1, 32, 4, 2 });
1067
+ probs = probs.reshape({ probs.size(0), -1, 32 });
1068
+
1069
+ if (quads.size(0) != probs.size(0) || quads.size(1) != probs.size(1)) {
1070
+ throw std::runtime_error("Dimension mismatch between `quads` and `probs`");
1071
+ }
1072
+
1073
+ // The final counter is for the total number of quads for the entire batch
1074
+ auto counts = torch::zeros({ quads.size(0) + 1 }, quads.options().dtype(torch::kInt32));
1075
+
1076
+ int64_t embedSize = sizeof(EmbedQuad_<scalar_t>) / sizeof(scalar_t);
1077
+ auto rowMergeTensor = torch::empty({ quads.size(0), embedSize, quads.size(1) * quads.size(2) }, quads.options());
1078
+
1079
+ #ifdef NMS_VERIFY_CORRECTNESS
1080
+ auto idsTensor = torch::full({ quads.size(0), quads.size(1) * quads.size(2) },
1081
+ std::numeric_limits<int32_t>::max(),
1082
+ counts.options().dtype(torch::kInt32));
1083
+ #else
1084
+ torch::Tensor idsTensor;
1085
+ #endif
1086
+
1087
+ dim3 blockSize(32, 3, 1);
1088
+ dim3 gridSize(1,
1089
+ div_up(quads.size(1), blockSize.y),
1090
+ quads.size(0));
1091
+
1092
+ device_row_collapse KERNEL_ARG2(gridSize, blockSize) (
1093
+ quads.packed_accessor64<scalar_t, 5>(),
1094
+ probs.packed_accessor64<scalar_t, 3>(),
1095
+ probThreshold, iouThreshold,
1096
+ counts.packed_accessor64<int32_t, 1>(),
1097
+ rowMergeTensor.packed_accessor64<scalar_t, 3>()
1098
+ #ifdef NMS_VERIFY_CORRECTNESS
1099
+ , idsTensor.packed_accessor64<int32_t, 2>()
1100
+ #endif
1101
+ );
1102
+
1103
+ #ifdef NMS_VERIFY_CORRECTNESS
1104
+ static std::unordered_set<int32_t> s_quadIds;
1105
+ auto cpuIdsTensor = idsTensor.cpu();
1106
+ const int32_t *idsPtr = cpuIdsTensor.data_ptr<int32_t>();
1107
+ if (s_quadIds.empty()) {
1108
+ s_quadIds.insert(idsPtr, idsPtr + idsTensor.numel());
1109
+ } else {
1110
+ std::unordered_set<int32_t> otherIds{ idsPtr, idsPtr + idsTensor.numel() };
1111
+
1112
+ if (s_quadIds != otherIds) {
1113
+ throw std::runtime_error("Inconsistent Ids!");
1114
+ }
1115
+ }
1116
+ #endif
1117
+
1118
+ // The final value in `counts` is actually the total number of quads for the entire batch
1119
+ int32_t totalQuads = counts[-1].item<int32_t>();
1120
+
1121
+ counts = counts.slice(/*dim=*/ 0, 0, counts.size(0) - 1);
1122
+
1123
+ #ifdef NMS_VERIFY_CORRECTNESS
1124
+ int64_t maxExCount;
1125
+ if (counts.size(0) > 1) {
1126
+ maxExCount = counts.max().item<int32_t>();
1127
+ } else {
1128
+ maxExCount = totalQuads;
1129
+ }
1130
+
1131
+ static bool s_sortOrder = false;
1132
+
1133
+ rowMergeTensor = rowMergeTensor.slice(2, 0, maxExCount);
1134
+ idsTensor = idsTensor.slice(1, 0, maxExCount);
1135
+ auto order = torch::argsort(idsTensor, /*dim=*/ 1, s_sortOrder); s_sortOrder = !s_sortOrder;
1136
+
1137
+ auto embOrder = order.unsqueeze(1).expand_as(rowMergeTensor);
1138
+
1139
+ rowMergeTensor = torch::gather(rowMergeTensor, /*dim=*/ 2, embOrder);
1140
+ idsTensor = torch::gather(idsTensor, /*dim=*/ 1, order);
1141
+ #endif
1142
+
1143
+ return { counts, rowMergeTensor, totalQuads, idsTensor, imageWidth, imageHeight };
1144
+ }
1145
+
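One bookkeeping detail worth calling out: `collapse_rows` allocates one counter per example plus a trailing slot that the kernel uses to accumulate the batch-wide total, and that slot is read and then sliced away. A minimal sketch of that convention in isolation (the function name is illustrative):

#include <torch/torch.h>
#include <utility>

// Per-example counters plus one trailing slot that accumulates the whole-batch total;
// the total is read out and the slot sliced away before returning, as in collapse_rows above.
std::pair<torch::Tensor, int64_t> counts_with_total(int64_t batchSize)
{
    auto counts = torch::zeros({ batchSize + 1 }, torch::kInt32);

    // ... a kernel would atomicAdd into counts[b] and counts[batchSize] here ...

    const int64_t total = counts[-1].item<int32_t>();
    counts = counts.slice(/*dim=*/0, 0, counts.size(0) - 1);  // drop the total slot
    return { counts, total };
}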
1146
+
1147
+
1148
+ void verify_row(const torch::TensorAccessor<int32_t, 1> &adjCounts,
1149
+ const torch::TensorAccessor<int32_t, 2> &adjValues,
1150
+ int32_t row)
1151
+ {
1152
+ // Traverse the graph, and accumulate all set flags across all rows marked
1153
+ // adjacent by the current row. If the merge_up algorithm works correctly, then
1154
+ // `possible` will contain exactly the same set of values as the current row
1155
+ std::unordered_set<int32_t> possible;
1156
+ add_to_set(adjCounts, adjValues, row, possible);
1157
+
1158
+ std::unordered_set<int32_t> thisRow{ row };
1159
+ const int32_t thisCount = adjCounts[row];
1160
+ auto thisValues = adjValues[row].data();
1161
+ thisRow.insert(thisValues, thisValues + thisCount);
1162
+
1163
+ if (thisRow != possible) {
1164
+ throw std::runtime_error("The merge_up algorithm is not correct!");
1165
+ }
1166
+ }
1167
+
1168
+ struct AdjacencyResult {
1169
+ // Shape: BxQ
1170
+ // Specifies whether the given row is a result row
1171
+ torch::Tensor IsLeadRow;
1172
+ // Shape: BxQ
1173
+ // The number of quads that need to be merged with the given quad
1174
+ torch::Tensor AdjCounts;
1175
+ // Shape: BxQx<Num Adjacent>
1176
+ // The indices of the adjacent quads.
1177
+ torch::Tensor AdjValues;
1178
+ int64_t MaxExCount;
1179
+ };
1180
+
1181
+ template<bool IsSingleExample, typename T>
1182
+ void cpu_a2a_adjacency_sparse(const uint64_t punCounts,
1183
+ const T iouThreshold,
1184
+ torch::Tensor embedQuadsTensor,
1185
+ torch::Tensor outIsStartTensorGPU,
1186
+ torch::Tensor outAdjCountsTensorGPU,
1187
+ torch::Tensor outSparseAdjTensorGPU)
1188
+ {
1189
+ embedQuadsTensor = embedQuadsTensor.cpu();
1190
+ auto outIsStartTensor = outIsStartTensorGPU.cpu();
1191
+ auto outAdjCountsTensor = outAdjCountsTensorGPU.cpu();
1192
+ auto outSparseAdjTensor = outSparseAdjTensorGPU.cpu();
1193
+
1194
+ auto embedQuads = embedQuadsTensor.accessor<T, 3>();
1195
+ auto isStart = outIsStartTensor.accessor<bool, 2>();
1196
+ auto adjCounts = outAdjCountsTensor.accessor<int32_t, 2>();
1197
+ auto adjValues = outSparseAdjTensor.accessor<int32_t, 3>();
1198
+
1199
+ for (int32_t b = 0; b < embedQuadsTensor.size(0); ++b) {
1200
+ const int32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
1201
+
1202
+ T *exData = embedQuads[b].data();
1203
+
1204
+ for (int32_t row = 0; row < quadCt; ++row) {
1205
+ const auto qRow = StridedEmbedQuad_<T>{ exData + row, embedQuads.stride(1) }.Bounds();
1206
+
1207
+ for (int32_t col = 0; col < quadCt; ++col) {
1208
+ const auto qCol = StridedEmbedQuad_<T>{ exData + col, embedQuads.stride(1) }.Bounds();
1209
+
1210
+ T pctRow, pctCol, iou;
1211
+ thrust::tie(pctRow, pctCol, iou) = geometry_region_sizes(qRow, qCol);
1212
+
1213
+ if (iou >= iouThreshold) {
1214
+ int32_t &storeIdx = adjCounts[b][row];
1215
+ adjValues[b][row][storeIdx] = col;
1216
+ ++storeIdx;
1217
+ if (row < col) {
1218
+ isStart[b][col] = false;
1219
+ }
1220
+ } else if (pctRow > 0.8f || pctCol > 0.8f) {
1221
+ T anchorHeight = qRow.Height();
1222
+ T otherHeight = qCol.Height();
1223
+
1224
+ T ratio = anchorHeight > otherHeight ?
1225
+ otherHeight / anchorHeight :
1226
+ anchorHeight / otherHeight;
1227
+ if (ratio > 0.9f) {
1228
+ if (pctRow > 0.8f) {
1229
+ // Other envelops anchor
1230
+ isStart[b][row] = false;
1231
+ }
1232
+ else {
1233
+ isStart[b][col] = false;
1234
+ }
1235
+ }
1236
+ }
1237
+ }
1238
+ }
1239
+ }
1240
+
1241
+ outIsStartTensorGPU.copy_(outIsStartTensor);
1242
+ outAdjCountsTensorGPU.copy_(outAdjCountsTensor);
1243
+ outSparseAdjTensorGPU.copy_(outSparseAdjTensor);
1244
+ }
1245
+
1246
+ template<typename T>
1247
+ std::string to_flat_string(torch::Tensor tensor) {
1248
+ tensor = tensor.flatten();
1249
+
1250
+ auto acc = tensor.accessor<T, 1>();
1251
+
1252
+ std::ostringstream oss;
1253
+ oss << "[";
1254
+ if (acc.size(0) > 0) {
1255
+ oss << acc[0];
1256
+ for (int64_t i = 1; i < acc.size(0); ++i) {
1257
+ oss << ", " << acc[i];
1258
+ }
1259
+ }
1260
+ oss << "]";
1261
+ return oss.str();
1262
+ }
1263
+
1264
+ template<typename scalar_t>
1265
+ AdjacencyResult compute_all_to_all_adjacency(
1266
+ const CollapseRowsResult &collapseResult,
1267
+ scalar_t iouThreshold)
1268
+ {
1269
+ torch::Tensor counts = collapseResult.ExCounts;
1270
+
1271
+ int64_t maxExCount;
1272
+ if (counts.size(0) > 1) {
1273
+ maxExCount = counts.max().item<int32_t>();
1274
+ } else {
1275
+ maxExCount = collapseResult.TotalNumQuads;
1276
+ }
1277
+
1278
+ auto isStartTensor = torch::ones({ counts.size(0), maxExCount }, counts.options().dtype(torch::kBool));
1279
+ auto adjCountsTensor = torch::zeros({ counts.size(0), maxExCount }, counts.options().dtype(torch::kInt32));
1280
+ #ifndef NMS_VERIFY_CORRECTNESS
1281
+ auto adjValuesTensor = torch::empty({ counts.size(0), maxExCount, maxExCount }, counts.options().dtype(torch::kInt32));
1282
+ #else
1283
+ auto adjValuesTensor = torch::full({ counts.size(0), maxExCount, maxExCount },
1284
+ 5000,
1285
+ counts.options().dtype(torch::kInt32));
1286
+ #endif
1287
+
1288
+ // If the batch is only a single example, instead of hitting global memory for the count, we can
1289
+ // just encode the count into the pointer instead
1290
+ uint64_t ptrCounts = reinterpret_cast<uint64_t>(counts.data_ptr<int32_t>());
1291
+ if (counts.size(0) == 1) {
1292
+ ptrCounts = maxExCount;
1293
+ }
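
Editorial sketch (not part of the commit): the kernels launched below are templated on whether the batch holds a single example, and `ptrCounts` either carries the count itself or a real pointer. A minimal decoding helper (hypothetical name `decode_example_count`), mirroring the branch already used in `cpu_a2a_adjacency_sparse` above:

template<bool IsSingleExample>
__host__ __device__
inline int32_t decode_example_count(uint64_t punCounts, int32_t exampleIdx)
{
    // Single example: the quad count itself was packed into the pointer value above.
    // Multiple examples: punCounts really is the address of the per-example counts array.
    return IsSingleExample
        ? static_cast<int32_t>(punCounts)
        : reinterpret_cast<const int32_t*>(punCounts)[exampleIdx];
}
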
1294
+
1295
+ #ifdef NMS_VERIFY_CORRECTNESS
1296
+ auto cpuAdjValuesTensor = adjValuesTensor.cpu();
1297
+ auto cpuAdjCountsTensor = adjCountsTensor.cpu();
1298
+ auto cpuIsStartTensor = isStartTensor.cpu();
1299
+ #endif
1300
+
1301
+ size_t smemSize;
1302
+ dim3 gridSize, blockSize;
1303
+
1304
+ ///////////////////
1305
+ // NOTE(mranzinger): This algorithm uses a fixed sized grid to spatially subdivide the canvas. For virtually all test conditions
1306
+ // I ran this through, it was slightly slower than the brute force approach that parallelizes better.
1307
+ // It's possible that there is some number of words present (e.g. >500) where this algorithm becomes
1308
+ // faster.
1309
+ //
1310
+ //constexpr int32_t CELL_SIZE = 100;
1311
+ //constexpr int64_t NUM_BINS_PER_CELL = 200;
1312
+ //int32_t numXCells = div_up(collapseResult.ImageWidth, CELL_SIZE);
1313
+ //int32_t numYCells = div_up(collapseResult.ImageHeight, CELL_SIZE);
1314
+ //auto gridCellsTensor = torch::zeros({ counts.size(0), numYCells, numXCells, NUM_BINS_PER_CELL }, adjCountsTensor.options());
1315
+ //auto quadCellExtentsTensor = torch::empty({ counts.size(0), maxExCount, 4 }, gridCellsTensor.options());
1316
+ //smemSize = div_up(static_cast<uint32_t>(maxExCount), 32);
1317
+
1318
+ //constexpr uint32_t GRID_NUM_WARPS = 3;
1319
+ //blockSize = dim3{ GRID_NUM_WARPS * 32, 1, 1 };
1320
+ //gridSize = dim3{ 1, static_cast<uint32_t>(maxExCount), static_cast<uint32_t>(counts.size(0)) };
1321
+
1322
+ //auto buildGridFn = counts.size(0) == 1 ?
1323
+ // device_a2a_adjacency_build_grid<GRID_NUM_WARPS, true, scalar_t, CELL_SIZE> :
1324
+ // device_a2a_adjacency_build_grid<GRID_NUM_WARPS, false, scalar_t, CELL_SIZE>;
1325
+
1326
+ //buildGridFn KERNEL_ARG2(gridSize, blockSize) (
1327
+ // ptrCounts,
1328
+ // collapseResult.StridedMergeQuads.packed_accessor64<scalar_t, 3>(),
1329
+ // gridCellsTensor.packed_accessor64<int32_t, 4>(),
1330
+ // quadCellExtentsTensor.packed_accessor64<int32_t, 3>()
1331
+ //);
1332
+
1333
+ //auto adjGridFn = counts.size(0) == 1 ?
1334
+ // device_a2a_adjacency_with_grid<GRID_NUM_WARPS, true, scalar_t> :
1335
+ // device_a2a_adjacency_with_grid<GRID_NUM_WARPS, false, scalar_t>;
1336
+
1337
+ //adjGridFn KERNEL_ARG3(gridSize, blockSize, smemSize) (
1338
+ // ptrCounts,
1339
+ // iouThreshold,
1340
+ // collapseResult.StridedMergeQuads.packed_accessor64<scalar_t, 3>(),
1341
+ // gridCellsTensor.packed_accessor64<int32_t, 4>(),
1342
+ // quadCellExtentsTensor.packed_accessor64<int32_t, 3>(),
1343
+ // isStartTensor.packed_accessor64<bool, 2>(),
1344
+ // adjCountsTensor.packed_accessor64<int32_t, 2>(),
1345
+ // adjValuesTensor.packed_accessor64<int32_t, 3>()
1346
+ //);
1347
+ ///////////////////
1348
+
1349
+ uint32_t totalWork = maxExCount * maxExCount;
1350
+
1351
+ blockSize = dim3{96, 1};
1352
+ gridSize = dim3{div_up(totalWork, blockSize.x),
1353
+ static_cast<uint32_t>(counts.size(0))};
1354
+
1355
+ auto adjFn = counts.size(0) == 1 ? device_a2a_adjacency_sparse<true, scalar_t> : device_a2a_adjacency_sparse<false, scalar_t>;
1356
+
1357
+ // This algorithm is O(n^2) with n being the current number of quads
1358
+ adjFn KERNEL_ARG2(gridSize, blockSize) (
1359
+ ptrCounts,
1360
+ iouThreshold,
1361
+ collapseResult.StridedMergeQuads.packed_accessor64<scalar_t, 3>(),
1362
+ isStartTensor.packed_accessor64<bool, 2>(),
1363
+ adjCountsTensor.packed_accessor64<int32_t, 2>(),
1364
+ adjValuesTensor.packed_accessor64<int32_t, 3>()
1365
+ );
1366
+
1367
+
1368
+ #ifdef NMS_VERIFY_CORRECTNESS
1369
+ cpu_a2a_adjacency_sparse<true>(ptrCounts, iouThreshold,
1370
+ collapseResult.StridedMergeQuads, cpuIsStartTensor, cpuAdjCountsTensor, cpuAdjValuesTensor);
1371
+
1372
+ adjValuesTensor = std::get<0>(torch::sort(adjValuesTensor, /*dim=*/ 2));
1373
+
1374
+ assert(torch::all(cpuAdjCountsTensor == adjCountsTensor.cpu()).item<bool>());
1375
+ assert(torch::all(cpuIsStartTensor == isStartTensor.cpu()).item<bool>());
1376
+ assert(torch::all(cpuAdjValuesTensor == adjValuesTensor.cpu()).item<bool>());
1377
+
1378
+ std::cout << "\tA2A Is Start Count: " << isStartTensor.sum(torch::kInt32).item<int32_t>()
1379
+ << ", Most Adjacent: " << adjCountsTensor.max().item<int32_t>() << std::endl;
1380
+
1381
+ auto maxDepthTensor = torch::tensor(0, adjCountsTensor.options());
1382
+ #endif
1383
+
1384
+ auto traverseFn = counts.size(0) == 1 ?
1385
+ device_flatten_graph_iterative<true> :
1386
+ device_flatten_graph_iterative<false>;
1387
+
1388
+ blockSize = dim3{ 128, 1, 1 };
1389
+ gridSize = dim3{ 1, static_cast<uint32_t>(maxExCount), static_cast<uint32_t>(counts.size(0)) };
1390
+ smemSize = div_up(maxExCount * sizeof(visit_mask_t), sizeof(uint32_t)) * sizeof(uint32_t);
1391
+
1392
+ traverseFn KERNEL_ARG3(gridSize, blockSize, smemSize) (
1393
+ ptrCounts,
1394
+ isStartTensor.packed_accessor64<bool, 2>(),
1395
+ reinterpret_cast<uint32_t*>(adjCountsTensor.data_ptr<int32_t>()),
1396
+ reinterpret_cast<uint32_t*>(adjValuesTensor.data_ptr<int32_t>())
1397
+ #ifdef NMS_VERIFY_CORRECTNESS
1398
+ , maxDepthTensor.data_ptr<int32_t>()
1399
+ #endif
1400
+ );
1401
+
1402
+ #ifdef NMS_VERIFY_CORRECTNESS
1403
+ cpu_flatten_graph<true>(ptrCounts, cpuIsStartTensor, cpuAdjCountsTensor, cpuAdjValuesTensor);
1404
+
1405
+ cpuAdjValuesTensor = std::get<0>(torch::sort(cpuAdjValuesTensor, /*dim=*/ 2));
1406
+ adjValuesTensor = std::get<0>(torch::sort(adjValuesTensor, /*dim=*/ 2));
1407
+
1408
+ torch::Tensor diffStartIdxs = (cpuIsStartTensor != isStartTensor.cpu()).nonzero_numpy()[0];
1409
+
1410
+ assert(diffStartIdxs.numel() == 0);
1411
+
1412
+ torch::Tensor diffCountIdxs = (cpuAdjCountsTensor != adjCountsTensor.cpu()).nonzero_numpy()[0];
1413
+
1414
+ assert(diffCountIdxs.numel() == 0);
1415
+
1416
+ auto diffValuesTensor = torch::any(cpuAdjValuesTensor != adjValuesTensor.cpu(), /*dim=*/ 2, /*keepdim=*/ false).flatten().nonzero().flatten();
1417
+
1418
+ std::cout << "\t\tDiff Indices: " << to_flat_string<int64_t>(diffValuesTensor) << std::endl;
1419
+
1420
+ auto cpuDiffCountsTensor = cpuAdjCountsTensor.flatten().index({ diffValuesTensor });
1421
+ auto cpuDiffRowsTensor = cpuAdjValuesTensor.flatten(0, 1).index({ diffValuesTensor });
1422
+ auto gpuDiffRowsTensor = adjValuesTensor.cpu().flatten(0, 1).index({ diffValuesTensor });
1423
+
1424
+ for (int64_t i = 0, ct = cpuDiffRowsTensor.size(0); i < ct; ++i) {
1425
+ auto z = cpuDiffCountsTensor[i].item<int32_t>();
1426
+ auto diffRow = diffValuesTensor[i].item<int64_t>();
1427
+ std::cout << "\t\tRow " << diffRow << std::endl;
1428
+ std::cout << "\t\t\tExpected: " << to_flat_string<int32_t>(cpuDiffRowsTensor[i].slice(0, 0, z + 1)) << std::endl;
1429
+ std::cout << "\t\t\t GPU: " << to_flat_string<int32_t>(gpuDiffRowsTensor[i].slice(0, 0, z + 1)) << std::endl;
1430
+ }
1431
+
1432
+ assert(diffValuesTensor.size(0) == 0);
1433
+
1434
+ std::cout << "\tA2A - Flatten - Is Start Count: " << isStartTensor.sum(torch::kInt32).item<int32_t>()
1435
+ << ", Most Adjacent: " << adjCountsTensor.max().item<int32_t>()
1436
+ << ", Max Depth: " << maxDepthTensor.item<int32_t>() << std::endl;
1437
+
1438
+ cpuIsStartTensor = isStartTensor.cpu();
1439
+ cpuAdjCountsTensor = adjCountsTensor.cpu();
1440
+ cpuAdjValuesTensor = adjValuesTensor.cpu();
1441
+ auto cpuCounts = counts.cpu();
1442
+ auto cpuCollapseIds = collapseResult.QuadIds.cpu();
1443
+
1444
+ static std::vector<std::unordered_set<int32_t>> s_knownGroups;
1445
+ static std::unordered_map<int32_t, std::unordered_set<int32_t>> s_groupLookup;
1446
+
1447
+ std::vector<std::unordered_set<int32_t>> idGroups;
1448
+ decltype(s_groupLookup) groupLookup;
1449
+ for (int64_t b = 0; b < counts.size(0); ++b) {
1450
+ int64_t quadCt = cpuCounts[b].item<int32_t>();
1451
+ for (int64_t row = 0; row < quadCt; ++row) {
1452
+ bool isLeadRow = cpuIsStartTensor[b][row].item<bool>();
1453
+ auto bCountsTensor = cpuAdjCountsTensor[b];
1454
+ auto bValuesTensor = cpuAdjValuesTensor[b];
1455
+ auto bCounts = bCountsTensor.accessor<int32_t, 1>();
1456
+ auto bValues = bValuesTensor.accessor<int32_t, 2>();
1457
+
1458
+ auto bIdsTensor = cpuCollapseIds[b];
1459
+ auto bIds = bIdsTensor.accessor<int32_t, 1>();
1460
+
1461
+ std::unordered_set<int32_t> sIds;
1462
+ for (int32_t i = 0, ct = bCounts[row]; i < ct; ++i) {
1463
+ int32_t col = bValues[row][i];
1464
+ int32_t id = bIds[col];
1465
+ sIds.insert(id);
1466
+ }
1467
+
1468
+ if (sIds.empty()) {
1469
+ throw std::runtime_error("The ids tensor is empty!");
1470
+ }
1471
+
1472
+ groupLookup[bIds[row]] = sIds;
1473
+
1474
+ if (isLeadRow) {
1475
+ verify_row(bCounts, bValues, row);
1476
+ idGroups.push_back(move(sIds));
1477
+ }
1478
+ }
1479
+ }
1480
+
1481
+ if (s_knownGroups.empty()) {
1482
+ s_knownGroups = move(idGroups);
1483
+ s_groupLookup = move(groupLookup);
1484
+ } else {
1485
+ // Make a copy
1486
+ auto remOrigGroups = s_knownGroups;
1487
+ auto remOrigGroupLookup = s_groupLookup;
1488
+
1489
+ std::vector<int32_t> quadIds;
1490
+ for (auto &kv : remOrigGroupLookup) {
1491
+ quadIds.push_back(kv.first);
1492
+ }
1493
+ for (int32_t qId : quadIds) {
1494
+ assert(groupLookup.count(qId));
1495
+ }
1496
+ assert(groupLookup.size() == remOrigGroupLookup.size());
1497
+
1498
+ for (int32_t qId : quadIds) {
1499
+ auto &oldGroup = remOrigGroupLookup[qId];
1500
+ auto &newGroup = groupLookup[qId];
1501
+
1502
+ if (oldGroup == newGroup) {
1503
+ remOrigGroupLookup.erase(qId);
1504
+ groupLookup.erase(qId);
1505
+ } else {
1506
+ throw std::runtime_error("Group mismatch!");
1507
+ }
1508
+ }
1509
+
1510
+ for (int i = idGroups.size() - 1; i >= 0; --i) {
1511
+ for (int j = remOrigGroups.size() - 1; j >= 0; --j) {
1512
+ auto &idGroup = idGroups[i];
1513
+ auto &knownGroup = remOrigGroups[j];
1514
+
1515
+ if (idGroup == knownGroup) {
1516
+ idGroups.erase(begin(idGroups) + i);
1517
+ remOrigGroups.erase(begin(remOrigGroups) + j);
1518
+ break;
1519
+ }
1520
+ }
1521
+ }
1522
+
1523
+ if (!idGroups.empty() || !remOrigGroups.empty()) {
1524
+ auto group_str = [] (auto &group) {
1525
+ std::vector<int32_t> vGroup{ std::begin(group), std::end(group) };
1526
+ std::sort(std::begin(vGroup), std::end(vGroup));
1527
+
1528
+ auto id_str = [] (int32_t id) {
1529
+ std::ostringstream oss;
1530
+ //oss << "(" << (id / 32) << ", " << (id % 32) << ")";
1531
+ oss << id;
1532
+ return oss.str();
1533
+ };
1534
+
1535
+ std::ostringstream oss;
1536
+ oss << "[" << id_str(vGroup[0]);
1537
+ for (size_t i = 1; i < vGroup.size(); ++i) {
1538
+ oss << ", " << id_str(vGroup[i]);
1539
+ }
1540
+ oss << "]";
1541
+ return oss.str();
1542
+ };
1543
+
1544
+ std::cout << "\tEncountered a difference in groups!" << std::endl
1545
+ << "\t\tOrig groups:" << std::endl;
1546
+ for (auto &group : remOrigGroups) {
1547
+ std::cout << "\t\t\t" << group_str(group) << std::endl;
1548
+ }
1549
+ std::cout << "\t\tNew groups:" << std::endl;
1550
+ for (auto &group : idGroups) {
1551
+ std::cout << "\t\t\t" << group_str(group) << std::endl;
1552
+ }
1553
+ }
1554
+ }
1555
+ #endif
1556
+
1557
+ return { isStartTensor, adjCountsTensor, adjValuesTensor, maxExCount };
1558
+ }
1559
+
1560
+
1561
+
1562
+ template<typename scalar_t>
1563
+ nms_result_t
1564
+ all_to_all_collapse(
1565
+ const CollapseRowsResult &collapseRowsRes,
1566
+ const AdjacencyResult &adjResult)
1567
+ {
1568
+ auto counts = collapseRowsRes.ExCounts;
1569
+ auto embedQuads = collapseRowsRes.StridedMergeQuads;
1570
+
1571
+ if (!embedQuads.is_contiguous()) {
1572
+ throw std::runtime_error("Input embed quads were not contiguous!");
1573
+ }
1574
+
1575
+ torch::Tensor isLeadRow;
1576
+ if (counts.size(0) == 1) {
1577
+ isLeadRow = adjResult.IsLeadRow;
1578
+ } else {
1579
+ // For multiple examples: IsLeadRow will have true values beyond the extent of the number of quads
1580
+ // However, we know that Counts > 0 only occurs within the extent, so the set intersection
1581
+ // tells us which rows are actually lead
1582
+ isLeadRow = torch::logical_and(adjResult.IsLeadRow, adjResult.AdjCounts > 0);
1583
+ }
1584
+
1585
+ auto regionCounts = isLeadRow.sum(/*dim=*/ 1, /*keepdim=*/ false, torch::kInt64);
1586
+
1587
+ const int64_t numOutQuads = counts.size(0) == 1 ? regionCounts.item<int64_t>() : regionCounts.sum().item<int64_t>();
1588
+
1589
+ constexpr int32_t NUM_WARPS = 4;
1590
+ dim3 blockSize(NUM_WARPS * 32, 1, 1);
1591
+ dim3 gridSize(1, adjResult.MaxExCount, counts.size(0));
1592
+
1593
+ // If the batch is only a single example, instead of hitting global memory for the count, we can
1594
+ // just encode the count into the pointer instead
1595
+ uint64_t ptrCounts = reinterpret_cast<uint64_t>(counts.data_ptr<int32_t>());
1596
+ if (counts.size(0) == 1) {
1597
+ ptrCounts = adjResult.MaxExCount;
1598
+ }
1599
+
1600
+ torch::Tensor outQuads = torch::empty({ numOutQuads, 4, 2 }, embedQuads.options());
1601
+ torch::Tensor outConf = torch::empty({ numOutQuads }, embedQuads.options());
1602
+
1603
+ auto collapseFn = counts.size(0) == 1 ?
1604
+ device_a2a_collapse<NUM_WARPS, scalar_t, true> :
1605
+ device_a2a_collapse<NUM_WARPS, scalar_t, false>;
1606
+
1607
+ collapseFn KERNEL_ARG2(gridSize, blockSize) (
1608
+ ptrCounts,
1609
+ embedQuads.packed_accessor64<scalar_t, 3>(),
1610
+ isLeadRow.packed_accessor64<bool, 2>(),
1611
+ regionCounts.data_ptr<int64_t>(),
1612
+ adjResult.AdjCounts.packed_accessor64<int32_t, 2>(),
1613
+ adjResult.AdjValues.packed_accessor64<int32_t, 3>(),
1614
+ outQuads.packed_accessor64<scalar_t, 3>(),
1615
+ outConf.data_ptr<scalar_t>()
1616
+ );
1617
+
1618
+ return { outQuads, outConf, regionCounts };
1619
+ }
1620
+
1621
+ template<typename scalar_t>
1622
+ nms_result_t cuda_quad_non_maximal_suppression_impl(
1623
+ torch::Tensor quads, torch::Tensor probs,
1624
+ scalar_t probThreshold, scalar_t iouThreshold,
1625
+ int64_t maxRegions, bool verbose)
1626
+ {
1627
+ static const bool s_timerEnabled = true;
1628
+ static const bool s_verboseLevel2 = true;
1629
+
1630
+ // Make sure there's a batch dimension
1631
+ if (quads.dim() == 4) {
1632
+ // B,H,W,V,2
1633
+ quads = quads.unsqueeze(0);
1634
+ // B,H,W
1635
+ probs = probs.unsqueeze(0);
1636
+ }
1637
+
1638
+ //print_tensor_vec_stats2("NMS Input (quads, probs): ", { quads, probs });
1639
+
1640
+ double msRowCollapse = -1,
1641
+ msAdjacency = -1,
1642
+ msA2ACollapse = -1,
1643
+ msTotal = -1;
1644
+
1645
+ CollapseRowsResult collapseRows;
1646
+ AdjacencyResult adjacency;
1647
+ torch::Tensor retQuads, retConf, regionCounts;
1648
+
1649
+ {
1650
+ CudaStoreTimer tTotal{msTotal, s_timerEnabled};
1651
+ {
1652
+ CudaStoreTimer t{msRowCollapse, s_timerEnabled && verbose && s_verboseLevel2};
1653
+
1654
+ // First combine all of the quads in each row
1655
+ collapseRows = collapse_rows(quads, probs, probThreshold, iouThreshold);
1656
+
1657
+ if (collapseRows.TotalNumQuads == 0) {
1658
+ return {
1659
+ torch::empty({ 0, 4, 2 }, quads.options()),
1660
+ torch::empty({ 0 }, probs.options()),
1661
+ collapseRows.ExCounts.toType(torch::kInt64)
1662
+ };
1663
+ }
1664
+ }
1665
+ {
1666
+ CudaStoreTimer t{msAdjacency, s_timerEnabled && verbose && s_verboseLevel2};
1667
+ adjacency = compute_all_to_all_adjacency(collapseRows, iouThreshold);
1668
+ }
1669
+ {
1670
+ CudaStoreTimer t{msA2ACollapse, s_timerEnabled && verbose && s_verboseLevel2};
1671
+ std::tie(retQuads, retConf, regionCounts) = all_to_all_collapse<scalar_t>(collapseRows, adjacency);
1672
+ }
1673
+ }
1674
+
1675
+ #ifndef NDEBUG
1676
+ assert(regionCounts.sum().item<int64_t>() == retQuads.size(0));
1677
+ #endif
1678
+
1679
+ //print_tensor_vec_stats2(" Full NMS (quads, conf, counts): ", { retQuads, retConf, retCounts });
1680
+
1681
+ if (s_timerEnabled && verbose) {
1682
+ std::cout << "NMS Cuda " << retQuads.size(0)
1683
+ << " - Row Collapse (" << quads.size(0) << ", " << quads.size(1) << ", " << quads.size(2) << ") - (" << collapseRows.TotalNumQuads << "): " << msRowCollapse << "ms"
1684
+ << ", Adjacency (" << adjacency.AdjCounts.sum(torch::kInt32).item<int32_t>() << "): " << msAdjacency << "ms"
1685
+ << ", A2A Collapse (" << retQuads.size(0) << "): " << msA2ACollapse << "ms"
1686
+ << ", Total: " << msTotal << "ms"
1687
+ << std::endl;
1688
+ }
1689
+
1690
+ return { retQuads, retConf, regionCounts };
1691
+ }
1692
+
1693
+ nms_result_t cuda_quad_non_maximal_suppression(
1694
+ torch::Tensor quads, torch::Tensor probs,
1695
+ float probThreshold, float iouThreshold,
1696
+ int64_t kernelHeight, int64_t kernelWidth,
1697
+ int64_t maxRegions, bool verbose)
1698
+ {
1699
+ nms_result_t ret;
1700
+
1701
+ ret = cuda_quad_non_maximal_suppression_impl<float>(
1702
+ quads.toType(torch::kFloat32), probs.toType(torch::kFloat32),
1703
+ probThreshold, iouThreshold,
1704
+ maxRegions, verbose
1705
+ );
1706
+
1707
+ // AT_DISPATCH_FLOATING_TYPES_AND_HALF(
1708
+ // quads.scalar_type(),
1709
+ // "cuda_quad_non_maximal_suppression_impl",
1710
+ // ([&] {
1711
+ // ret = cuda_quad_non_maximal_suppression_impl<scalar_t>(
1712
+ // move(quads), move(probs),
1713
+ // probThreshold, iouThreshold,
1714
+ // maxRegions
1715
+ // );
1716
+ // })
1717
+ // );
1718
+
1719
+ return ret;
1720
+ }
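
For orientation, a minimal host-side call sketch (not part of the commit), assuming a CUDA build and the B,H,W,V,2 / B,H,W layout handled above; the shapes and thresholds below are arbitrary illustration values:

void nms_usage_example()
{
    auto opts  = torch::dtype(torch::kFloat32).device(torch::kCUDA);
    auto quads = torch::rand({1, 64, 64, 4, 2}, opts);  // B,H,W,V,2
    auto probs = torch::rand({1, 64, 64}, opts);        // B,H,W

    torch::Tensor outQuads, outConf, regionCounts;
    std::tie(outQuads, outConf, regionCounts) = cuda_quad_non_maximal_suppression(
        quads, probs,
        /*probThreshold=*/ 0.5f, /*iouThreshold=*/ 0.4f,
        /*kernelHeight=*/ 3, /*kernelWidth=*/ 3,
        /*maxRegions=*/ 1000, /*verbose=*/ false);
}
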
nemo-retriever-ocr/cpp/non_maximal_suppression/nms_common.h ADDED
@@ -0,0 +1,227 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #pragma once
6
+
7
+ #include <memory>
8
+ #include <vector>
9
+ #include <unordered_set>
10
+
11
+ #include "../geometry.h"
12
+ #include "../cuda_intellisense.cuh"
13
+ #include "strided_quad.h"
14
+
15
+
16
+
17
+ std::vector<torch::Tensor> quad_nms_from_adjacency(
18
+ torch::Tensor quads, torch::Tensor probs, torch::Tensor adjacency,
19
+ float probThreshold, float iouThreshold,
20
+ int64_t maxRegions);
21
+
22
+ template<typename T>
23
+ struct EmbedQuad_ : public QuadBase_<T, EmbedQuad_<T> > {
24
+ Point_<T> Vertices[4];
25
+ T Confidence;
26
+ T NumQuads = 0;
27
+
28
+ __device__
29
+ EmbedQuad_(T confidence = 0)
30
+ {
31
+ Reset();
32
+ Confidence = confidence;
33
+ }
34
+ __device__
35
+ EmbedQuad_(const EmbedQuad_ &other) = default;
36
+
37
+ __device__
38
+ void swap(EmbedQuad_ &other) noexcept {
39
+ using std::swap;
40
+
41
+ for (size_t i = 0; i < 4; ++i) {
42
+ swap(Vertices[i], other.Vertices[i]);
43
+ }
44
+
45
+ SWAP(Confidence, other.Confidence);
46
+ SWAP(NumQuads, other.NumQuads);
47
+ }
48
+
49
+ __device__
50
+ EmbedQuad_(EmbedQuad_ &&other) : EmbedQuad_() {
51
+ other.swap(*this);
52
+ }
53
+
54
+ __device__
55
+ EmbedQuad_ &operator=(EmbedQuad_ other) {
56
+ other.swap(*this);
57
+ return *this;
58
+ }
59
+
60
+ __device__
61
+ void Append(const EmbedQuad_ &other) {
62
+ Append(other, other.Confidence, other.NumQuads);
63
+ }
64
+
65
+ template<typename Derived>
66
+ __device__
67
+ void Append(const QuadBase_<T, Derived> &q, T conf, T numQuads = 1) {
68
+ Confidence *= NumQuads;
69
+
70
+ if (Confidence > 0) {
71
+ for (size_t i = 0; i < 4; ++i) {
72
+ Vertices[i] *= Confidence;
73
+ }
74
+ }
75
+
76
+ Confidence += conf * numQuads;
77
+
78
+ auto qVertices = static_cast<const Derived *>(&q)->Vertices;
79
+ for (size_t i = 0; i < 4; ++i) {
80
+ Vertices[i] += conf * numQuads * qVertices[i];
81
+ Vertices[i] /= Confidence;
82
+ }
83
+
84
+ NumQuads += numQuads;
85
+ Confidence /= NumQuads;
86
+ }
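
Editorial sketch (not part of the commit): `Append` above maintains the vertices as a running confidence-weighted mean rather than storing raw weighted sums, where the prior total weight is `Confidence * NumQuads`. The same update in scalar form, as a hypothetical standalone helper:

template<typename T>
inline void weighted_mean_update(T &mean, T &totalWeight, T sample, T sampleWeight)
{
    // Fold one weighted sample into a running weighted mean, as Append does per vertex.
    const T newWeight = totalWeight + sampleWeight;
    mean = (totalWeight * mean + sampleWeight * sample) / newWeight;
    totalWeight = newWeight;
}
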
87
+
88
+ __device__
89
+ void Prepare() {
90
+ // T factor = 1.0 / Confidence;
91
+ // for (size_t i = 0; i < 4; ++i) {
92
+ // Vertices[i] *= factor;
93
+ // }
94
+ // Confidence /= numQuads;
95
+ }
96
+
97
+ __device__
98
+ void Reset() {
99
+ for (size_t i = 0; i < 4; ++i) {
100
+ Vertices[i] = Point_<T>{0, 0};
101
+ }
102
+ Confidence = 0.0f;
103
+ NumQuads = 0;
104
+ }
105
+
106
+ __device__
107
+ const Point_<T> &operator[](size_t v) const { return Vertices[v]; }
108
+ __device__
109
+ Point_<T> &operator[](size_t v) { return Vertices[v]; }
110
+ };
111
+
112
+ struct ZeroInitTag {};
113
+
114
+ template<typename T>
115
+ struct MergeQuad_ : public QuadBase_<T, MergeQuad_<T>> {
116
+ Point_<T> Vertices[4];
117
+ T Confidence;
118
+ T NumQuads;
119
+
120
+ MergeQuad_() = default;
121
+
122
+ __device__
123
+ MergeQuad_(ZeroInitTag) : Confidence(0), NumQuads(0) {
124
+ for (size_t i = 0; i < 4; ++i) {
125
+ Vertices[i] = Point_<T>{0, 0};
126
+ }
127
+ }
128
+
129
+ template<typename Derived>
130
+ __device__
131
+ void Append(const QuadBase_<T, Derived> &q, T conf) {
132
+ Confidence += conf;
133
+ ++NumQuads;
134
+
135
+ auto &d = static_cast<const Derived&>(q);
136
+ for (size_t i = 0; i < 4; ++i) {
137
+ Vertices[i] += conf * d[i];
138
+ }
139
+ }
140
+ __device__
141
+ void Append(const EmbedQuad_<T> &q) {
142
+ T qConf = q.NumQuads * q.Confidence;
143
+
144
+ Confidence += qConf;
145
+ NumQuads += q.NumQuads;
146
+ for (size_t i = 0; i < 4; ++i) {
147
+ Vertices[i] += qConf * q.Vertices[i];
148
+ }
149
+ }
150
+ __device__
151
+ void Append(const StridedEmbedQuad_<T> &q) {
152
+ const T numQuads = q.NumQuads();
153
+ const T qConf = numQuads * q.Confidence();
154
+
155
+ Confidence += qConf;
156
+ NumQuads += numQuads;
157
+ for (size_t i = 0; i < 4; ++i) {
158
+ Vertices[i] += qConf * q[i];
159
+ }
160
+ }
161
+
162
+ __device__
163
+ EmbedQuad_<T> Commit() {
164
+ EmbedQuad_<T> ret;
165
+ for (size_t i = 0; i < 4; ++i) {
166
+ ret.Vertices[i] = Vertices[i] / Confidence;
167
+ }
168
+ ret.Confidence = Confidence / NumQuads;
169
+ ret.NumQuads = NumQuads;
170
+
171
+ return ret;
172
+ }
173
+
174
+ __device__
175
+ const Point_<T> &operator[](size_t v) const { return Vertices[v]; }
176
+ __device__
177
+ Point_<T> &operator[](size_t v) { return Vertices[v]; }
178
+ };
179
+
180
+ template<typename T, typename Intermediate=float>
181
+ __device__
182
+ inline T triangle_root(T val)
183
+ {
184
+ // It's easier to visualize this algorithm for a lower triangular matrix
185
+ // What we're trying to find is the `row` of a lower triangular matrix that a given `val` resides in.
186
+ // e.g. 0->0, 2->1, 4->2, etc.
187
+ //
188
+ // 0: 0
189
+ // 1: 1 2
190
+ // 2: 3 4 5
191
+ // 3: 6 7 8 9
192
+ //
193
+ // See https://math.stackexchange.com/questions/698961/finding-the-triangular-root-of-a-number for explanation
194
+ Intermediate numer = Intermediate(-1) + sqrt(Intermediate(1) + Intermediate(8) * Intermediate(val));
195
+ Intermediate denom = Intermediate(2);
196
+
197
+ Intermediate ret = floor(numer / denom);
198
+ return T(ret);
199
+ }
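
Editorial sketch (not part of the commit): a plain host-side copy of the same formula (hypothetical `host_triangle_root`), spelling out the index-to-row mapping the comment above describes:

#include <cassert>
#include <cmath>

inline int host_triangle_root(int val)
{
    // row = floor((-1 + sqrt(1 + 8*val)) / 2)
    return static_cast<int>(std::floor((-1.0 + std::sqrt(1.0 + 8.0 * val)) / 2.0));
}

inline void triangle_root_examples()
{
    assert(host_triangle_root(0) == 0);  // row 0 holds {0}
    assert(host_triangle_root(2) == 1);  // row 1 holds {1, 2}
    assert(host_triangle_root(5) == 2);  // row 2 holds {3, 4, 5}
    assert(host_triangle_root(9) == 3);  // row 3 holds {6, 7, 8, 9}
}
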
200
+
201
+ template<typename T>
202
+ void visit_node(const std::vector<EmbedQuad_<T>> &allQuads, size_t quadIdx,
203
+ const std::vector<std::vector<size_t>> &adjIdxs, EmbedQuad_<T> &currQuad,
204
+ std::unordered_set<size_t> &visited)
205
+ {
206
+ if (visited.count(quadIdx) > 0) return;
207
+
208
+ const EmbedQuad_<T> &vQuad = allQuads[quadIdx];
209
+
210
+ currQuad.Append(vQuad);
211
+ visited.insert(quadIdx);
212
+
213
+ for (size_t childIdx : adjIdxs[quadIdx]) {
214
+ visit_node(allQuads, childIdx, adjIdxs, currQuad, visited);
215
+ }
216
+ }
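
Editorial sketch (not part of the commit): a hypothetical host-side driver (name `merge_components` is assumed) showing how `visit_node` collapses each connected component of the adjacency graph into one merged quad, assuming `EmbedQuad_` is usable on the host here, as `visit_node` itself implies:

template<typename T>
std::vector<EmbedQuad_<T>> merge_components(const std::vector<EmbedQuad_<T>> &allQuads,
                                            const std::vector<std::vector<size_t>> &adjIdxs)
{
    std::vector<EmbedQuad_<T>> merged;
    std::unordered_set<size_t> visited;

    for (size_t i = 0; i < allQuads.size(); ++i) {
        if (visited.count(i) > 0) continue;   // already folded into an earlier component

        EmbedQuad_<T> component;              // accumulates a confidence-weighted average
        visit_node(allQuads, i, adjIdxs, component, visited);
        merged.push_back(component);
    }
    return merged;
}
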
217
+
218
+ template<typename T, typename Derived, typename scalar_t>
219
+ void copy_quad(const QuadBase_<T, Derived> &srcQuad, scalar_t *pDest)
220
+ {
221
+ auto vertices = static_cast<const Derived*>(&srcQuad)->Vertices;
222
+ for (size_t i = 0; i < 4; ++i) {
223
+ const Point_<T> &v = vertices[i];
224
+ *pDest++ = v.X;
225
+ *pDest++ = v.Y;
226
+ }
227
+ }
nemo-retriever-ocr/cpp/non_maximal_suppression/nms_kd_tree.h ADDED
@@ -0,0 +1,449 @@
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ // All rights reserved.
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ #pragma once
6
+
7
+ #include <memory>
8
+ #include <vector>
9
+ #include <stack>
10
+
11
+ #include "../geometry.h"
12
+
13
+
14
+ #define MODE_GEOMETRY 0x02ull
15
+ #define MODE_CHILDREN 0x00ull
16
+
17
+ #define DIM_X 0x0ull
18
+ #define DIM_Y 0x1ull
19
+
20
+ static const size_t INVALID_IDX = -1;
21
+
22
+ template<typename T>
23
+ struct NMS_BoundsWrapper
24
+ {
25
+ typedef std::unique_ptr<NMS_BoundsWrapper> Ptr;
26
+ typedef AABB_<typename T::inner_type> bds_t;
27
+
28
+ size_t GeoIdx;
29
+ const T *Geometry;
30
+ bds_t Bounds;
31
+
32
+ NMS_BoundsWrapper(size_t geoIdx, const T *geometry) : GeoIdx(geoIdx), Geometry(geometry), Bounds(geometry->Bounds()) { }
33
+ };
34
+
35
+ template<typename T>
36
+ class NMS_NodeAllocator;
37
+
38
+ template<typename T>
39
+ class NMS_KDTree;
40
+
41
+ template<typename T>
42
+ class NMS_BuildCache;
43
+
44
+ template<typename T>
45
+ class NMS_KDNode
46
+ {
47
+ friend class NMS_KDTree<T>;
48
+
49
+ public:
50
+ typedef NMS_BoundsWrapper<T> bds_t;
51
+ typedef std::unique_ptr<NMS_KDNode[]> UPtr;
52
+ typedef typename T::inner_type inner_type;
53
+ typedef std::vector<bds_t*> geo_vec_t;
54
+ typedef std::unique_ptr<geo_vec_t> geo_vec_ptr;
55
+
56
+ void Build(geo_vec_ptr geometries, const typename bds_t::bds_t &envelope,
57
+ NMS_NodeAllocator<T> &allocator, NMS_BuildCache<T> &buildCache);
58
+
59
+ template<typename Fn>
60
+ void FindIntersections(size_t geoIdx, const typename bds_t::bds_t &bds, const Fn &fn) const;
61
+
62
+ private:
63
+ inline uintptr_t Dim() const { return reinterpret_cast<uintptr_t>(m_ptr) & 0x01ull; }
64
+ inline uintptr_t Mode() const { return reinterpret_cast<uintptr_t>(m_ptr) & 0x02ull; }
65
+ inline void Children(NMS_KDNode *&children, inner_type &splitPos) const
66
+ {
67
+ auto vPtr = Geometries();
68
+ splitPos = *reinterpret_cast<inner_type*>(vPtr);
69
+ children = reinterpret_cast<NMS_KDNode*>(vPtr + sizeof(inner_type));
70
+ }
71
+ inline uint8_t* Geometries() const
72
+ {
73
+ return reinterpret_cast<uint8_t*>(reinterpret_cast<uintptr_t>(m_ptr) & ~0x3ull);
74
+ }
75
+ inline void SetPtr(uint8_t *vPtr, uintptr_t mode, uintptr_t dim)
76
+ {
77
+ m_ptr = reinterpret_cast<uint8_t*>(
78
+ reinterpret_cast<uintptr_t>(vPtr) | mode | dim
79
+ );
80
+ }
81
+ void AssignGeometries(geo_vec_ptr geometries, NMS_BuildCache<T> &buildCache);
82
+
83
+ uint8_t *m_ptr;
84
+ };
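
Editorial sketch (not part of the commit): the node above packs its state into the low two bits of `m_ptr` (bit 0 = split dimension, bit 1 = leaf/geometry mode) and masks them off in `Geometries()`; this relies on the payload pointers being at least 4-byte aligned, which `malloc` guarantees. A minimal round-trip check of the tagging scheme:

#include <cassert>
#include <cstdint>

inline void pointer_tagging_example()
{
    alignas(4) static uint8_t payload[16];

    const uintptr_t tagged = reinterpret_cast<uintptr_t>(payload) | MODE_GEOMETRY | DIM_Y;

    assert((tagged & 0x02ull) == MODE_GEOMETRY);                       // Mode()
    assert((tagged & 0x01ull) == DIM_Y);                               // Dim()
    assert(reinterpret_cast<uint8_t*>(tagged & ~0x3ull) == payload);   // Geometries()
}
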
85
+
86
+ template<typename T>
87
+ class NMS_NodeAllocator
88
+ {
89
+ public:
90
+ typedef NMS_KDNode<T> node_t;
91
+ typedef typename node_t::inner_type inner_type;
92
+
93
+ NMS_NodeAllocator(size_t initialGuess = 512);
94
+ ~NMS_NodeAllocator();
95
+
96
+ void Get(size_t numNodes, NMS_KDNode<T> *&outNodes, inner_type *&outSplitPos, uint8_t *&outRawPtr);
97
+
98
+ private:
99
+ std::vector<std::pair<size_t, uint8_t*>> m_buffers;
100
+ size_t m_offset;
101
+ };
102
+
103
+ template<typename T>
104
+ class NMS_BuildCache
105
+ {
106
+ public:
107
+ typedef typename NMS_KDNode<T>::bds_t bds_t;
108
+ typedef std::unique_ptr<NMS_BuildCache> Ptr;
109
+ typedef std::vector<bds_t*> geo_vec_t;
110
+ typedef std::unique_ptr<geo_vec_t> geo_vec_ptr;
111
+
112
+ NMS_BuildCache(size_t initialSize);
113
+ ~NMS_BuildCache();
114
+
115
+ geo_vec_ptr Get(size_t sizeHint);
116
+ bds_t** GetRawBuffer(size_t numGeos, uint8_t *&rawPtr);
117
+
118
+ void Release(geo_vec_ptr buff);
119
+
120
+ private:
121
+ std::stack<geo_vec_ptr> m_cache;
122
+ std::vector<std::pair<size_t, uint8_t*>> m_rawBuffers;
123
+ size_t m_rawOffset;
124
+ };
125
+
126
+
127
+ template<typename T>
128
+ class NMS_KDTree
129
+ {
130
+ typedef typename T::inner_type inner_type;
131
+ typedef NMS_BoundsWrapper<T> bds_t;
132
+ typedef NMS_KDNode<T> node_t;
133
+
134
+ public:
135
+ NMS_KDTree();
136
+ ~NMS_KDTree();
137
+
138
+ void Build(const std::vector<T> &geometries);
139
+
140
+ template<typename Fn>
141
+ void FindIntersections(size_t geoIdx, const Fn &fn) const;
142
+
143
+ template<typename Fn>
144
+ void FindIntersections(const T &geo, const Fn &fn) const;
145
+
146
+ private:
147
+ bds_t *m_wrappers;
148
+ NMS_NodeAllocator<T> m_allocator;
149
+ node_t m_root;
150
+ typename NMS_BuildCache<T>::Ptr m_buildCache;
151
+ };
152
+
153
+ template<typename T>
154
+ NMS_KDTree<T>::NMS_KDTree()
155
+ : m_wrappers(nullptr)
156
+ {
157
+ m_root.m_ptr = nullptr;
158
+ }
159
+
160
+ template<typename T>
161
+ NMS_KDTree<T>::~NMS_KDTree()
162
+ {
163
+ free(m_wrappers);
164
+ }
165
+
166
+ template<typename T>
167
+ void NMS_KDTree<T>::Build(const std::vector<T> &geometries)
168
+ {
169
+ if (geometries.empty()) {
170
+ m_root.m_ptr = nullptr;
171
+ return;
172
+ }
173
+
174
+ // Doing this so that we can perform placement-new on the array buffer, and thus
175
+ // can only perform a single memory allocation for all geometries at once
176
+ m_wrappers = reinterpret_cast<bds_t*>(malloc(sizeof(bds_t) * geometries.size()));
177
+
178
+ m_buildCache.reset(new NMS_BuildCache<T>(geometries.size()));
179
+
180
+ auto bdsGeos = m_buildCache->Get(geometries.size());
181
+
182
+ typename bds_t::bds_t envelope;
183
+
184
+ for (size_t i = 0; i < geometries.size(); ++i) {
185
+ // Placement new. Constructs the object in the place specified in the first (...)
186
+ new (m_wrappers + i) bds_t(i, &geometries[i]);
187
+
188
+ bdsGeos->push_back(m_wrappers + i);
189
+ if (i == 0) {
190
+ envelope = m_wrappers[i].Bounds;
191
+ } else {
192
+ envelope = envelope.Union(m_wrappers[i].Bounds);
193
+ }
194
+ }
195
+
196
+
197
+ m_root.Build(std::move(bdsGeos), envelope, m_allocator, *m_buildCache);
198
+ }
199
+
200
+ template<typename T>
201
+ void NMS_KDNode<T>::Build(geo_vec_ptr geometries, const typename bds_t::bds_t &envelope,
202
+ NMS_NodeAllocator<T> &allocator, NMS_BuildCache<T> &buildCache)
203
+ {
204
+ static const size_t MAX_GEOMETRIES = 8;
205
+
206
+ if (geometries->size() <= MAX_GEOMETRIES) {
207
+ AssignGeometries(std::move(geometries), buildCache);
208
+ } else {
209
+ geo_vec_ptr leftGeos = buildCache.Get(geometries->size()),
210
+ rightGeos = buildCache.Get(geometries->size());
211
+
212
+ inner_type szX = envelope[2] - envelope[0];
213
+ inner_type szY = envelope[3] - envelope[1];
214
+
215
+ int64_t dim = szX > szY ? 0 : 1;
216
+ auto emn = envelope[dim];
217
+ auto emx = envelope[dim + 2];
218
+
219
+ auto pivotPos = (emn + emx) / 2;
220
+ for (bds_t *g : *geometries) {
221
+ auto mn = g->Bounds[dim];
222
+ auto mx = g->Bounds[dim + 2];
223
+
224
+ if (mn < pivotPos) {
225
+ leftGeos->push_back(g);
226
+ }
227
+ if (mx > pivotPos) {
228
+ rightGeos->push_back(g);
229
+ }
230
+ }
231
+
232
+ if (leftGeos->size() == geometries->size() || rightGeos->size() == geometries->size()) {
233
+ AssignGeometries(std::move(geometries), buildCache);
234
+ buildCache.Release(std::move(leftGeos));
235
+ buildCache.Release(std::move(rightGeos));
236
+ } else {
237
+ buildCache.Release(std::move(geometries));
238
+
239
+ inner_type *nodeSplitPos;
240
+ uint8_t *nodeRawPtr;
241
+ NMS_KDNode *children;
242
+ allocator.Get(2, children, nodeSplitPos, nodeRawPtr);
243
+
244
+ SetPtr(nodeRawPtr, MODE_CHILDREN, dim);
245
+ *nodeSplitPos = pivotPos;
246
+
247
+ typename bds_t::bds_t leftEnv{envelope}, rightEnv{envelope};
248
+ // Set the max of the left envelope to the split plane
249
+ leftEnv[dim + 2] = pivotPos;
250
+ // Set the min of the right envelope to the split plane
251
+ rightEnv[dim] = pivotPos;
252
+
253
+ children[0].Build(std::move(leftGeos), leftEnv, allocator, buildCache);
254
+ children[1].Build(std::move(rightGeos), rightEnv, allocator, buildCache);
255
+ }
256
+ }
257
+ }
258
+
259
+ template<typename T>
260
+ void NMS_KDNode<T>::AssignGeometries(geo_vec_ptr geometries, NMS_BuildCache<T> &buildCache)
261
+ {
262
+ if (geometries->empty()) {
263
+ SetPtr(nullptr, MODE_GEOMETRY, 0);
264
+ } else {
265
+ uint8_t *vPtr;
266
+ bds_t **geoPtr = buildCache.GetRawBuffer(geometries->size(), vPtr);
267
+ std::copy(geometries->begin(), geometries->end(), geoPtr);
268
+
269
+ SetPtr(vPtr, MODE_GEOMETRY, 0);
270
+ }
271
+ buildCache.Release(std::move(geometries));
272
+ }
273
+
274
+ template<typename T>
275
+ template<typename Fn>
276
+ void NMS_KDTree<T>::FindIntersections(size_t geoIdx, const Fn &fn) const
277
+ {
278
+ if (!m_wrappers) return;
279
+
280
+ auto &bds = m_wrappers[geoIdx].Bounds;
281
+
282
+ m_root.FindIntersections(geoIdx, bds, fn);
283
+ }
284
+
285
+ template<typename T>
286
+ template<typename Fn>
287
+ void NMS_KDTree<T>::FindIntersections(const T &geo, const Fn &fn) const
288
+ {
289
+ if (!m_wrappers) return;
290
+
291
+ NMS_BoundsWrapper<T> bdsWrapper(INVALID_IDX, &geo);
292
+
293
+ m_root.FindIntersections(INVALID_IDX, bdsWrapper.Bounds, fn);
294
+ }
295
+
296
+ template<typename T>
297
+ template<typename Fn>
298
+ void NMS_KDNode<T>::FindIntersections(size_t geoIdx, const typename bds_t::bds_t &bds, const Fn &fn) const
299
+ {
300
+ auto mode = Mode();
301
+
302
+ if (mode == MODE_GEOMETRY) {
303
+ auto *vPtr = Geometries();
304
+
305
+ size_t numGeos = *reinterpret_cast<size_t*>(vPtr);
306
+ bds_t **geoPtr = reinterpret_cast<bds_t**>(vPtr + sizeof(size_t));
307
+
308
+ bds_t **endPtr = geoPtr + numGeos;
309
+ for (; geoPtr != endPtr; ++geoPtr) {
310
+ const bds_t *child = *geoPtr;
311
+
312
+ // Don't compute this against self
313
+ if (geoIdx != INVALID_IDX && child->GeoIdx <= geoIdx) continue;
314
+
315
+ typename bds_t::bds_t::inner_type pctN, pctM, iou;
316
+ std::tie(pctN, pctM, iou) = geometry_region_sizes(bds, child->Bounds);
317
+
318
+ if (iou > 0) {
319
+ fn(child->GeoIdx, pctN, pctM, iou);
320
+ }
321
+ }
322
+ } else {
323
+ auto dim = Dim();
324
+
325
+ auto mn = bds[dim];
326
+ auto mx = bds[dim + 2];
327
+
328
+ NMS_KDNode *children;
329
+ inner_type splitPos;
330
+ Children(children, splitPos);
331
+
332
+ if (mn < splitPos) {
333
+ children[0].FindIntersections(geoIdx, bds, fn);
334
+ }
335
+ if (mx > splitPos) {
336
+ children[1].FindIntersections(geoIdx, bds, fn);
337
+ }
338
+ }
339
+ }
340
+
341
+ template<typename T>
342
+ NMS_NodeAllocator<T>::NMS_NodeAllocator(size_t initialGuess)
343
+ : m_offset(0)
344
+ {
345
+ size_t allocSize = initialGuess * (sizeof(inner_type) + 2 * sizeof(node_t));
346
+ auto ptr = reinterpret_cast<uint8_t*>(malloc(allocSize));
347
+ m_buffers.emplace_back(allocSize, ptr);
348
+ }
349
+
350
+ template<typename T>
351
+ NMS_NodeAllocator<T>::~NMS_NodeAllocator()
352
+ {
353
+ for (auto &p : m_buffers) {
354
+ free(p.second);
355
+ }
356
+ }
357
+
358
+ template<typename T>
359
+ void NMS_NodeAllocator<T>::Get(size_t numNodes, node_t *&outNodes, inner_type *&outSplitPos, uint8_t *&outRawPtr)
360
+ {
361
+ auto &currBuff = m_buffers.back();
362
+
363
+ size_t rem = currBuff.first - m_offset;
364
+
365
+ size_t reqSize = sizeof(inner_type) + sizeof(node_t) * numNodes;
366
+
367
+ if (rem >= reqSize) {
368
+ outRawPtr = currBuff.second + m_offset;
369
+ outSplitPos = reinterpret_cast<inner_type*>(outRawPtr);
370
+ outNodes = reinterpret_cast<node_t*>(outRawPtr + sizeof(inner_type));
371
+ m_offset += reqSize;
372
+ return;
373
+ }
374
+
375
+ // Rounds up to the nearest multiple of 2
376
+ size_t allocSize = (std::max(currBuff.first * 2, reqSize) + 1) & ~0x01ull;
377
+ auto ptr = reinterpret_cast<uint8_t*>(malloc(allocSize));
378
+ m_buffers.emplace_back(allocSize, ptr);
379
+ m_offset = 0;
380
+
381
+ Get(numNodes, outNodes, outSplitPos, outRawPtr);
382
+ }
383
+
384
+ template<typename T>
385
+ NMS_BuildCache<T>::NMS_BuildCache(size_t initialSize)
386
+ : m_rawOffset(0)
387
+ {
388
+ auto allocSize = sizeof(bds_t*) * initialSize * 2;
389
+ auto raw1 = reinterpret_cast<uint8_t*>(malloc(allocSize));
390
+ m_rawBuffers.emplace_back(allocSize, raw1);
391
+ }
392
+
393
+ template<typename T>
394
+ NMS_BuildCache<T>::~NMS_BuildCache()
395
+ {
396
+ for (auto &p : m_rawBuffers) {
397
+ free(p.second);
398
+ }
399
+ }
400
+
401
+ template<typename T>
402
+ typename NMS_BuildCache<T>::geo_vec_ptr NMS_BuildCache<T>::Get(size_t sizeHint)
403
+ {
404
+ geo_vec_ptr ret;
405
+ if (! m_cache.empty()) {
406
+ ret = std::move(m_cache.top());
407
+ m_cache.pop();
408
+ ret->clear();
409
+ } else {
410
+ ret.reset(new std::vector<bds_t*>);
411
+ }
412
+
413
+ ret->reserve(sizeHint);
414
+ return ret;
415
+ }
416
+
417
+ template<typename T>
418
+ typename NMS_BuildCache<T>::bds_t** NMS_BuildCache<T>::GetRawBuffer(size_t numGeos, uint8_t *&rawPtr)
419
+ {
420
+ auto &currBuff = m_rawBuffers.back();
421
+ size_t rem = currBuff.first - m_rawOffset;
422
+
423
+ size_t reqSize = sizeof(size_t) + sizeof(bds_t*) * numGeos;
424
+
425
+ if (rem >= reqSize) {
426
+ rawPtr = currBuff.second + m_rawOffset;
427
+ m_rawOffset += reqSize;
428
+ reinterpret_cast<size_t*>(rawPtr)[0] = numGeos;
429
+ return reinterpret_cast<bds_t**>(rawPtr + sizeof(size_t));
430
+ }
431
+
432
+ size_t allocSize = (std::max(currBuff.first * 2, reqSize) + 1) & ~0x01ull;
433
+ auto ptr = reinterpret_cast<uint8_t*>(malloc(allocSize));
434
+ m_rawBuffers.emplace_back(allocSize, ptr);
435
+ m_rawOffset = 0;
436
+
437
+ return GetRawBuffer(numGeos, rawPtr);
438
+ }
439
+
440
+ template<typename T>
441
+ void NMS_BuildCache<T>::Release(geo_vec_ptr buff)
442
+ {
443
+ m_cache.push(std::move(buff));
444
+ }
445
+
446
+ #undef MODE_GEOMETRY
447
+ #undef MODE_CHILDREN
448
+ #undef DIM_X
449
+ #undef DIM_Y