NVIDIA · mdboom · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/.gitignore b/.gitignore
@@ -20,23 +20,16 @@ cache_runtime
 cache_nvrtc
 
 # CUDA Python specific (auto-generated)
-cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd
-cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx
-cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd
-cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx
+cuda_bindings/cuda/bindings/_internal/cudla.pyx
 cuda_bindings/cuda/bindings/_internal/driver.pyx
 cuda_bindings/cuda/bindings/_internal/nvrtc.pyx
 cuda_bindings/cuda/bindings/_internal/cufile.pyx
 cuda_bindings/cuda/bindings/_internal/nvfatbin.pyx
 cuda_bindings/cuda/bindings/_internal/nvjitlink.pyx
 cuda_bindings/cuda/bindings/_internal/nvml.pyx
 cuda_bindings/cuda/bindings/_internal/nvvm.pyx
-cuda_bindings/cuda/bindings/cyruntime.pxd
-cuda_bindings/cuda/bindings/cyruntime.pyx
-cuda_bindings/cuda/bindings/cyruntime_functions.pxi
-cuda_bindings/cuda/bindings/cyruntime_types.pxi
-cuda_bindings/cuda/bindings/runtime.pxd
-cuda_bindings/cuda/bindings/runtime.pyx
+cuda_bindings/cuda/bindings/_internal/runtime.pyx
+cuda_bindings/cuda/bindings/_internal/runtime_ptds.pyx
 cuda_bindings/cuda/bindings/utils/_get_handle.pyx
 
 # Version files from setuptools_scm

diff --git a/cuda_bindings/AGENTS.md b/cuda_bindings/AGENTS.md
@@ -18,21 +18,18 @@ subpackage in the `cuda-python` monorepo.
   glue and loader helpers used by public modules.
 - **Platform internals**: `cuda/bindings/_internal/` contains
   platform-specific implementation files and support code.
-- **Build/codegen backend**: `build_hooks.py` drives header parsing, template
-  expansion, extension configuration, and Cythonization.
+- **Build backend**: `build_hooks.py` drives extension configuration and
+  Cythonization.
 
 ## Generated-source workflow
 
 - **Do not hand-edit generated binding files**: many files under
-  `cuda/bindings/` (including `*.pyx`, `*.pxd`, `*.pyx.in`, and `*.pxd.in`)
-  are generated artifacts.
+  `cuda/bindings/` (including `*.pyx` and `*.pxd`) are generated artifacts.
 - **Generated files are synchronized from another repository**: changes to these
   files in this repo are expected to be overwritten by the next sync.
 - **If generated output must change**: make the change at the generation source
   and sync the updated artifacts back here, rather than patching generated files
   directly in this repo.
-- **Header-driven generation**: parser behavior and required CUDA headers are
-  defined in `build_hooks.py`; update those rules when introducing new symbols.
 - **Platform split files**: keep `_linux.pyx` and `_windows.pyx` variants
   aligned when behavior should be equivalent.
 
@@ -49,9 +46,8 @@ subpackage in the `cuda-python` monorepo.
 ## Build and environment notes
 
 - `CUDA_HOME` or `CUDA_PATH` must point to a valid CUDA Toolkit for source
-  builds that parse headers.
+  builds.
 - `CUDA_PYTHON_PARALLEL_LEVEL` controls build parallelism.
-- `CUDA_PYTHON_PARSER_CACHING` controls parser-cache behavior during generation.
 - Runtime behavior is affected by
   `CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM` and
   `CUDA_PYTHON_DISABLE_MAJOR_VERSION_WARNING`.

diff --git a/cuda_bindings/build_hooks.py b/cuda_bindings/build_hooks.py
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 # This module implements basic PEP 517 backend support to defer CUDA-dependent
-# logic (header parsing, code generation, cythonization) to build time. See:
+# logic (cythonization) to build time. See:
 # - https://peps.python.org/pep-0517/
 # - https://setuptools.pypa.io/en/latest/build_meta.html#dynamic-build-dependencies-and-other-build-meta-tweaks
 # - https://github.com/NVIDIA/cuda-python/issues/1635
@@ -74,207 +74,6 @@ def _get_cuda_path() -> str:
     return cuda_path
 
 
-# -----------------------------------------------------------------------
-# Header parsing helpers (called only from _build_cuda_bindings)
-
-_REQUIRED_HEADERS = {
-    "runtime": [
-        "driver_types.h",
-        "vector_types.h",
-        "cuda_runtime.h",
-        "surface_types.h",
-        "texture_types.h",
-        "library_types.h",
-        "cuda_runtime_api.h",
-        "device_types.h",
-        "driver_functions.h",
-        "cuda_profiler_api.h",
-    ],
-    # nvrtc: headers no longer parsed at build time (pre-generated by cybind).
-    # During compilation, Cython will reference C headers that are not
-    # explicitly parsed above. These are the known dependencies:
-    #
-    # - crt/host_defines.h
-    # - builtin_types.h
-    # - cuda_device_runtime_api.h
-}
-
-
-class _Struct:
-    def __init__(self, name, members):
-        self._name = name
-        self._member_names = []
-        self._member_types = []
-        self._member_declarators = []
-        for var_name, var_type, _ in members:
-            base_type = var_type[0]
-            base_type = base_type.removeprefix("struct ")
-            base_type = base_type.removeprefix("union ")
-
-            self._member_names += [var_name]
-            self._member_types += [base_type]
-            self._member_declarators += [tuple(var_type[1:])]
-
-    def member_type(self, member_name):
-        try:
-            return self._member_types[self._member_names.index(member_name)]
-        except ValueError:
-            return None
-
-    def member_array_length(self, member_name):
-        try:
-            declarators = self._member_declarators[self._member_names.index(member_name)]
-        except ValueError:
-            return None
-
-        for declarator in declarators:
-            if isinstance(declarator, list) and len(declarator) == 1:
-                return declarator[0]
-        return None
-
-    def discoverMembers(self, memberDict, prefix, seen=None):
-        if seen is None:
-            seen = set()
-        elif self._name in seen:
-            return []
-
-        discovered = []
-        next_seen = set(seen)
-        next_seen.add(self._name)
-
-        for memberName, memberType in zip(self._member_names, self._member_types):
-            if memberName:
-                discovered.append(".".join([prefix, memberName]))
-
-            t = memberType.replace("const ", "").replace("volatile ", "").strip().rstrip(" *")
-            if t in memberDict and t != self._name:
-                discovered += memberDict[t].discoverMembers(
-                    memberDict, discovered[-1] if memberName else prefix, next_seen
-                )
-
-        return discovered
-
-    def __repr__(self):
-        return f"{self._name}: {self._member_names} with types {self._member_types}"
-
-
-def _fetch_header_paths(required_headers, include_path_list):
-    header_dict = {}
-    missing_headers = []
-    for library, header_list in required_headers.items():
-        header_paths = []
-        for header in header_list:
-            path_candidate = [os.path.join(path, header) for path in include_path_list]
-            for path in path_candidate:
-                if os.path.exists(path):
-                    header_paths += [path]
-                    break
-            else:
-                missing_headers += [header]
-
-        header_dict[library] = header_paths
-
-    if missing_headers:
-        error_message = "Couldn't find required headers: "
-        error_message += ", ".join(missing_headers)
-        cuda_path = _get_cuda_path()
-        raise RuntimeError(f'{error_message}\nIs CUDA_PATH setup correctly? (CUDA_PATH="{cuda_path}")')
-
-    return header_dict
-
-
-def _parse_headers(header_dict, include_path_list, parser_caching):
-    from pyclibrary import CParser
-
-    found_types = []
-    found_functions = []
-    found_values = []
-    found_struct = []
-    struct_list = {}
-
-    replace = {
-        " __device_builtin__ ": " ",
-        "CUDARTAPI ": " ",
-        "typedef __device_builtin__ enum cudaError cudaError_t;": "typedef cudaError cudaError_t;",
-        "typedef __device_builtin__ enum cudaOutputMode cudaOutputMode_t;": "typedef cudaOutputMode cudaOutputMode_t;",
-        "typedef enum cudaError cudaError_t;": "typedef cudaError cudaError_t;",
-        "typedef enum cudaOutputMode cudaOutputMode_t;": "typedef cudaOutputMode cudaOutputMode_t;",
-        "typedef enum cudaDataType_t cudaDataType_t;": "",
-        "typedef enum libraryPropertyType_t libraryPropertyType_t;": "",
-        "  enum ": "   ",
-        ", enum ": ", ",
-        "\\(enum ": "(",
-        # Since we only support 64 bit architectures, we can inline the sizeof(T*) to 8 and then compute the
-        # result in Python. The arithmetic expression is preserved to help with clarity and understanding
-        r"char reserved\[52 - sizeof\(CUcheckpointGpuPair \*\)\];": rf"char reserved[{52 - 8}];",
-        r"char reserved\[64 - sizeof\(CUcheckpointGpuPair \*\) - sizeof\(unsigned int\)\];": rf"char reserved[{64 - 8 - 4}];",
-    }
-
-    print(f'Parsing headers in "{include_path_list}" (Caching = {parser_caching})', flush=True)
-    for library, header_paths in header_dict.items():
-        print(f"Parsing {library} headers", flush=True)
-        parser = CParser(
-            header_paths, cache="./cache_{}".format(library.split(".")[0]) if parser_caching else None, replace=replace
-        )
-
-        if library == "driver":
-            CUDA_VERSION = parser.defs["macros"].get("CUDA_VERSION", "Unknown")
-            print(f"Found CUDA_VERSION: {CUDA_VERSION}", flush=True)
-
-        found_types += set(parser.defs["types"])
-        found_types += set(parser.defs["structs"])
-        found_types += set(parser.defs["unions"])
-        found_types += set(parser.defs["enums"])
-        found_functions += set(parser.defs["functions"])
-        found_values += set(parser.defs["values"])
-
-        for key, value in parser.defs["structs"].items():
-            struct_list[key] = _Struct(key, value["members"])
-        for key, value in parser.defs["unions"].items():
-            struct_list[key] = _Struct(key, value["members"])
-
-        for key, value in struct_list.items():
-            if key.startswith(("anon_union", "anon_struct")):
-                continue
-
-            found_struct += [key]
-            discovered = value.discoverMembers(struct_list, key)
-            if discovered:
-                found_struct += discovered
-
-    # TODO(#1312): make this work properly
-    found_types.append("CUstreamAtomicReductionDataType_enum")
-
-    return found_types, found_functions, found_values, found_struct, struct_list
-
-
-# -----------------------------------------------------------------------
-# Code generation helpers
-
-
-def _fetch_input_files(path):
-    return [os.path.join(path, f) for f in os.listdir(path) if f.endswith(".in")]
-
-
-def _generate_output(infile, template_vars):
-    from Cython import Tempita
-
-    assert infile.endswith(".in")
-    outfile = infile[:-3]
-
-    with open(infile, encoding="utf-8") as f:
-        pxdcontent = Tempita.Template(f.read()).substitute(template_vars)
-
-    if os.path.exists(outfile):
-        with open(outfile, encoding="utf-8") as f:
-            if f.read() == pxdcontent:
-                print(f"Skipping {infile} (No change)", flush=True)
-                return
-    with open(outfile, "w", encoding="utf-8") as f:
-        print(f"Generating {infile}", flush=True)
-        f.write(pxdcontent)
-
-
 # -----------------------------------------------------------------------
 # Extension preparation helpers
 
@@ -328,9 +127,8 @@ def _prep_extensions(sources, libraries, include_dirs, library_dirs, extra_compi
 def _build_cuda_bindings(debug=False):
     """Build all cuda-bindings extensions.
 
-    All CUDA-dependent logic (header parsing, code generation, cythonization)
-    is deferred to this function so that metadata queries do not require a
-    CUDA toolkit installation.
+    All CUDA-dependent logic (cythonization) is deferred to this function so
+    that metadata queries do not require a CUDA toolkit installation.
     """
     from Cython.Build import cythonize
 
@@ -348,54 +146,10 @@ def _build_cuda_bindings(debug=False):
     else:
         nthreads = int(os.environ.get("CUDA_PYTHON_PARALLEL_LEVEL", "0") or "0")
 
-    parser_caching = bool(os.environ.get("CUDA_PYTHON_PARSER_CACHING", False))
     compile_for_coverage = bool(int(os.environ.get("CUDA_PYTHON_COVERAGE", "0")))
 
-    # Parse CUDA headers
-    include_path_list = [os.path.join(cuda_path, "include")]
-    header_dict = _fetch_header_paths(_REQUIRED_HEADERS, include_path_list)
-    found_types, found_functions, found_values, found_struct, struct_list = _parse_headers(
-        header_dict, include_path_list, parser_caching
-    )
-    struct_field_types = {}
-    struct_field_array_lengths = {}
-    for struct_name, struct in struct_list.items():
-        for member_name in struct._member_names:
-            key = f"{struct_name}.{member_name}"
-            struct_field_types[key] = struct.member_type(member_name)
-            struct_field_array_lengths[key] = struct.member_array_length(member_name)
-
-    # Generate code from .in templates
-    path_list = [
-        os.path.join("cuda"),
-        os.path.join("cuda", "bindings"),
-        os.path.join("cuda", "bindings", "_bindings"),
-        os.path.join("cuda", "bindings", "_internal"),
-        os.path.join("cuda", "bindings", "_lib"),
-        os.path.join("cuda", "bindings", "utils"),
-    ]
-    input_files = []
-    for path in path_list:
-        input_files += _fetch_input_files(path)
-
-    import platform
-
-    template_vars = {
-        "found_types": found_types,
-        "found_functions": found_functions,
-        "found_values": found_values,
-        "found_struct": found_struct,
-        "struct_list": struct_list,
-        "struct_field_types": struct_field_types,
-        "struct_field_array_lengths": struct_field_array_lengths,
-        "os": os,
-        "sys": sys,
-        "platform": platform,
-    }
-    for file in input_files:
-        _generate_output(file, template_vars)
-
     # Prepare compile/link arguments
+    include_path_list = [os.path.join(cuda_path, "include")]
     include_dirs = [
         os.path.dirname(sysconfig.get_path("include")),
     ] + include_path_list
@@ -439,21 +193,26 @@ def _cleanup_dst_files():
 
     # Build extension list
     extensions = []
-    static_runtime_libraries = ["cudart_static", "rt"] if sys.platform == "linux" else ["cudart_static"]
     cuda_bindings_files = glob.glob("cuda/bindings/*.pyx")
     if sys.platform == "win32":
         cuda_bindings_files = [f for f in cuda_bindings_files if "cufile" not in f]
+
+    def get_static_libraries(f):
+        # Only the runtime bindings link the static CUDA runtime; others get None.
+        if os.path.basename(f) not in ("runtime.pyx", "runtime_ptds.pyx"):
+            return None
+        # On Linux the "rt" library is listed alongside "cudart_static".
+        extra = ["rt"] if sys.platform == "linux" else []
+        return ["cudart_static", *extra]
+
     sources_list = [
-        # private
-        (["cuda/bindings/_bindings/cyruntime.pyx"], static_runtime_libraries),
-        (["cuda/bindings/_bindings/cyruntime_ptds.pyx"], static_runtime_libraries),
         # utils
         (["cuda/bindings/utils/*.pyx"], None),
         # public
         *(([f], None) for f in cuda_bindings_files),
         # internal files used by generated bindings
         (["cuda/bindings/_internal/utils.pyx"], None),
-        *(([f], None) for f in dst_files if f.endswith(".pyx")),
+        *(([f], get_static_libraries(f)) for f in dst_files if f.endswith(".pyx")),
     ]
 
     for sources, libraries in sources_list:

diff --git a/cuda_bindings/cuda/bindings/_bindings/__init__.py b/cuda_bindings/cuda/bindings/_bindings/__init__.py