Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 3 additions & 10 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,16 @@ cache_runtime
cache_nvrtc

# CUDA Python specific (auto-generated)
cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd
cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx
cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd
cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx
cuda_bindings/cuda/bindings/_internal/cudla.pyx
cuda_bindings/cuda/bindings/_internal/driver.pyx
cuda_bindings/cuda/bindings/_internal/nvrtc.pyx
cuda_bindings/cuda/bindings/_internal/cufile.pyx
cuda_bindings/cuda/bindings/_internal/nvfatbin.pyx
cuda_bindings/cuda/bindings/_internal/nvjitlink.pyx
cuda_bindings/cuda/bindings/_internal/nvml.pyx
cuda_bindings/cuda/bindings/_internal/nvvm.pyx
cuda_bindings/cuda/bindings/cyruntime.pxd
cuda_bindings/cuda/bindings/cyruntime.pyx
cuda_bindings/cuda/bindings/cyruntime_functions.pxi
cuda_bindings/cuda/bindings/cyruntime_types.pxi
cuda_bindings/cuda/bindings/runtime.pxd
cuda_bindings/cuda/bindings/runtime.pyx
cuda_bindings/cuda/bindings/_internal/runtime.pyx
cuda_bindings/cuda/bindings/_internal/runtime_ptds.pyx
cuda_bindings/cuda/bindings/utils/_get_handle.pyx

# Version files from setuptools_scm
Expand Down
12 changes: 4 additions & 8 deletions cuda_bindings/AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,18 @@ subpackage in the `cuda-python` monorepo.
glue and loader helpers used by public modules.
- **Platform internals**: `cuda/bindings/_internal/` contains
platform-specific implementation files and support code.
- **Build/codegen backend**: `build_hooks.py` drives header parsing, template
expansion, extension configuration, and Cythonization.
- **Build backend**: `build_hooks.py` drives extension configuration and
Cythonization.

## Generated-source workflow

- **Do not hand-edit generated binding files**: many files under
`cuda/bindings/` (including `*.pyx`, `*.pxd`, `*.pyx.in`, and `*.pxd.in`)
are generated artifacts.
`cuda/bindings/` (including `*.pyx` and `*.pxd`) are generated artifacts.
- **Generated files are synchronized from another repository**: changes to these
files in this repo are expected to be overwritten by the next sync.
- **If generated output must change**: make the change at the generation source
and sync the updated artifacts back here, rather than patching generated files
directly in this repo.
- **Header-driven generation**: parser behavior and required CUDA headers are
defined in `build_hooks.py`; update those rules when introducing new symbols.
- **Platform split files**: keep `_linux.pyx` and `_windows.pyx` variants
aligned when behavior should be equivalent.

Expand All @@ -49,9 +46,8 @@ subpackage in the `cuda-python` monorepo.
## Build and environment notes

- `CUDA_HOME` or `CUDA_PATH` must point to a valid CUDA Toolkit for source
builds that parse headers.
builds.
- `CUDA_PYTHON_PARALLEL_LEVEL` controls build parallelism.
- `CUDA_PYTHON_PARSER_CACHING` controls parser-cache behavior during generation.
- Runtime behavior is affected by
`CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM` and
`CUDA_PYTHON_DISABLE_MAJOR_VERSION_WARNING`.
Expand Down
269 changes: 14 additions & 255 deletions cuda_bindings/build_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

# This module implements basic PEP 517 backend support to defer CUDA-dependent
# logic (header parsing, code generation, cythonization) to build time. See:
# logic (cythonization) to build time. See:
# - https://peps.python.org/pep-0517/
# - https://setuptools.pypa.io/en/latest/build_meta.html#dynamic-build-dependencies-and-other-build-meta-tweaks
# - https://github.com/NVIDIA/cuda-python/issues/1635
Expand Down Expand Up @@ -74,207 +74,6 @@ def _get_cuda_path() -> str:
return cuda_path


# -----------------------------------------------------------------------
# Header parsing helpers (called only from _build_cuda_bindings)

# Headers that are parsed at build time, keyed by library name. Each header
# is looked up in the include directories handed to the header-lookup helper;
# a header that cannot be found there aborts the build with a RuntimeError.
_REQUIRED_HEADERS = {
    "runtime": [
        "driver_types.h",
        "vector_types.h",
        "cuda_runtime.h",
        "surface_types.h",
        "texture_types.h",
        "library_types.h",
        "cuda_runtime_api.h",
        "device_types.h",
        "driver_functions.h",
        "cuda_profiler_api.h",
    ],
    # nvrtc: headers no longer parsed at build time (pre-generated by cybind).
    #
    # Separately from the parsed headers listed above: during compilation,
    # Cython will also reference C headers that are not explicitly parsed.
    # These are the known dependencies:
    #
    # - crt/host_defines.h
    # - builtin_types.h
    # - cuda_device_runtime_api.h
}


class _Struct:
    """Names, base types and declarators of the members of one struct/union.

    ``members`` is an iterable of 3-item entries ``(name, type, _)``.
    ``type[0]`` is the base type string (a leading ``"struct "`` / ``"union "``
    keyword is dropped) and ``type[1:]`` are the declarators, stored as a
    tuple. The three ``_member_*`` lists are index-aligned.
    """

    def __init__(self, name, members):
        self._name = name
        self._member_names = []
        self._member_types = []
        self._member_declarators = []
        for field_name, field_type, _unused in members:
            type_name = field_type[0]
            # Strip the aggregate keyword so the type can be used as a lookup key.
            for keyword in ("struct ", "union "):
                type_name = type_name.removeprefix(keyword)

            self._member_names.append(field_name)
            self._member_types.append(type_name)
            self._member_declarators.append(tuple(field_type[1:]))

    def member_type(self, member_name):
        """Return the base type of ``member_name``, or ``None`` if unknown."""
        if member_name not in self._member_names:
            return None
        return self._member_types[self._member_names.index(member_name)]

    def member_array_length(self, member_name):
        """Return the first single-element list declarator's value, else ``None``.

        ``None`` is returned both for an unknown member and for a member that
        has no declarator of that shape.
        """
        if member_name not in self._member_names:
            return None
        position = self._member_names.index(member_name)
        return next(
            (
                entry[0]
                for entry in self._member_declarators[position]
                if isinstance(entry, list) and len(entry) == 1
            ),
            None,
        )

    def discoverMembers(self, memberDict, prefix, seen=None):
        """Return dotted paths of all members reachable from this aggregate.

        ``memberDict`` maps type names to ``_Struct`` objects, ``prefix`` is the
        dotted path of this aggregate, and ``seen`` holds the names already on
        the current descent so a type cycle terminates with ``[]``.
        """
        if seen is not None and self._name in seen:
            return []

        visited = {self._name} if seen is None else set(seen) | {self._name}
        found = []

        for field_name, field_type in zip(self._member_names, self._member_types):
            # Unnamed members contribute no path of their own; their nested
            # members hang directly off the current prefix.
            parent = prefix
            if field_name:
                parent = ".".join([prefix, field_name])
                found.append(parent)

            bare = field_type.replace("const ", "").replace("volatile ", "").strip().rstrip(" *")
            if bare == self._name or bare not in memberDict:
                continue
            found.extend(memberDict[bare].discoverMembers(memberDict, parent, visited))

        return found

    def __repr__(self):
        return f"{self._name}: {self._member_names} with types {self._member_types}"


def _fetch_header_paths(required_headers, include_path_list):
    """Resolve every required header to a file inside the include directories.

    Args:
        required_headers: Mapping of library name to a list of header file
            names.
        include_path_list: Directories searched in order; the first directory
            containing a header wins.

    Returns:
        A dict mapping each library name to the list of resolved header paths,
        in the order the headers were listed.

    Raises:
        RuntimeError: If at least one header is not found in any directory.
            The message lists all missing headers and the CUDA path reported
            by ``_get_cuda_path()``.
    """
    header_dict = {}
    missing_headers = []
    for library, header_list in required_headers.items():
        header_paths = []
        for header in header_list:
            for include_dir in include_path_list:
                candidate = os.path.join(include_dir, header)
                # isfile rather than exists: a directory that happens to carry
                # the header's name is not something the parser can read.
                if os.path.isfile(candidate):
                    header_paths.append(candidate)
                    break
            else:
                # No include directory provided this header.
                missing_headers.append(header)

        header_dict[library] = header_paths

    if missing_headers:
        error_message = "Couldn't find required headers: " + ", ".join(missing_headers)
        cuda_path = _get_cuda_path()
        raise RuntimeError(f'{error_message}\nIs CUDA_PATH setup correctly? (CUDA_PATH="{cuda_path}")')

    return header_dict


# Parse the headers of every library in `header_dict` with pyclibrary's CParser
# and collect the names it reports.
#
# Returns a 5-tuple (found_types, found_functions, found_values, found_struct,
# struct_list); struct_list maps a struct/union name to its _Struct.
# `include_path_list` is only used in the progress message printed below.
def _parse_headers(header_dict, include_path_list, parser_caching):
# Imported here rather than at module level, so pyclibrary is only needed when
# this function actually runs.
from pyclibrary import CParser

found_types = []
found_functions = []
found_values = []
found_struct = []
struct_list = {}

# Text substitutions handed to CParser through `replace=`.
# NOTE(review): the escaped keys below (`\\(enum `, `reserved\[...\]`) suggest
# the keys are treated as regular expressions -- confirm against pyclibrary.
replace = {
" __device_builtin__ ": " ",
"CUDARTAPI ": " ",
"typedef __device_builtin__ enum cudaError cudaError_t;": "typedef cudaError cudaError_t;",
"typedef __device_builtin__ enum cudaOutputMode cudaOutputMode_t;": "typedef cudaOutputMode cudaOutputMode_t;",
"typedef enum cudaError cudaError_t;": "typedef cudaError cudaError_t;",
"typedef enum cudaOutputMode cudaOutputMode_t;": "typedef cudaOutputMode cudaOutputMode_t;",
"typedef enum cudaDataType_t cudaDataType_t;": "",
"typedef enum libraryPropertyType_t libraryPropertyType_t;": "",
" enum ": " ",
", enum ": ", ",
"\\(enum ": "(",
# Since we only support 64 bit architectures, we can inline the sizeof(T*) to 8 and then compute the
# result in Python. The arithmetic expression is preserved to help with clarity and understanding
r"char reserved\[52 - sizeof\(CUcheckpointGpuPair \*\)\];": rf"char reserved[{52 - 8}];",
r"char reserved\[64 - sizeof\(CUcheckpointGpuPair \*\) - sizeof\(unsigned int\)\];": rf"char reserved[{64 - 8 - 4}];",
}

print(f'Parsing headers in "{include_path_list}" (Caching = {parser_caching})', flush=True)
for library, header_paths in header_dict.items():
print(f"Parsing {library} headers", flush=True)
# The cache path is "./cache_<library name up to its first '.'>" when
# parser_caching is truthy; otherwise None is passed as the cache argument.
parser = CParser(
header_paths, cache="./cache_{}".format(library.split(".")[0]) if parser_caching else None, replace=replace
)

# Only the "driver" library has its CUDA_VERSION macro reported; the value is
# printed and not used otherwise in this function.
if library == "driver":
CUDA_VERSION = parser.defs["macros"].get("CUDA_VERSION", "Unknown")
print(f"Found CUDA_VERSION: {CUDA_VERSION}", flush=True)

# Struct, union and enum names are folded into found_types alongside the
# typedef names. Each set() is appended in set iteration order.
found_types += set(parser.defs["types"])
found_types += set(parser.defs["structs"])
found_types += set(parser.defs["unions"])
found_types += set(parser.defs["enums"])
found_functions += set(parser.defs["functions"])
found_values += set(parser.defs["values"])

# Structs and unions share one name -> _Struct table; a later entry with the
# same name replaces an earlier one.
for key, value in parser.defs["structs"].items():
struct_list[key] = _Struct(key, value["members"])
for key, value in parser.defs["unions"].items():
struct_list[key] = _Struct(key, value["members"])

# NOTE(review): indentation is not visible in this view, so it cannot be told
# whether the loop below runs once per library or once after all libraries
# have been parsed -- confirm against the real file.
for key, value in struct_list.items():
# Names starting with anon_union/anon_struct get no top-level entry.
# NOTE(review): presumably they are still reached as members of a named
# parent through discoverMembers -- confirm.
if key.startswith(("anon_union", "anon_struct")):
continue

found_struct += [key]
discovered = value.discoverMembers(struct_list, key)
if discovered:
found_struct += discovered

# TODO(#1312): make this work properly
found_types.append("CUstreamAtomicReductionDataType_enum")

return found_types, found_functions, found_values, found_struct, struct_list


# -----------------------------------------------------------------------
# Code generation helpers


def _fetch_input_files(path):
    """Return the ``.in`` template files directly inside *path*, sorted.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A list of ``os.path.join(path, name)`` for every entry whose name ends
        with ``".in"``. The list is sorted because ``os.listdir`` returns
        entries in arbitrary order, which would otherwise make the processing
        order differ from one machine or run to the next.
    """
    return sorted(os.path.join(path, entry) for entry in os.listdir(path) if entry.endswith(".in"))


def _generate_output(infile, template_vars):
    """Render one Tempita template next to itself, without its ``.in`` suffix.

    Args:
        infile: Path of the template; must end with ``".in"``.
        template_vars: Mapping of variables substituted into the template.

    Raises:
        ValueError: If *infile* does not end with ``".in"``.

    The output file is not rewritten when it already holds exactly the
    rendered content.
    """
    from Cython import Tempita

    # Explicit check instead of `assert`: assertions are stripped under -O,
    # and slicing three characters off an arbitrary path would then silently
    # pick (and overwrite) an unrelated file.
    if not infile.endswith(".in"):
        raise ValueError(f"Template path must end with '.in': {infile!r}")
    outfile = infile.removesuffix(".in")

    with open(infile, encoding="utf-8") as f:
        rendered = Tempita.Template(f.read()).substitute(template_vars)

    if os.path.exists(outfile):
        with open(outfile, encoding="utf-8") as f:
            if f.read() == rendered:
                print(f"Skipping {infile} (No change)", flush=True)
                return

    with open(outfile, "w", encoding="utf-8") as f:
        print(f"Generating {infile}", flush=True)
        f.write(rendered)


# -----------------------------------------------------------------------
# Extension preparation helpers

Expand Down Expand Up @@ -328,9 +127,8 @@ def _prep_extensions(sources, libraries, include_dirs, library_dirs, extra_compi
def _build_cuda_bindings(debug=False):
"""Build all cuda-bindings extensions.
All CUDA-dependent logic (header parsing, code generation, cythonization)
is deferred to this function so that metadata queries do not require a
CUDA toolkit installation.
All CUDA-dependent logic (cythonization) is deferred to this function so
that metadata queries do not require a CUDA toolkit installation.
"""
from Cython.Build import cythonize

Expand All @@ -348,54 +146,10 @@ def _build_cuda_bindings(debug=False):
else:
nthreads = int(os.environ.get("CUDA_PYTHON_PARALLEL_LEVEL", "0") or "0")

parser_caching = bool(os.environ.get("CUDA_PYTHON_PARSER_CACHING", False))
compile_for_coverage = bool(int(os.environ.get("CUDA_PYTHON_COVERAGE", "0")))

# Parse CUDA headers
include_path_list = [os.path.join(cuda_path, "include")]
header_dict = _fetch_header_paths(_REQUIRED_HEADERS, include_path_list)
found_types, found_functions, found_values, found_struct, struct_list = _parse_headers(
header_dict, include_path_list, parser_caching
)
struct_field_types = {}
struct_field_array_lengths = {}
for struct_name, struct in struct_list.items():
for member_name in struct._member_names:
key = f"{struct_name}.{member_name}"
struct_field_types[key] = struct.member_type(member_name)
struct_field_array_lengths[key] = struct.member_array_length(member_name)

# Generate code from .in templates
path_list = [
os.path.join("cuda"),
os.path.join("cuda", "bindings"),
os.path.join("cuda", "bindings", "_bindings"),
os.path.join("cuda", "bindings", "_internal"),
os.path.join("cuda", "bindings", "_lib"),
os.path.join("cuda", "bindings", "utils"),
]
input_files = []
for path in path_list:
input_files += _fetch_input_files(path)

import platform

template_vars = {
"found_types": found_types,
"found_functions": found_functions,
"found_values": found_values,
"found_struct": found_struct,
"struct_list": struct_list,
"struct_field_types": struct_field_types,
"struct_field_array_lengths": struct_field_array_lengths,
"os": os,
"sys": sys,
"platform": platform,
}
for file in input_files:
_generate_output(file, template_vars)

# Prepare compile/link arguments
include_path_list = [os.path.join(cuda_path, "include")]
include_dirs = [
os.path.dirname(sysconfig.get_path("include")),
] + include_path_list
Expand Down Expand Up @@ -439,21 +193,26 @@ def _cleanup_dst_files():

# Build extension list
extensions = []
static_runtime_libraries = ["cudart_static", "rt"] if sys.platform == "linux" else ["cudart_static"]
cuda_bindings_files = glob.glob("cuda/bindings/*.pyx")
if sys.platform == "win32":
cuda_bindings_files = [f for f in cuda_bindings_files if "cufile" not in f]

def get_static_libraries(f):
    """Return the static libraries to link for source file *f*, or ``None``.

    Only files whose base name is ``runtime.pyx`` or ``runtime_ptds.pyx`` get
    a list: ``cudart_static``, plus ``rt`` when ``sys.platform`` is
    ``"linux"``.
    """
    if os.path.basename(f) not in ("runtime.pyx", "runtime_ptds.pyx"):
        return None
    return ["cudart_static", "rt"] if sys.platform == "linux" else ["cudart_static"]

sources_list = [
# private
(["cuda/bindings/_bindings/cyruntime.pyx"], static_runtime_libraries),
(["cuda/bindings/_bindings/cyruntime_ptds.pyx"], static_runtime_libraries),
# utils
(["cuda/bindings/utils/*.pyx"], None),
# public
*(([f], None) for f in cuda_bindings_files),
# internal files used by generated bindings
(["cuda/bindings/_internal/utils.pyx"], None),
*(([f], None) for f in dst_files if f.endswith(".pyx")),
*(([f], get_static_libraries(f)) for f in dst_files if f.endswith(".pyx")),
]

for sources, libraries in sources_list:
Expand Down
Empty file.
Loading