From 0b63c0010259412f183b0d0d8eb1ce899a927918 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 28 Apr 2026 09:44:15 -0700 Subject: [PATCH 1/9] cuda.core: convert GraphBuilder to cdef class with explicit state machine Refactor GraphBuilder from a Python class using _MembersNeededForFinalize to a cdef class with explicit _BuilderKind (PRIMARY/FORKED/CONDITIONAL_BODY) and _CaptureState (NOT_STARTED/CAPTURING/ENDED) tracking. Cleanup moves into __dealloc__/close, and the builder now uses GraphHandle/StreamHandle from _resource_handles instead of holding raw driver objects. Drop the is_stream_owner flag now that StreamHandle owns the lifetime. End-capture paths in __dealloc__ and close guard on _h_stream so cleanup is safe even if _init* fails before completing assignment. Made-with: Cursor --- cuda_core/cuda/core/_device.pyx | 5 +- cuda_core/cuda/core/_stream.pyx | 7 +- cuda_core/cuda/core/graph/_graph_builder.pxd | 19 ++ cuda_core/cuda/core/graph/_graph_builder.pyx | 331 +++++++++++-------- 4 files changed, 208 insertions(+), 154 deletions(-) create mode 100644 cuda_core/cuda/core/graph/_graph_builder.pxd diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index c0d7f09ee44..d9776a72e8c 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -14,6 +14,7 @@ import threading from cuda.core._context cimport Context from cuda.core._context import ContextOptions from cuda.core._event cimport Event as cyEvent +from cuda.core.graph._graph_builder cimport GraphBuilder from cuda.core._event import Event, EventOptions from cuda.core._memory._buffer cimport Buffer, MemoryResource from cuda.core._resource_handles cimport ( @@ -1370,10 +1371,8 @@ class Device: Newly created graph builder object. """ - from cuda.core.graph._graph_builder import GraphBuilder - self._check_context_initialized() - return GraphBuilder._init(stream=self.create_stream(), is_stream_owner=True) + return GraphBuilder._init(self.create_stream()) cdef inline int Device_ensure_cuda_initialized() except? -1: diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index fdb617f0325..c7b1312c17a 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -10,6 +10,7 @@ from libc.stdlib cimport strtol, getenv from cuda.bindings cimport cydriver from cuda.core._event cimport Event as cyEvent +from cuda.core.graph._graph_builder cimport GraphBuilder from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN, @@ -371,9 +372,7 @@ cdef class Stream: Newly created graph builder object. """ - from cuda.core.graph._graph_builder import GraphBuilder - - return GraphBuilder._init(stream=self, is_stream_owner=False) + return GraphBuilder._init(self) # c-only python objects, not public @@ -474,8 +473,6 @@ cdef cydriver.CUstream _handle_from_stream_protocol(obj) except*: # Helper for API functions that accept either Stream or GraphBuilder. Performs # needed checks and returns the relevant stream. cdef Stream Stream_accept(arg, bint allow_stream_protocol=False): - from cuda.core.graph._graph_builder import GraphBuilder - if isinstance(arg, Stream): return (arg) elif isinstance(arg, GraphBuilder): diff --git a/cuda_core/cuda/core/graph/_graph_builder.pxd b/cuda_core/cuda/core/graph/_graph_builder.pxd new file mode 100644 index 00000000000..e224f3a5109 --- /dev/null +++ b/cuda_core/cuda/core/graph/_graph_builder.pxd @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core._resource_handles cimport GraphHandle, StreamHandle +from cuda.core._stream cimport Stream + + +cdef class GraphBuilder: + cdef: + GraphHandle _h_graph + StreamHandle _h_stream + int _kind + int _state + Stream _stream # cached to avoid reconstruction from _h_stream handle + object __weakref__ + + @staticmethod + cdef GraphBuilder _init(Stream stream) diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyx b/cuda_core/cuda/core/graph/_graph_builder.pyx index 526c95e04ad..61629dcc2e5 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyx +++ b/cuda_core/cuda/core/graph/_graph_builder.pyx @@ -11,7 +11,10 @@ from cuda.bindings cimport cydriver from cuda.core.graph._graph_definition cimport GraphCondition from cuda.core.graph._utils cimport _attach_host_callback_to_graph -from cuda.core._resource_handles cimport as_cu +from cuda.core._resource_handles cimport ( + GraphHandle, StreamHandle, as_cu, as_py, + create_graph_handle, create_graph_handle_ref, +) from cuda.core._stream cimport Stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from cuda.core._utils.version cimport cy_binding_version, cy_driver_version @@ -185,7 +188,40 @@ def _instantiate_graph(h_graph, options: GraphCompleteOptions | None = None) -> return graph -class GraphBuilder: +# Distinguishes the three kinds of GraphBuilder, which differ in how they +# begin/end stream capture and whether they own the resulting CUgraph. +# Each kind progresses through _CaptureState as follows: +# +# PRIMARY: NOT_STARTED -> CAPTURING -> ENDED +# FORKED: CAPTURING (never transitions; joined and closed) +# CONDITIONAL_BODY: NOT_STARTED -> CAPTURING -> ENDED +# +cdef enum _BuilderKind: + # PRIMARY: The top-level builder created by Device or Stream. Owns the + # captured CUgraph via an owning GraphHandle. Progresses through all three + # capture states; responsible for ending capture if destroyed early. + PRIMARY = 0 + # FORKED: Created by split(). Captures on a private stream forked from the + # primary. Starts in CAPTURING state and never transitions; the user joins + # it back to the primary via join(), which closes the builder. Must NOT + # call cuStreamEndCapture (the driver requires all forked streams to be + # joined first). + FORKED = 1 + # CONDITIONAL_BODY: Created by if_then/if_else/switch/while_loop. Captures + # into a non-owned body graph via cuStreamBeginCaptureToGraph. The body + # graph's lifetime is tied to a parent graph. Progresses through all three + # capture states like PRIMARY. + CONDITIONAL_BODY = 2 + + +# Tracks the capture lifecycle of a GraphBuilder. +cdef enum _CaptureState: + CAPTURE_NOT_STARTED = 0 + CAPTURING = 1 + CAPTURE_ENDED = 2 + + +cdef class GraphBuilder: """A graph under construction by stream capture. A graph groups a set of CUDA kernels and other CUDA operations together and executes @@ -198,63 +234,48 @@ class GraphBuilder: """ - class _MembersNeededForFinalize: - __slots__ = ("conditional_graph", "graph", "is_join_required", "is_stream_owner", "stream") - - def __init__(self, graph_builder_obj, stream_obj, is_stream_owner, conditional_graph, is_join_required): - self.stream = stream_obj - self.is_stream_owner = is_stream_owner - self.graph = None - self.conditional_graph = conditional_graph - self.is_join_required = is_join_required - weakref.finalize(graph_builder_obj, self.close) - - def close(self): - if self.stream: - if not self.is_join_required: - capture_status = handle_return(driver.cuStreamGetCaptureInfo(self.stream.handle))[0] - if capture_status != driver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_NONE: - # Note how this condition only occures for the primary graph builder - # This is because calling cuStreamEndCapture streams that were split off of the primary - # would error out with CUDA_ERROR_STREAM_CAPTURE_UNJOINED. - # Therefore, it is currently a requirement that users join all split graph builders - # before a graph builder can be clearly destroyed. - handle_return(driver.cuStreamEndCapture(self.stream.handle)) - if self.is_stream_owner: - self.stream.close() - self.stream = None - if self.graph: - handle_return(driver.cuGraphDestroy(self.graph)) - self.graph = None - self.conditional_graph = None - - __slots__ = ("__weakref__", "_building_ended", "_mnff") - def __init__(self): raise NotImplementedError( "directly creating a Graph object can be ambiguous. Please either " "call Device.create_graph_builder() or stream.create_graph_builder()" ) - @classmethod - def _init(cls, stream, is_stream_owner, conditional_graph=None, is_join_required=False): - self = cls.__new__(cls) - self._mnff = GraphBuilder._MembersNeededForFinalize( - self, stream, is_stream_owner, conditional_graph, is_join_required - ) + def __dealloc__(self): + # Note: _stream could be set to None by cyclic-GC tp_clear before + # __dealloc__, but _h_stream is guaranteed to be valid. + if self._h_stream and self._state == CAPTURING and self._kind != FORKED: + with nogil: + cydriver.cuStreamEndCapture(as_cu(self._h_stream), NULL) - self._building_ended = False + @staticmethod + cdef GraphBuilder _init(Stream stream): + cdef GraphBuilder self = GraphBuilder.__new__(GraphBuilder) + # _h_graph set by begin_building + self._h_stream = stream._h_stream + self._kind = PRIMARY + self._state = CAPTURE_NOT_STARTED + self._stream = stream return self + def close(self): + """Destroy the graph builder.""" + if self._h_stream and self._state == CAPTURING and self._kind != FORKED: + with nogil: + HANDLE_RETURN(cydriver.cuStreamEndCapture(as_cu(self._h_stream), NULL)) + self._h_graph.reset() + self._h_stream.reset() + self._state = CAPTURE_ENDED + self._stream = None + @property def stream(self) -> Stream: """Returns the stream associated with the graph builder.""" - return self._mnff.stream + return self._stream @property def is_join_required(self) -> bool: """Returns True if this graph builder must be joined before building is ended.""" - return self._mnff.is_join_required + return self._kind == FORKED def begin_building(self, mode="relaxed") -> GraphBuilder: """Begins the building process. @@ -272,61 +293,65 @@ class GraphBuilder: Default set to use relaxed. """ - if self._building_ended: - raise RuntimeError("Cannot resume building after building has ended.") - if mode not in ("global", "thread_local", "relaxed"): - raise ValueError(f"Unsupported build mode: {mode}") + if self._state != CAPTURE_NOT_STARTED: + if self._state == CAPTURING: + raise RuntimeError("Graph builder is already building.") + else: + raise RuntimeError("Cannot resume building after building has ended.") + cdef cydriver.CUstreamCaptureMode c_mode if mode == "global": - capture_mode = driver.CUstreamCaptureMode.CU_STREAM_CAPTURE_MODE_GLOBAL + c_mode = cydriver.CU_STREAM_CAPTURE_MODE_GLOBAL elif mode == "thread_local": - capture_mode = driver.CUstreamCaptureMode.CU_STREAM_CAPTURE_MODE_THREAD_LOCAL + c_mode = cydriver.CU_STREAM_CAPTURE_MODE_THREAD_LOCAL elif mode == "relaxed": - capture_mode = driver.CUstreamCaptureMode.CU_STREAM_CAPTURE_MODE_RELAXED + c_mode = cydriver.CU_STREAM_CAPTURE_MODE_RELAXED else: raise ValueError(f"Unsupported build mode: {mode}") - if self._mnff.conditional_graph: - handle_return( - driver.cuStreamBeginCaptureToGraph( - self._mnff.stream.handle, - self._mnff.conditional_graph, - None, # dependencies - None, # dependencyData - 0, # numDependencies - capture_mode, - ) - ) + cdef cydriver.CUstream c_stream = as_cu(self._h_stream) + cdef cydriver.CUgraph c_graph + if self._kind == CONDITIONAL_BODY: + c_graph = as_cu(self._h_graph) + with nogil: + HANDLE_RETURN(cydriver.cuStreamBeginCaptureToGraph( + c_stream, c_graph, NULL, NULL, 0, c_mode)) else: - handle_return(driver.cuStreamBeginCapture(self._mnff.stream.handle, capture_mode)) + with nogil: + HANDLE_RETURN(cydriver.cuStreamBeginCapture(c_stream, c_mode)) + _get_capture_info(c_stream, NULL, &c_graph) + self._h_graph = create_graph_handle(c_graph) + self._state = CAPTURING return self @property def is_building(self) -> bool: """Returns True if the graph builder is currently building.""" - capture_status = handle_return(driver.cuStreamGetCaptureInfo(self._mnff.stream.handle))[0] - if capture_status == driver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_NONE: + cdef cydriver.CUstream c_stream = as_cu(self._h_stream) + cdef cydriver.CUstreamCaptureStatus status + with nogil: + _get_capture_info(c_stream, &status, NULL) + if status == cydriver.CU_STREAM_CAPTURE_STATUS_NONE: return False - elif capture_status == driver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_ACTIVE: + elif status == cydriver.CU_STREAM_CAPTURE_STATUS_ACTIVE: return True - elif capture_status == driver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_INVALIDATED: + elif status == cydriver.CU_STREAM_CAPTURE_STATUS_INVALIDATED: raise RuntimeError( "Build process encountered an error and has been invalidated. Build process must now be ended." ) else: - raise NotImplementedError(f"Unsupported capture status type received: {capture_status}") + raise NotImplementedError(f"Unsupported capture status type received: {status}") def end_building(self) -> GraphBuilder: """Ends the building process.""" if not self.is_building: raise RuntimeError("Graph builder is not building.") - if self._mnff.conditional_graph: - self._mnff.conditional_graph = handle_return(driver.cuStreamEndCapture(self.stream.handle)) - else: - self._mnff.graph = handle_return(driver.cuStreamEndCapture(self.stream.handle)) + cdef cydriver.CUstream c_stream = as_cu(self._h_stream) + with nogil: + HANDLE_RETURN(cydriver.cuStreamEndCapture(c_stream, NULL)) # TODO: Resolving https://github.com/NVIDIA/cuda-python/issues/617 would allow us to # resume the build process after the first call to end_building() - self._building_ended = True + self._state = CAPTURE_ENDED return self def complete(self, options: GraphCompleteOptions | None = None) -> "Graph": @@ -343,10 +368,10 @@ class GraphBuilder: The newly built graph. """ - if not self._building_ended: + if self._state != CAPTURE_ENDED: raise RuntimeError("Graph has not finished building.") - return _instantiate_graph(self._mnff.graph, options) + return _instantiate_graph(as_py(self._h_graph), options) def debug_dot_print(self, path, options: GraphDebugPrintOptions | None = None): """Generates a DOT debug file for the graph builder. @@ -359,10 +384,14 @@ class GraphBuilder: Customizable dataclass for the debug print options. """ - if not self._building_ended: + if self._state != CAPTURE_ENDED: raise RuntimeError("Graph has not finished building.") - flags = options._to_flags() if options else 0 - handle_return(driver.cuGraphDebugDotPrint(self._mnff.graph, path, flags)) + cdef unsigned int c_flags = options._to_flags() if options else 0 + cdef cydriver.CUgraph c_graph = as_cu(self._h_graph) + cdef bytes b_path = path.encode() if isinstance(path, str) else path + cdef const char* c_path = b_path + with nogil: + HANDLE_RETURN(cydriver.cuGraphDebugDotPrint(c_graph, c_path, c_flags)) def split(self, count: int) -> tuple[GraphBuilder, ...]: """Splits the original graph builder into multiple graph builders. @@ -385,14 +414,12 @@ class GraphBuilder: if count < 2: raise ValueError(f"Invalid split count: expecting >= 2, got {count}") - event = self._mnff.stream.record() + event = self._stream.record() result = [self] for i in range(count - 1): - stream = self._mnff.stream.device.create_stream() + stream = self._stream.device.create_stream() stream.wait(event) - result.append( - GraphBuilder._init(stream=stream, is_stream_owner=True, conditional_graph=None, is_join_required=True) - ) + result.append(_init_forked(stream)) event.close() return tuple(result) @@ -440,7 +467,7 @@ class GraphBuilder: return self.stream.__cuda_stream__() def _get_conditional_context(self) -> driver.CUcontext: - return self._mnff.stream.context.handle + return self._stream.context.handle def create_condition(self, default_value=None) -> GraphCondition: """Create a condition variable for use with conditional nodes. @@ -471,7 +498,7 @@ class GraphBuilder: default_value = 0 flags = 0 - status, _, graph, *_, _ = handle_return(driver.cuStreamGetCaptureInfo(self._mnff.stream.handle)) + status, _, graph, *_, _ = handle_return(driver.cuStreamGetCaptureInfo(self._stream.handle)) if status != driver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_ACTIVE: raise RuntimeError("Cannot create a condition when graph is not being built") @@ -480,42 +507,6 @@ class GraphBuilder: ) return GraphCondition._from_handle(int(raw_handle)) - def _cond_with_params(self, node_params) -> tuple: - # Get current capture info to ensure we're in a valid state - status, _, graph, *deps_info, num_dependencies = handle_return( - driver.cuStreamGetCaptureInfo(self._mnff.stream.handle) - ) - if status != driver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_ACTIVE: - raise RuntimeError("Cannot add conditional node when not actively capturing") - - # Add the conditional node to the graph - deps_info_update = [ - [handle_return(driver.cuGraphAddNode(graph, *deps_info, num_dependencies, node_params))] - ] + [None] * (len(deps_info) - 1) - - # Update the stream's capture dependencies - handle_return( - driver.cuStreamUpdateCaptureDependencies( - self._mnff.stream.handle, - *deps_info_update, # dependencies, edgeData - 1, # numDependencies - driver.CUstreamUpdateCaptureDependencies_flags.CU_STREAM_SET_CAPTURE_DEPENDENCIES, - ) - ) - - # Create new graph builders for each condition - return tuple( - [ - GraphBuilder._init( - stream=self._mnff.stream.device.create_stream(), - is_stream_owner=True, - conditional_graph=node_params.conditional.phGraph_out[i], - is_join_required=False, - ) - for i in range(node_params.conditional.size) - ] - ) - def if_then(self, condition: GraphCondition) -> GraphBuilder: """Adds an if condition branch and returns a new graph builder for it. @@ -550,7 +541,7 @@ class GraphBuilder: node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_IF node_params.conditional.size = 1 node_params.conditional.ctx = self._get_conditional_context() - return self._cond_with_params(node_params)[0] + return _cond_with_params(self, node_params)[0] def if_else(self, condition: GraphCondition) -> tuple[GraphBuilder, GraphBuilder]: """Adds an if-else condition branch and returns new graph builders for both branches. @@ -586,7 +577,7 @@ class GraphBuilder: node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_IF node_params.conditional.size = 2 node_params.conditional.ctx = self._get_conditional_context() - return self._cond_with_params(node_params) + return _cond_with_params(self, node_params) def switch(self, condition: GraphCondition, count: int) -> tuple[GraphBuilder, ...]: """Adds a switch condition branch and returns new graph builders for all cases. @@ -625,7 +616,7 @@ class GraphBuilder: node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_SWITCH node_params.conditional.size = count node_params.conditional.ctx = self._get_conditional_context() - return self._cond_with_params(node_params) + return _cond_with_params(self, node_params) def while_loop(self, condition: GraphCondition) -> GraphBuilder: """Adds a while loop and returns a new graph builder for it. @@ -661,18 +652,9 @@ class GraphBuilder: node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_WHILE node_params.conditional.size = 1 node_params.conditional.ctx = self._get_conditional_context() - return self._cond_with_params(node_params)[0] - - def close(self): - """Destroy the graph builder. + return _cond_with_params(self, node_params)[0] - Closes the associated stream if we own it. Borrowed stream - object will instead have their references released. - - """ - self._mnff.close() - - def embed(self, child: GraphBuilder): + def embed(self, GraphBuilder child): """Embed a previously-built :obj:`~graph.GraphBuilder` as a child node. Parameters @@ -680,13 +662,13 @@ class GraphBuilder: child : :obj:`~graph.GraphBuilder` The child graph builder. Must have finished building. """ - if not child._building_ended: + if child._state != CAPTURE_ENDED: raise ValueError("Child graph has not finished building.") if not self.is_building: raise ValueError("Parent graph is not being built.") - stream_handle = self._mnff.stream.handle + stream_handle = self._stream.handle _, _, graph_out, *deps_info_out, num_dependencies_out = handle_return( driver.cuStreamGetCaptureInfo(stream_handle) ) @@ -698,7 +680,7 @@ class GraphBuilder: [ handle_return( driver.cuGraphAddChildGraphNode( - graph_out, *deps_info_trimmed, num_dependencies_out, child._mnff.graph + graph_out, *deps_info_trimmed, num_dependencies_out, as_py(child._h_graph) ) ) ] @@ -740,18 +722,13 @@ class GraphBuilder: pointer (caller manages lifetime). If bytes-like, the data is copied and its lifetime is tied to the graph. """ - cdef Stream stream = self._mnff.stream + cdef Stream stream = self._stream cdef cydriver.CUstream c_stream = as_cu(stream._h_stream) cdef cydriver.CUstreamCaptureStatus capture_status cdef cydriver.CUgraph c_graph = NULL with nogil: - IF CUDA_CORE_BUILD_MAJOR >= 13: - HANDLE_RETURN(cydriver.cuStreamGetCaptureInfo( - c_stream, &capture_status, NULL, &c_graph, NULL, NULL, NULL)) - ELSE: - HANDLE_RETURN(cydriver.cuStreamGetCaptureInfo( - c_stream, &capture_status, NULL, &c_graph, NULL, NULL)) + _get_capture_info(c_stream, &capture_status, &c_graph) if capture_status != cydriver.CU_STREAM_CAPTURE_STATUS_ACTIVE: raise RuntimeError("Cannot add callback when graph is not being built") @@ -764,6 +741,68 @@ class GraphBuilder: HANDLE_RETURN(cydriver.cuLaunchHostFunc(c_stream, c_fn, c_user_data)) +cdef inline GraphBuilder _init_forked(Stream stream): + cdef GraphBuilder gb = GraphBuilder.__new__(GraphBuilder) + # _h_graph not used for FORKED builders. Captures to primary graph. + gb._h_stream = stream._h_stream + gb._kind = FORKED + gb._state = CAPTURING + gb._stream = stream + return gb + + +cdef inline GraphBuilder _init_conditional(Stream stream, cydriver.CUgraph cond_graph, GraphBuilder parent): + cdef GraphBuilder gb = GraphBuilder.__new__(GraphBuilder) + gb._h_graph = create_graph_handle_ref(cond_graph, parent._h_graph) + gb._h_stream = stream._h_stream + gb._kind = CONDITIONAL_BODY + gb._state = CAPTURE_NOT_STARTED + gb._stream = stream + return gb + + +cdef inline int _get_capture_info( + cydriver.CUstream stream, + cydriver.CUstreamCaptureStatus* status, + cydriver.CUgraph* graph) except?-1 nogil: + IF CUDA_CORE_BUILD_MAJOR >= 13: + return HANDLE_RETURN(cydriver.cuStreamGetCaptureInfo( + stream, status, NULL, graph, NULL, NULL, NULL)) + ELSE: + return HANDLE_RETURN(cydriver.cuStreamGetCaptureInfo( + stream, status, NULL, graph, NULL, NULL)) + + +cdef inline tuple _cond_with_params(GraphBuilder gb, node_params): + status, _, graph, *deps_info, num_dependencies = handle_return( + driver.cuStreamGetCaptureInfo(gb._stream.handle) + ) + if status != driver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_ACTIVE: + raise RuntimeError("Cannot add conditional node when not actively capturing") + + deps_info_update = [ + [handle_return(driver.cuGraphAddNode(graph, *deps_info, num_dependencies, node_params))] + ] + [None] * (len(deps_info) - 1) + + handle_return( + driver.cuStreamUpdateCaptureDependencies( + gb._stream.handle, + *deps_info_update, # dependencies, edgeData + 1, # numDependencies + driver.CUstreamUpdateCaptureDependencies_flags.CU_STREAM_SET_CAPTURE_DEPENDENCIES, + ) + ) + + return tuple( + _init_conditional( + gb._stream.device.create_stream(), + int(node_params.conditional.phGraph_out[i]), + gb, + ) + for i in range(node_params.conditional.size) + ) + + class Graph: """An executable graph. @@ -832,9 +871,9 @@ class Graph: cdef cydriver.CUgraphExec cu_exec = int(self._mnff.graph) if isinstance(source, GraphBuilder): - if not source._building_ended: + if (source)._state != CAPTURE_ENDED: raise ValueError("Graph has not finished building.") - cu_graph = int(source._mnff.graph) + cu_graph = as_cu((source)._h_graph) elif isinstance(source, GraphDefinition): cu_graph = int(source.handle) else: From 035b09d89740e4b21f496c68b3158eafcd152ccc Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 1 May 2026 15:17:19 -0700 Subject: [PATCH 2/9] cuda.core: convert Graph to cdef class with GraphExecHandle Add a GraphExecHandle to the resource-handle layer (parallel to GraphHandle) wrapping CUgraphExec with RAII cleanup via cuGraphExecDestroy on shared_ptr release. Convert Graph from a Python class using _MembersNeededForFinalize to a cdef class holding a typed _h_graph_exec attribute, dropping the weakref.finalize machinery. update/upload/launch move to nogil cydriver paths consistent with the GraphBuilder rewrite. Also drop quoted forward-reference annotations on create_graph_builder and _instantiate_graph/complete now that GraphBuilder is cimported in _device.pyx and _stream.pyx and Cython accepts the in-module forward reference to Graph. Clears the related "Strings should no longer be used for type declarations" warnings. Made-with: Cursor --- cuda_core/cuda/core/_cpp/resource_handles.cpp | 23 ++++++++ cuda_core/cuda/core/_cpp/resource_handles.hpp | 22 ++++++++ cuda_core/cuda/core/_device.pyx | 2 +- cuda_core/cuda/core/_resource_handles.pxd | 7 +++ cuda_core/cuda/core/_resource_handles.pyx | 7 +++ cuda_core/cuda/core/_stream.pyx | 2 +- cuda_core/cuda/core/graph/_graph_builder.pxd | 13 ++++- cuda_core/cuda/core/graph/_graph_builder.pyx | 54 +++++++++---------- 8 files changed, 97 insertions(+), 33 deletions(-) diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index 5eb4716b981..029ac46d661 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -63,6 +63,7 @@ decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel = nullptr; // Graph decltype(&cuGraphDestroy) p_cuGraphDestroy = nullptr; +decltype(&cuGraphExecDestroy) p_cuGraphExecDestroy = nullptr; // Linker decltype(&cuLinkDestroy) p_cuLinkDestroy = nullptr; @@ -952,6 +953,28 @@ GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent) return GraphHandle(box, &box->resource); } +// ============================================================================ +// Graph Exec Handles +// ============================================================================ + +namespace { +struct GraphExecBox { + CUgraphExec resource; +}; +} // namespace + +GraphExecHandle create_graph_exec_handle(CUgraphExec graph_exec) { + auto box = std::shared_ptr( + new GraphExecBox{graph_exec}, + [](const GraphExecBox* b) { + GILReleaseGuard gil; + p_cuGraphExecDestroy(b->resource); + delete b; + } + ); + return GraphExecHandle(box, &box->resource); +} + namespace { struct GraphNodeBox { mutable CUgraphNode resource; diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index 2e6ebb6271c..14bd2a0bc4d 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -94,6 +94,7 @@ extern decltype(&cuLibraryGetKernel) p_cuLibraryGetKernel; // Graph extern decltype(&cuGraphDestroy) p_cuGraphDestroy; +extern decltype(&cuGraphExecDestroy) p_cuGraphExecDestroy; // Linker extern decltype(&cuLinkDestroy) p_cuLinkDestroy; @@ -148,6 +149,7 @@ using MemoryPoolHandle = std::shared_ptr; using LibraryHandle = std::shared_ptr; using KernelHandle = std::shared_ptr; using GraphHandle = std::shared_ptr; +using GraphExecHandle = std::shared_ptr; using GraphNodeHandle = std::shared_ptr; using GraphicsResourceHandle = std::shared_ptr; using NvrtcProgramHandle = std::shared_ptr; @@ -403,6 +405,14 @@ GraphHandle create_graph_handle(CUgraph graph); // but h_parent will be prevented from destruction while this handle exists. GraphHandle create_graph_handle_ref(CUgraph graph, const GraphHandle& h_parent); +// ============================================================================ +// Graph exec handle functions +// ============================================================================ + +// Wrap an externally-created CUgraphExec with RAII cleanup. +// When the last reference is released, cuGraphExecDestroy is called automatically. +GraphExecHandle create_graph_exec_handle(CUgraphExec graph_exec); + // ============================================================================ // Graph node handle functions // ============================================================================ @@ -529,6 +539,10 @@ inline CUgraph as_cu(const GraphHandle& h) noexcept { return h ? *h : nullptr; } +inline CUgraphExec as_cu(const GraphExecHandle& h) noexcept { + return h ? *h : nullptr; +} + inline CUgraphNode as_cu(const GraphNodeHandle& h) noexcept { return h ? *h : nullptr; } @@ -587,6 +601,10 @@ inline std::intptr_t as_intptr(const GraphHandle& h) noexcept { return reinterpret_cast(as_cu(h)); } +inline std::intptr_t as_intptr(const GraphExecHandle& h) noexcept { + return reinterpret_cast(as_cu(h)); +} + inline std::intptr_t as_intptr(const GraphNodeHandle& h) noexcept { return reinterpret_cast(as_cu(h)); } @@ -677,6 +695,10 @@ inline PyObject* as_py(const GraphHandle& h) noexcept { return detail::make_py("cuda.bindings.driver", "CUgraph", as_intptr(h)); } +inline PyObject* as_py(const GraphExecHandle& h) noexcept { + return detail::make_py("cuda.bindings.driver", "CUgraphExec", as_intptr(h)); +} + inline PyObject* as_py(const GraphNodeHandle& h) noexcept { if (!as_intptr(h)) { Py_RETURN_NONE; diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index d9776a72e8c..269816b0259 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -1362,7 +1362,7 @@ class Device: self._check_context_initialized() handle_return(runtime.cudaDeviceSynchronize()) - def create_graph_builder(self) -> "GraphBuilder": + def create_graph_builder(self) -> GraphBuilder: """Create a new :obj:`~graph.GraphBuilder` object. Returns diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 0d7d20e574c..a059465403e 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -27,6 +27,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": ctypedef shared_ptr[const cydriver.CUlibrary] LibraryHandle ctypedef shared_ptr[const cydriver.CUkernel] KernelHandle ctypedef shared_ptr[const cydriver.CUgraph] GraphHandle + ctypedef shared_ptr[const cydriver.CUgraphExec] GraphExecHandle ctypedef shared_ptr[const cydriver.CUgraphNode] GraphNodeHandle ctypedef shared_ptr[const cydriver.CUgraphicsResource] GraphicsResourceHandle ctypedef shared_ptr[const cynvrtc.nvrtcProgram] NvrtcProgramHandle @@ -52,6 +53,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": cydriver.CUlibrary as_cu(LibraryHandle h) noexcept nogil cydriver.CUkernel as_cu(KernelHandle h) noexcept nogil cydriver.CUgraph as_cu(GraphHandle h) noexcept nogil + cydriver.CUgraphExec as_cu(GraphExecHandle h) noexcept nogil cydriver.CUgraphNode as_cu(GraphNodeHandle h) noexcept nogil cydriver.CUgraphicsResource as_cu(GraphicsResourceHandle h) noexcept nogil cynvrtc.nvrtcProgram as_cu(NvrtcProgramHandle h) noexcept nogil @@ -68,6 +70,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": intptr_t as_intptr(LibraryHandle h) noexcept nogil intptr_t as_intptr(KernelHandle h) noexcept nogil intptr_t as_intptr(GraphHandle h) noexcept nogil + intptr_t as_intptr(GraphExecHandle h) noexcept nogil intptr_t as_intptr(GraphNodeHandle h) noexcept nogil intptr_t as_intptr(GraphicsResourceHandle h) noexcept nogil intptr_t as_intptr(NvrtcProgramHandle h) noexcept nogil @@ -85,6 +88,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": object as_py(LibraryHandle h) object as_py(KernelHandle h) object as_py(GraphHandle h) + object as_py(GraphExecHandle h) object as_py(GraphNodeHandle h) object as_py(GraphicsResourceHandle h) object as_py(NvrtcProgramHandle h) @@ -183,6 +187,9 @@ cdef LibraryHandle get_kernel_library(const KernelHandle& h) noexcept nogil cdef GraphHandle create_graph_handle(cydriver.CUgraph graph) except+ nogil cdef GraphHandle create_graph_handle_ref(cydriver.CUgraph graph, const GraphHandle& h_parent) except+ nogil +# Graph exec handles +cdef GraphExecHandle create_graph_exec_handle(cydriver.CUgraphExec graph_exec) except+ nogil + # Graph node handles cdef GraphNodeHandle create_graph_node_handle(cydriver.CUgraphNode node, const GraphHandle& h_graph) except+ nogil cdef GraphHandle graph_node_get_graph(const GraphNodeHandle& h) noexcept nogil diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index d30993cc5e8..2291b1ec20f 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -27,6 +27,7 @@ from ._resource_handles cimport ( LibraryHandle, KernelHandle, GraphHandle, + GraphExecHandle, GraphicsResourceHandle, NvrtcProgramHandle, NvvmProgramHandle, @@ -154,6 +155,10 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": GraphHandle create_graph_handle_ref "cuda_core::create_graph_handle_ref" ( cydriver.CUgraph graph, const GraphHandle& h_parent) except+ nogil + # Graph exec handles + GraphExecHandle create_graph_exec_handle "cuda_core::create_graph_exec_handle" ( + cydriver.CUgraphExec graph_exec) except+ nogil + # Graph node handles GraphNodeHandle create_graph_node_handle "cuda_core::create_graph_node_handle" ( cydriver.CUgraphNode node, const GraphHandle& h_graph) except+ nogil @@ -265,6 +270,7 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Graph void* p_cuGraphDestroy "reinterpret_cast(cuda_core::p_cuGraphDestroy)" + void* p_cuGraphExecDestroy "reinterpret_cast(cuda_core::p_cuGraphExecDestroy)" # Linker void* p_cuLinkDestroy "reinterpret_cast(cuda_core::p_cuLinkDestroy)" @@ -334,6 +340,7 @@ p_cuLibraryGetKernel = _get_driver_fn("cuLibraryGetKernel") # Graph p_cuGraphDestroy = _get_driver_fn("cuGraphDestroy") +p_cuGraphExecDestroy = _get_driver_fn("cuGraphExecDestroy") # Linker p_cuLinkDestroy = _get_driver_fn("cuLinkDestroy") diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index c7b1312c17a..caf56ee136c 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -361,7 +361,7 @@ cdef class Stream: return Stream._init(obj=_stream_holder()) - def create_graph_builder(self) -> "GraphBuilder": + def create_graph_builder(self) -> GraphBuilder: """Create a new :obj:`~graph.GraphBuilder` object. The new graph builder will be associated with this stream. diff --git a/cuda_core/cuda/core/graph/_graph_builder.pxd b/cuda_core/cuda/core/graph/_graph_builder.pxd index e224f3a5109..c33a7d63c19 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pxd +++ b/cuda_core/cuda/core/graph/_graph_builder.pxd @@ -2,7 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core._resource_handles cimport GraphHandle, StreamHandle +from cuda.bindings cimport cydriver + +from cuda.core._resource_handles cimport GraphExecHandle, GraphHandle, StreamHandle from cuda.core._stream cimport Stream @@ -17,3 +19,12 @@ cdef class GraphBuilder: @staticmethod cdef GraphBuilder _init(Stream stream) + + +cdef class Graph: + cdef: + GraphExecHandle _h_graph_exec + object __weakref__ + + @staticmethod + cdef Graph _init(cydriver.CUgraphExec graph_exec) diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyx b/cuda_core/cuda/core/graph/_graph_builder.pyx index 61629dcc2e5..f02a0409d86 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyx +++ b/cuda_core/cuda/core/graph/_graph_builder.pyx @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -import weakref from dataclasses import dataclass from libc.stdint cimport intptr_t @@ -12,8 +11,8 @@ from cuda.bindings cimport cydriver from cuda.core.graph._graph_definition cimport GraphCondition from cuda.core.graph._utils cimport _attach_host_callback_to_graph from cuda.core._resource_handles cimport ( - GraphHandle, StreamHandle, as_cu, as_py, - create_graph_handle, create_graph_handle_ref, + GraphExecHandle, GraphHandle, StreamHandle, as_cu, as_py, + create_graph_exec_handle, create_graph_handle, create_graph_handle_ref, ) from cuda.core._stream cimport Stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN @@ -150,7 +149,8 @@ class GraphCompleteOptions: use_node_priority: bool = False -def _instantiate_graph(h_graph, options: GraphCompleteOptions | None = None) -> "Graph": +def _instantiate_graph(h_graph, options: GraphCompleteOptions | None = None) -> Graph: + cdef cydriver.CUgraphExec c_exec params = driver.CUDA_GRAPH_INSTANTIATE_PARAMS() if options: flags = 0 @@ -165,7 +165,9 @@ def _instantiate_graph(h_graph, options: GraphCompleteOptions | None = None) -> flags |= driver.CUgraphInstantiate_flags.CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY params.flags = flags - graph = Graph._init(handle_return(driver.cuGraphInstantiateWithParams(h_graph, params))) + py_exec = handle_return(driver.cuGraphInstantiateWithParams(h_graph, params)) + c_exec = int(py_exec) + graph = Graph._init(c_exec) if params.result_out == driver.CUgraphInstantiateResult.CUDA_GRAPH_INSTANTIATE_ERROR: raise RuntimeError( "Instantiation failed for an unexpected reason which is described in the return value of the function." @@ -354,7 +356,7 @@ cdef class GraphBuilder: self._state = CAPTURE_ENDED return self - def complete(self, options: GraphCompleteOptions | None = None) -> "Graph": + def complete(self, options: GraphCompleteOptions | None = None) -> Graph: """Completes the graph builder and returns the built :obj:`~graph.Graph` object. Parameters @@ -803,7 +805,7 @@ cdef inline tuple _cond_with_params(GraphBuilder gb, node_params): ) -class Graph: +cdef class Graph: """An executable graph. A graph groups a set of CUDA kernels and other CUDA operations together and executes @@ -814,32 +816,18 @@ class Graph: """ - class _MembersNeededForFinalize: - __slots__ = "graph" - - def __init__(self, graph_obj, graph): - self.graph = graph - weakref.finalize(graph_obj, self.close) - - def close(self): - if self.graph: - handle_return(driver.cuGraphExecDestroy(self.graph)) - self.graph = None - - __slots__ = ("__weakref__", "_mnff") - def __init__(self): raise RuntimeError("directly constructing a Graph instance is not supported") - @classmethod - def _init(cls, graph): - self = cls.__new__(cls) - self._mnff = Graph._MembersNeededForFinalize(self, graph) + @staticmethod + cdef Graph _init(cydriver.CUgraphExec graph_exec): + cdef Graph self = Graph.__new__(Graph) + self._h_graph_exec = create_graph_exec_handle(graph_exec) return self def close(self): """Destroy the graph.""" - self._mnff.close() + self._h_graph_exec.reset() @property def handle(self) -> driver.CUgraphExec: @@ -851,7 +839,7 @@ class Graph: handle, call ``int()`` on the returned object. """ - return self._mnff.graph + return as_py(self._h_graph_exec) def update(self, source: "GraphBuilder | GraphDefinition") -> None: """Update the graph using a new graph definition. @@ -868,7 +856,7 @@ class Graph: from cuda.core.graph import GraphDefinition cdef cydriver.CUgraph cu_graph - cdef cydriver.CUgraphExec cu_exec = int(self._mnff.graph) + cdef cydriver.CUgraphExec cu_exec = as_cu(self._h_graph_exec) if isinstance(source, GraphBuilder): if (source)._state != CAPTURE_ENDED: @@ -899,7 +887,10 @@ class Graph: The stream in which to upload the graph """ - handle_return(driver.cuGraphUpload(self._mnff.graph, stream.handle)) + cdef cydriver.CUgraphExec c_exec = as_cu(self._h_graph_exec) + cdef cydriver.CUstream c_stream = int(stream.handle) + with nogil: + HANDLE_RETURN(cydriver.cuGraphUpload(c_exec, c_stream)) def launch(self, stream: Stream): """Launches the graph in a stream. @@ -910,4 +901,7 @@ class Graph: The stream in which to launch the graph """ - handle_return(driver.cuGraphLaunch(self._mnff.graph, stream.handle)) + cdef cydriver.CUgraphExec c_exec = as_cu(self._h_graph_exec) + cdef cydriver.CUstream c_stream = int(stream.handle) + with nogil: + HANDLE_RETURN(cydriver.cuGraphLaunch(c_exec, c_stream)) From ae974af76bc04d95de7882650785aa83334da705 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 1 May 2026 15:19:59 -0700 Subject: [PATCH 3/9] fix(cuda.core): drop unused handle cimports flagged by cython-lint The cdef-class member declarations live in the .pxd, so the .pyx does not need to re-cimport GraphExecHandle, GraphHandle, or StreamHandle. Made-with: Cursor --- cuda_core/cuda/core/graph/_graph_builder.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyx b/cuda_core/cuda/core/graph/_graph_builder.pyx index f02a0409d86..f27fbb4a06d 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyx +++ b/cuda_core/cuda/core/graph/_graph_builder.pyx @@ -11,7 +11,7 @@ from cuda.bindings cimport cydriver from cuda.core.graph._graph_definition cimport GraphCondition from cuda.core.graph._utils cimport _attach_host_callback_to_graph from cuda.core._resource_handles cimport ( - GraphExecHandle, GraphHandle, StreamHandle, as_cu, as_py, + as_cu, as_py, create_graph_exec_handle, create_graph_handle, create_graph_handle_ref, ) from cuda.core._stream cimport Stream From 4e71a1e845cf5049d2f9492ecde45306d6fe30a2 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 1 May 2026 15:41:18 -0700 Subject: [PATCH 4/9] fix(cuda.core): break _stream/_device <-> graph._graph_builder import cycle cimport-ing GraphBuilder at the top of _stream.pyx and _device.pyx made Cython emit a Python-level import of cuda.core.graph._graph_builder during _stream module init. That triggered the chain graph -> _graph_node -> _kernel_arg_handler -> _memory._buffer -> _device, which then re-entered the still-initializing _stream module via "from cuda.core._stream import IsStreamT", failing with ImportError: cannot import name IsStreamT. Restore the original lazy "import GraphBuilder" inside create_graph_builder (Stream and Device) and Stream_accept. The return annotations stay as bare names; "from __future__ import annotations" in both files defers their evaluation, so they need not resolve at function-definition time. Made-with: Cursor --- cuda_core/cuda/core/_device.pyx | 3 ++- cuda_core/cuda/core/_stream.pyx | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index 269816b0259..c4fba83006f 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -14,7 +14,6 @@ import threading from cuda.core._context cimport Context from cuda.core._context import ContextOptions from cuda.core._event cimport Event as cyEvent -from cuda.core.graph._graph_builder cimport GraphBuilder from cuda.core._event import Event, EventOptions from cuda.core._memory._buffer cimport Buffer, MemoryResource from cuda.core._resource_handles cimport ( @@ -1371,6 +1370,8 @@ class Device: Newly created graph builder object. """ + from cuda.core.graph._graph_builder import GraphBuilder + self._check_context_initialized() return GraphBuilder._init(self.create_stream()) diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index caf56ee136c..a2bf0e025c0 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -10,7 +10,6 @@ from libc.stdlib cimport strtol, getenv from cuda.bindings cimport cydriver from cuda.core._event cimport Event as cyEvent -from cuda.core.graph._graph_builder cimport GraphBuilder from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN, @@ -372,6 +371,8 @@ cdef class Stream: Newly created graph builder object. """ + from cuda.core.graph._graph_builder import GraphBuilder + return GraphBuilder._init(self) @@ -473,6 +474,8 @@ cdef cydriver.CUstream _handle_from_stream_protocol(obj) except*: # Helper for API functions that accept either Stream or GraphBuilder. Performs # needed checks and returns the relevant stream. cdef Stream Stream_accept(arg, bint allow_stream_protocol=False): + from cuda.core.graph._graph_builder import GraphBuilder + if isinstance(arg, Stream): return (arg) elif isinstance(arg, GraphBuilder): From 343eccad1ed3156455c3bcd352c125be6132449c Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 4 May 2026 12:13:20 -0700 Subject: [PATCH 5/9] fix(cuda.core): expose GraphBuilder._init as a Python-callable factory The previous import-cycle fix changed _stream/_device.create_graph_builder to a lazy Python "import GraphBuilder" instead of a module-level cimport. With _init declared as @staticmethod cdef, Python attribute lookup cannot find it, so every test that builds a graph failed with "AttributeError: type object 'GraphBuilder' has no attribute '_init'" at _device.pyx:1376 / _stream.pyx:376. Convert _init from @staticmethod cdef to @staticmethod def (matches the Stream._init pattern) and drop the cdef declaration from the .pxd. _init runs once per builder creation, so the loss of cdef-level dispatch is irrelevant. Graph._init stays cdef; it is only called intra-module. Made-with: Cursor --- cuda_core/cuda/core/graph/_graph_builder.pxd | 3 --- cuda_core/cuda/core/graph/_graph_builder.pyx | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/graph/_graph_builder.pxd b/cuda_core/cuda/core/graph/_graph_builder.pxd index c33a7d63c19..660ebe8ec7d 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pxd +++ b/cuda_core/cuda/core/graph/_graph_builder.pxd @@ -17,9 +17,6 @@ cdef class GraphBuilder: Stream _stream # cached to avoid reconstruction from _h_stream handle object __weakref__ - @staticmethod - cdef GraphBuilder _init(Stream stream) - cdef class Graph: cdef: diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyx b/cuda_core/cuda/core/graph/_graph_builder.pyx index f27fbb4a06d..d9e777b2cab 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyx +++ b/cuda_core/cuda/core/graph/_graph_builder.pyx @@ -250,7 +250,7 @@ cdef class GraphBuilder: cydriver.cuStreamEndCapture(as_cu(self._h_stream), NULL) @staticmethod - cdef GraphBuilder _init(Stream stream): + def _init(Stream stream): cdef GraphBuilder self = GraphBuilder.__new__(GraphBuilder) # _h_graph set by begin_building self._h_stream = stream._h_stream From 9465c768cc4dd0f8c03ea1bab9b17b72070819fe Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 5 May 2026 10:35:43 -0700 Subject: [PATCH 6/9] fix(cuda.core): pass non-NULL captureStatus to cuStreamGetCaptureInfo Every graph-builder test failed with CUDA_ERROR_INVALID_VALUE on the new ``GraphBuilder.begin_building`` path. The driver rejects ``cuStreamGetCaptureInfo`` when ``captureStatus_out`` is NULL, but the new ``_get_capture_info`` helper accepted a NULL status pointer and ``begin_building`` was calling it that way (it just wanted the freshly captured graph handle and assumed the status was implied by the preceding ``cuStreamBeginCapture``). Pass a stack-local ``CUstreamCaptureStatus`` and document the helper's requirement that ``status`` be non-NULL. ``graph`` is still allowed to be NULL (``is_building`` calls it that way and the driver accepts it). Co-authored-by: Cursor --- cuda_core/cuda/core/graph/_graph_builder.pyx | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyx b/cuda_core/cuda/core/graph/_graph_builder.pyx index d9e777b2cab..63611423fc5 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyx +++ b/cuda_core/cuda/core/graph/_graph_builder.pyx @@ -312,6 +312,7 @@ cdef class GraphBuilder: cdef cydriver.CUstream c_stream = as_cu(self._h_stream) cdef cydriver.CUgraph c_graph + cdef cydriver.CUstreamCaptureStatus c_status if self._kind == CONDITIONAL_BODY: c_graph = as_cu(self._h_graph) with nogil: @@ -320,7 +321,10 @@ cdef class GraphBuilder: else: with nogil: HANDLE_RETURN(cydriver.cuStreamBeginCapture(c_stream, c_mode)) - _get_capture_info(c_stream, NULL, &c_graph) + # The driver rejects NULL captureStatus_out, so we pass a + # stack-local even though begin_capture just succeeded and we + # only care about the resulting graph handle. + _get_capture_info(c_stream, &c_status, &c_graph) self._h_graph = create_graph_handle(c_graph) self._state = CAPTURING return self @@ -767,6 +771,13 @@ cdef inline int _get_capture_info( cydriver.CUstream stream, cydriver.CUstreamCaptureStatus* status, cydriver.CUgraph* graph) except?-1 nogil: + """Thin wrapper around ``cuStreamGetCaptureInfo`` that papers over the + CUDA 12 vs 13 signature change. + + ``status`` must be non-NULL: the driver rejects ``captureStatus_out=NULL`` + with ``CUDA_ERROR_INVALID_VALUE``. ``graph`` may be NULL when the caller + does not need the graph handle. + """ IF CUDA_CORE_BUILD_MAJOR >= 13: return HANDLE_RETURN(cydriver.cuStreamGetCaptureInfo( stream, status, NULL, graph, NULL, NULL, NULL)) From 782306899a5c12608dd78529f80ebe80e6bfaa5b Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 15 Jun 2026 10:21:33 -0700 Subject: [PATCH 7/9] test: verify Graph.close() is idempotent (Glasswing V18.1) Add coverage that repeated close() does not double-destroy the graph exec handle. Addresses NVBugs 6268912 / cuda-python-private#370 via the handle-layer refactor in this PR. --- cuda_core/tests/graph/test_graph_builder.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/cuda_core/tests/graph/test_graph_builder.py b/cuda_core/tests/graph/test_graph_builder.py index c0299df5661..c2f87cc8ae4 100644 --- a/cuda_core/tests/graph/test_graph_builder.py +++ b/cuda_core/tests/graph/test_graph_builder.py @@ -259,6 +259,21 @@ def test_graph_child_graph(init_cuda): b.close() +def test_graph_close_is_idempotent(init_cuda): + """Re-entrant close must not double-destroy the graph exec (Glasswing V18.1).""" + mod = compile_common_kernels() + empty_kernel = mod.get_kernel("empty_kernel") + + gb = Device().create_graph_builder().begin_building() + launch(gb, LaunchConfig(grid=1, block=1), empty_kernel) + graph = gb.end_building().complete() + gb.close() + + graph.close() + graph.close() + assert int(graph.handle) == 0 + + def test_graph_stream_lifetime(init_cuda): mod = compile_common_kernels() empty_kernel = mod.get_kernel("empty_kernel") From 564a2bebce92f1f2daae97bf35b9123cdd5e5b8e Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 24 Jun 2026 10:53:55 -0700 Subject: [PATCH 8/9] cuda.core: harden GraphBuilder state machine and expand graph tests Add CLOSED state with GB_check_open guards, fix begin_building ordering so capture cleanup runs on partial failure, preserve FORKED primary-graph refs for conditional nodes, and defer Graph._init until instantiate succeeds. Add tests for Leo's review gaps including closed-builder errors and forked conditionals. --- cuda_core/cuda/core/graph/_graph_builder.pyx | 116 +++++++++++++----- cuda_core/tests/graph/test_graph_builder.py | 80 ++++++++++++ .../graph/test_graph_builder_conditional.py | 94 ++++++++++++++ 3 files changed, 262 insertions(+), 28 deletions(-) diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyx b/cuda_core/cuda/core/graph/_graph_builder.pyx index 63611423fc5..b1e5e3da51f 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyx +++ b/cuda_core/cuda/core/graph/_graph_builder.pyx @@ -11,6 +11,7 @@ from cuda.bindings cimport cydriver from cuda.core.graph._graph_definition cimport GraphCondition from cuda.core.graph._utils cimport _attach_host_callback_to_graph from cuda.core._resource_handles cimport ( + GraphHandle, as_cu, as_py, create_graph_exec_handle, create_graph_handle, create_graph_handle_ref, ) @@ -166,8 +167,9 @@ def _instantiate_graph(h_graph, options: GraphCompleteOptions | None = None) -> params.flags = flags py_exec = handle_return(driver.cuGraphInstantiateWithParams(h_graph, params)) - c_exec = int(py_exec) - graph = Graph._init(c_exec) + # Check result_out before wrapping the exec: on a non-SUCCESS result the exec + # may be invalid, and Graph._init's RAII deleter would call cuGraphExecDestroy + # on it during the exception unwind below. if params.result_out == driver.CUgraphInstantiateResult.CUDA_GRAPH_INSTANTIATE_ERROR: raise RuntimeError( "Instantiation failed for an unexpected reason which is described in the return value of the function." @@ -187,7 +189,9 @@ def _instantiate_graph(h_graph, options: GraphCompleteOptions | None = None) -> raise RuntimeError("One or more conditional handles are not associated with conditional builders.") elif params.result_out != driver.CUgraphInstantiateResult.CUDA_GRAPH_INSTANTIATE_SUCCESS: raise RuntimeError(f"Graph instantiation failed with unexpected error code: {params.result_out}") - return graph + + c_exec = int(py_exec) + return Graph._init(c_exec) # Distinguishes the three kinds of GraphBuilder, which differ in how they @@ -220,7 +224,8 @@ cdef enum _BuilderKind: cdef enum _CaptureState: CAPTURE_NOT_STARTED = 0 CAPTURING = 1 - CAPTURE_ENDED = 2 + CAPTURE_ENDED = 2 # Finished, valid handle + CLOSED = 3 # No valid handle cdef class GraphBuilder: @@ -238,16 +243,12 @@ cdef class GraphBuilder: def __init__(self): raise NotImplementedError( - "directly creating a Graph object can be ambiguous. Please either " + "directly creating a GraphBuilder object can be ambiguous. Please either " "call Device.create_graph_builder() or stream.create_graph_builder()" ) def __dealloc__(self): - # Note: _stream could be set to None by cyclic-GC tp_clear before - # __dealloc__, but _h_stream is guaranteed to be valid. - if self._h_stream and self._state == CAPTURING and self._kind != FORKED: - with nogil: - cydriver.cuStreamEndCapture(as_cu(self._h_stream), NULL) + GB_end_capture_if_needed(self, False) @staticmethod def _init(Stream stream): @@ -261,12 +262,11 @@ cdef class GraphBuilder: def close(self): """Destroy the graph builder.""" - if self._h_stream and self._state == CAPTURING and self._kind != FORKED: - with nogil: - HANDLE_RETURN(cydriver.cuStreamEndCapture(as_cu(self._h_stream), NULL)) + with nogil: + GB_end_capture_if_needed(self, True) self._h_graph.reset() self._h_stream.reset() - self._state = CAPTURE_ENDED + self._state = CLOSED self._stream = None @property @@ -295,6 +295,7 @@ cdef class GraphBuilder: Default set to use relaxed. """ + GB_check_open(self) if self._state != CAPTURE_NOT_STARTED: if self._state == CAPTURING: raise RuntimeError("Graph builder is already building.") @@ -318,20 +319,25 @@ cdef class GraphBuilder: with nogil: HANDLE_RETURN(cydriver.cuStreamBeginCaptureToGraph( c_stream, c_graph, NULL, NULL, 0, c_mode)) + self._state = CAPTURING else: with nogil: HANDLE_RETURN(cydriver.cuStreamBeginCapture(c_stream, c_mode)) - # The driver rejects NULL captureStatus_out, so we pass a - # stack-local even though begin_capture just succeeded and we - # only care about the resulting graph handle. + # Capture is active now; set CAPTURING before the calls below so a + # failure in _get_capture_info/create_graph_handle still lets + # cleanup end the capture rather than leaving the stream poisoned. + self._state = CAPTURING + with nogil: + # The driver rejects a NULL captureStatus_out, so pass a + # stack-local even though we only want the graph handle. _get_capture_info(c_stream, &c_status, &c_graph) self._h_graph = create_graph_handle(c_graph) - self._state = CAPTURING return self @property def is_building(self) -> bool: """Returns True if the graph builder is currently building.""" + GB_check_open(self) cdef cydriver.CUstream c_stream = as_cu(self._h_stream) cdef cydriver.CUstreamCaptureStatus status with nogil: @@ -349,6 +355,7 @@ cdef class GraphBuilder: def end_building(self) -> GraphBuilder: """Ends the building process.""" + GB_check_open(self) if not self.is_building: raise RuntimeError("Graph builder is not building.") cdef cydriver.CUstream c_stream = as_cu(self._h_stream) @@ -374,6 +381,7 @@ cdef class GraphBuilder: The newly built graph. """ + GB_check_open(self) if self._state != CAPTURE_ENDED: raise RuntimeError("Graph has not finished building.") @@ -390,6 +398,7 @@ cdef class GraphBuilder: Customizable dataclass for the debug print options. """ + GB_check_open(self) if self._state != CAPTURE_ENDED: raise RuntimeError("Graph has not finished building.") cdef unsigned int c_flags = options._to_flags() if options else 0 @@ -419,13 +428,16 @@ cdef class GraphBuilder: """ if count < 2: raise ValueError(f"Invalid split count: expecting >= 2, got {count}") + GB_check_open(self) + if self._state != CAPTURING: + raise RuntimeError("Graph builder must be building before it can be split.") event = self._stream.record() result = [self] for i in range(count - 1): stream = self._stream.device.create_stream() stream.wait(event) - result.append(_init_forked(stream)) + result.append(GB_init_forked(stream, self._h_graph)) event.close() return tuple(result) @@ -470,6 +482,7 @@ cdef class GraphBuilder: def __cuda_stream__(self) -> tuple[int, int]: """Return an instance of a __cuda_stream__ protocol.""" + GB_check_open(self) return self.stream.__cuda_stream__() def _get_conditional_context(self) -> driver.CUcontext: @@ -494,6 +507,7 @@ cdef class GraphBuilder: GraphCondition A condition variable for controlling conditional execution. """ + GB_check_open(self) if cy_driver_version() < (12, 3, 0): raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional handles") if cy_binding_version() < (12, 3, 0): @@ -533,6 +547,7 @@ cdef class GraphBuilder: The newly created conditional graph builder. """ + GB_check_open(self) if cy_driver_version() < (12, 3, 0): raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional if") if cy_binding_version() < (12, 3, 0): @@ -547,7 +562,7 @@ cdef class GraphBuilder: node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_IF node_params.conditional.size = 1 node_params.conditional.ctx = self._get_conditional_context() - return _cond_with_params(self, node_params)[0] + return GB_cond_with_params(self, node_params)[0] def if_else(self, condition: GraphCondition) -> tuple[GraphBuilder, GraphBuilder]: """Adds an if-else condition branch and returns new graph builders for both branches. @@ -569,6 +584,7 @@ cdef class GraphBuilder: A tuple of two new graph builders, one for the if branch and one for the else branch. """ + GB_check_open(self) if cy_driver_version() < (12, 8, 0): raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional if-else") if cy_binding_version() < (12, 8, 0): @@ -583,7 +599,7 @@ cdef class GraphBuilder: node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_IF node_params.conditional.size = 2 node_params.conditional.ctx = self._get_conditional_context() - return _cond_with_params(self, node_params) + return GB_cond_with_params(self, node_params) def switch(self, condition: GraphCondition, count: int) -> tuple[GraphBuilder, ...]: """Adds a switch condition branch and returns new graph builders for all cases. @@ -608,6 +624,7 @@ cdef class GraphBuilder: A tuple of new graph builders, one for each branch. """ + GB_check_open(self) if cy_driver_version() < (12, 8, 0): raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional switch") if cy_binding_version() < (12, 8, 0): @@ -622,7 +639,7 @@ cdef class GraphBuilder: node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_SWITCH node_params.conditional.size = count node_params.conditional.ctx = self._get_conditional_context() - return _cond_with_params(self, node_params) + return GB_cond_with_params(self, node_params) def while_loop(self, condition: GraphCondition) -> GraphBuilder: """Adds a while loop and returns a new graph builder for it. @@ -644,6 +661,7 @@ cdef class GraphBuilder: The newly created while loop graph builder. """ + GB_check_open(self) if cy_driver_version() < (12, 3, 0): raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional while loop") if cy_binding_version() < (12, 3, 0): @@ -658,7 +676,7 @@ cdef class GraphBuilder: node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_WHILE node_params.conditional.size = 1 node_params.conditional.ctx = self._get_conditional_context() - return _cond_with_params(self, node_params)[0] + return GB_cond_with_params(self, node_params)[0] def embed(self, GraphBuilder child): """Embed a previously-built :obj:`~graph.GraphBuilder` as a child node. @@ -668,6 +686,7 @@ cdef class GraphBuilder: child : :obj:`~graph.GraphBuilder` The child graph builder. Must have finished building. """ + GB_check_open(self) if child._state != CAPTURE_ENDED: raise ValueError("Child graph has not finished building.") @@ -728,6 +747,7 @@ cdef class GraphBuilder: pointer (caller manages lifetime). If bytes-like, the data is copied and its lifetime is tied to the graph. """ + GB_check_open(self) cdef Stream stream = self._stream cdef cydriver.CUstream c_stream = as_cu(stream._h_stream) cdef cydriver.CUstreamCaptureStatus capture_status @@ -747,9 +767,47 @@ cdef class GraphBuilder: HANDLE_RETURN(cydriver.cuLaunchHostFunc(c_stream, c_fn, c_user_data)) -cdef inline GraphBuilder _init_forked(Stream stream): +cdef inline int GB_check_open(GraphBuilder gb) except -1: + """Reject operations on a builder that has been closed. + + A CLOSED builder has reset its stream and graph handles, so any method + that dereferences them would read a null handle (or, for the cached + Stream, a None typed as cdef Stream). Guarding here yields a clear error + instead. + """ + if gb._state == CLOSED: + raise RuntimeError("Graph builder has been closed.") + return 0 + + +cdef inline int GB_end_capture_if_needed(GraphBuilder gb, bint check_status) except -1 nogil: + """End an in-progress capture if this builder owns it. + + Only a CAPTURING PRIMARY or CONDITIONAL_BODY builder owns the live + capture. A FORKED builder must not call cuStreamEndCapture: the driver + requires forked streams to be joined first. + + A NULL phGraph ends the capture and discards the graph; the driver + guards every write to phGraph (cuapiStreamEndCaptureCommon). + + check_status=True checks the driver return (close()); False ignores it + (__dealloc__). + """ + if gb._h_stream and gb._state == CAPTURING and gb._kind != FORKED: + if check_status: + HANDLE_RETURN(cydriver.cuStreamEndCapture(as_cu(gb._h_stream), NULL)) + else: + cydriver.cuStreamEndCapture(as_cu(gb._h_stream), NULL) + return 0 + + +cdef inline GraphBuilder GB_init_forked(Stream stream, GraphHandle h_primary_graph): cdef GraphBuilder gb = GraphBuilder.__new__(GraphBuilder) - # _h_graph not used for FORKED builders. Captures to primary graph. + # A FORKED builder captures into the primary's CUgraph. It holds the + # primary's GraphHandle so conditional bodies created on it (via + # GB_init_conditional -> create_graph_handle_ref(cond_graph, parent._h_graph)) + # have a valid parent handle to pin. + gb._h_graph = h_primary_graph gb._h_stream = stream._h_stream gb._kind = FORKED gb._state = CAPTURING @@ -757,7 +815,7 @@ cdef inline GraphBuilder _init_forked(Stream stream): return gb -cdef inline GraphBuilder _init_conditional(Stream stream, cydriver.CUgraph cond_graph, GraphBuilder parent): +cdef inline GraphBuilder GB_init_conditional(Stream stream, cydriver.CUgraph cond_graph, GraphBuilder parent): cdef GraphBuilder gb = GraphBuilder.__new__(GraphBuilder) gb._h_graph = create_graph_handle_ref(cond_graph, parent._h_graph) gb._h_stream = stream._h_stream @@ -786,7 +844,7 @@ cdef inline int _get_capture_info( stream, status, NULL, graph, NULL, NULL)) -cdef inline tuple _cond_with_params(GraphBuilder gb, node_params): +cdef inline tuple GB_cond_with_params(GraphBuilder gb, node_params): status, _, graph, *deps_info, num_dependencies = handle_return( driver.cuStreamGetCaptureInfo(gb._stream.handle) ) @@ -807,7 +865,7 @@ cdef inline tuple _cond_with_params(GraphBuilder gb, node_params): ) return tuple( - _init_conditional( + GB_init_conditional( gb._stream.device.create_stream(), int(node_params.conditional.phGraph_out[i]), gb, @@ -870,6 +928,8 @@ cdef class Graph: cdef cydriver.CUgraphExec cu_exec = as_cu(self._h_graph_exec) if isinstance(source, GraphBuilder): + if (source)._state == CLOSED: + raise ValueError("Source graph builder has been closed.") if (source)._state != CAPTURE_ENDED: raise ValueError("Graph has not finished building.") cu_graph = as_cu((source)._h_graph) diff --git a/cuda_core/tests/graph/test_graph_builder.py b/cuda_core/tests/graph/test_graph_builder.py index c2f87cc8ae4..8f12abcf226 100644 --- a/cuda_core/tests/graph/test_graph_builder.py +++ b/cuda_core/tests/graph/test_graph_builder.py @@ -166,6 +166,86 @@ def test_graph_capture_errors(init_cuda): gb.end_building().complete() +def test_graph_begin_building_twice(init_cuda): + """Calling begin_building() while already capturing is a clear error.""" + gb = Device().create_graph_builder() + gb.begin_building() + with pytest.raises(RuntimeError, match="^Graph builder is already building."): + gb.begin_building() + gb.end_building() + + +def test_graph_split_requires_building(init_cuda): + """A builder must be capturing before it can be split.""" + gb = Device().create_graph_builder() + with pytest.raises(RuntimeError, match="^Graph builder must be building before it can be split."): + gb.split(2) + + +def test_graph_complete_after_close_forked(init_cuda): + """complete() on a forked builder closed via join() must not deref a null handle.""" + mod = compile_common_kernels() + empty_kernel = mod.get_kernel("empty_kernel") + + gb = Device().create_graph_builder().begin_building() + launch(gb, LaunchConfig(grid=1, block=1), empty_kernel) + left, right = gb.split(2) + launch(left, LaunchConfig(grid=1, block=1), empty_kernel) + launch(right, LaunchConfig(grid=1, block=1), empty_kernel) + + # join() closes the non-root builder (right); it must now be rejected, not crash. + GraphBuilder.join(left, right) + with pytest.raises(RuntimeError, match="^Graph builder has been closed."): + right.complete() + + +def test_graph_update_after_source_close(init_cuda): + """Graph.update() with a closed source builder must raise, not deref a null handle.""" + mod = compile_common_kernels() + empty_kernel = mod.get_kernel("empty_kernel") + + gb = Device().create_graph_builder().begin_building() + launch(gb, LaunchConfig(grid=1, block=1), empty_kernel) + graph = gb.end_building().complete() + + source = Device().create_graph_builder().begin_building() + launch(source, LaunchConfig(grid=1, block=1), empty_kernel) + source.end_building() + source.close() + + with pytest.raises(ValueError, match="^Source graph builder has been closed."): + graph.update(source) + + +def test_graph_gc_mid_capture(init_cuda): + """Dropping a builder mid-capture ends the orphaned capture so the stream stays usable.""" + import gc + + mod = compile_common_kernels() + empty_kernel = mod.get_kernel("empty_kernel") + + stream = Device().create_stream() + gb = stream.create_graph_builder().begin_building() + launch(gb, LaunchConfig(grid=1, block=1), empty_kernel) + + # Drop the builder without end_building()/close(); __dealloc__ must end the capture. + del gb + gc.collect() + + # If the capture were left active, the stream would be poisoned for new work. + launch(stream, LaunchConfig(grid=1, block=1), empty_kernel) + stream.sync() + stream.close() + + +def test_graph_embed_non_builder(init_cuda): + """embed() rejects a non-GraphBuilder argument with a TypeError.""" + gb = Device().create_graph_builder().begin_building() + with pytest.raises(TypeError): + gb.embed(object()) + gb.end_building() + + def test_graph_capture_callback_python(init_cuda): results = [] diff --git a/cuda_core/tests/graph/test_graph_builder_conditional.py b/cuda_core/tests/graph/test_graph_builder_conditional.py index de65848c1a0..6180c1927ef 100644 --- a/cuda_core/tests/graph/test_graph_builder_conditional.py +++ b/cuda_core/tests/graph/test_graph_builder_conditional.py @@ -289,3 +289,97 @@ def test_graph_conditional_while(init_cuda, condition_value): # Close the memory resource now because the garbage collected might # de-allocate it during the next graph builder process b.close() + + +@requires_module(np, "2.1") +def test_graph_conditional_on_forked_builder(init_cuda): + """A conditional created on a forked builder keeps its body graph's parent + handle pinned to the owning primary graph.""" + mod = compile_conditional_kernels(int) + add_one = mod.get_kernel("add_one") + set_handle = mod.get_kernel("set_handle") + + launch_stream = Device().create_stream() + mr = LegacyPinnedMemoryResource() + b = mr.allocate(4) + arr = np.from_dlpack(b).view(np.int32) + arr[0] = 0 + + gb = Device().create_graph_builder().begin_building() + launch(gb, LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) + + # Fork, then create the conditional on the forked builder (not the primary). + left, right = gb.split(2) + try: + condition = right.create_condition() + except RuntimeError as e: + with pytest.raises(RuntimeError, match="^(Driver|Binding) version"): + raise e + right.end_building() + GraphBuilder.join(left, right).end_building() + b.close() + pytest.skip("Driver does not support conditional handle") + launch(right, LaunchConfig(grid=1, block=1), set_handle, condition, 1) + gb_if = right.if_then(condition).begin_building() + launch(gb_if, LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) + gb_if.end_building() + + gb = GraphBuilder.join(left, right) + graph = gb.end_building().complete() + + arr[0] = 0 + graph.launch(launch_stream) + launch_stream.sync() + # add_one on primary (1) + add_one inside the taken if-branch (1) + assert arr[0] == 2 + + b.close() + + +@requires_module(np, "2.1") +def test_graph_conditional_nested(init_cuda): + """A conditional nested inside another conditional body exercises the + multi-level body -> outer-body -> primary keep-alive chain.""" + mod = compile_conditional_kernels(int) + add_one = mod.get_kernel("add_one") + set_handle = mod.get_kernel("set_handle") + + launch_stream = Device().create_stream() + mr = LegacyPinnedMemoryResource() + b = mr.allocate(4) + arr = np.from_dlpack(b).view(np.int32) + arr[0] = 0 + + gb = Device().create_graph_builder().begin_building() + + try: + outer_condition = gb.create_condition() + except RuntimeError as e: + with pytest.raises(RuntimeError, match="^(Driver|Binding) version"): + raise e + gb.end_building() + b.close() + pytest.skip("Driver does not support conditional handle") + launch(gb, LaunchConfig(grid=1, block=1), set_handle, outer_condition, 1) + + # Outer if-branch + gb_outer = gb.if_then(outer_condition).begin_building() + launch(gb_outer, LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) + + # Inner if-branch, created inside the outer body + inner_condition = gb_outer.create_condition() + launch(gb_outer, LaunchConfig(grid=1, block=1), set_handle, inner_condition, 1) + gb_inner = gb_outer.if_then(inner_condition).begin_building() + launch(gb_inner, LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) + gb_inner.end_building() + gb_outer.end_building() + + graph = gb.end_building().complete() + + arr[0] = 0 + graph.launch(launch_stream) + launch_stream.sync() + # add_one in outer body (1) + add_one in inner body (1) + assert arr[0] == 2 + + b.close() From 1aec3e5397170ba0ad8aa8470250019182d8e2ba Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 24 Jun 2026 10:58:28 -0700 Subject: [PATCH 9/9] fix(cuda.core): resolve merge fallout in resource handle stubs Drop an unnecessary cimport from _resource_handles.pyx that broke stubgen/mypy after merging main, regenerate the affected .pyi files, and fix debug_dot_print path encoding for Cython 3.2. --- cuda_core/cuda/core/_resource_handles.pyi | 1 + cuda_core/cuda/core/_resource_handles.pyx | 17 ------ cuda_core/cuda/core/graph/_graph_builder.pyi | 55 +++++--------------- cuda_core/cuda/core/graph/_graph_builder.pyx | 2 +- 4 files changed, 16 insertions(+), 59 deletions(-) diff --git a/cuda_core/cuda/core/_resource_handles.pyi b/cuda_core/cuda/core/_resource_handles.pyi index 490073c9fd1..8b2d9e75e18 100644 --- a/cuda_core/cuda/core/_resource_handles.pyi +++ b/cuda_core/cuda/core/_resource_handles.pyi @@ -13,6 +13,7 @@ DevicePtrHandle = shared_ptr LibraryHandle = shared_ptr KernelHandle = shared_ptr GraphHandle = shared_ptr +GraphExecHandle = shared_ptr GraphNodeHandle = shared_ptr GraphicsResourceHandle = shared_ptr NvrtcProgramHandle = shared_ptr diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 5f6f308475b..ba7d078a3fa 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -18,23 +18,6 @@ from cuda.bindings cimport cynvrtc from cuda.bindings cimport cynvvm from cuda.bindings cimport cynvjitlink -from ._resource_handles cimport ( - ContextHandle, - StreamHandle, - EventHandle, - MemoryPoolHandle, - DevicePtrHandle, - LibraryHandle, - KernelHandle, - GraphHandle, - GraphExecHandle, - GraphicsResourceHandle, - NvrtcProgramHandle, - NvvmProgramHandle, - NvJitLinkHandle, - CuLinkHandle, -) - import cuda.bindings.cydriver as cydriver import cuda.bindings.cynvrtc as cynvrtc import cuda.bindings.cynvvm as cynvvm diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyi b/cuda_core/cuda/core/graph/_graph_builder.pyi index 6dbca20e60b..af1748ad86c 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyi +++ b/cuda_core/cuda/core/graph/_graph_builder.pyi @@ -8,6 +8,8 @@ from cuda.core._stream import Stream from cuda.core._utils.cuda_utils import driver from cuda.core.graph._graph_definition import GraphCondition, GraphDefinition +_BuilderKind = int +_CaptureState = int @dataclass class GraphDebugPrintOptions: @@ -106,23 +108,19 @@ class GraphBuilder: """ - class _MembersNeededForFinalize: - __slots__ = ('conditional_graph', 'graph', 'is_join_required', 'is_stream_owner', 'stream') - - def __init__(self, graph_builder_obj: GraphBuilder, stream_obj: Stream | None, is_stream_owner: bool, conditional_graph, is_join_required: bool) -> None: - ... - - def close(self) -> None: - ... - __slots__ = ('__weakref__', '_building_ended', '_mnff') + def __init__(self): + ... - def __init__(self) -> None: + def __dealloc__(self): ... - @classmethod - def _init(cls, stream: Stream | None, is_stream_owner: bool, conditional_graph: object=None, is_join_required: bool=False) -> GraphBuilder: + @staticmethod + def _init(stream: Stream): ... + def close(self): + """Destroy the graph builder.""" + @property def stream(self) -> Stream: """Returns the stream associated with the graph builder.""" @@ -155,7 +153,7 @@ class GraphBuilder: def end_building(self) -> GraphBuilder: """Ends the building process.""" - def complete(self, options: GraphCompleteOptions | None=None) -> 'Graph': + def complete(self, options: GraphCompleteOptions | None=None) -> Graph: """Completes the graph builder and returns the built :obj:`~graph.Graph` object. Parameters @@ -245,9 +243,6 @@ class GraphBuilder: A condition variable for controlling conditional execution. """ - def _cond_with_params(self, node_params: object) -> tuple[GraphBuilder, ...]: - ... - def if_then(self, condition: GraphCondition) -> GraphBuilder: """Adds an if condition branch and returns a new graph builder for it. @@ -335,15 +330,7 @@ class GraphBuilder: """ - def close(self) -> None: - """Destroy the graph builder. - - Closes the associated stream if we own it. Borrowed stream - object will instead have their references released. - - """ - - def embed(self, child: GraphBuilder) -> None: + def embed(self, child: GraphBuilder): """Embed a previously-built :obj:`~graph.GraphBuilder` as a child node. Parameters @@ -392,21 +379,7 @@ class Graph: """ - class _MembersNeededForFinalize: - __slots__ = 'graph' - - def __init__(self, graph_obj: Graph, graph: driver.CUgraphExec) -> None: - ... - - def close(self) -> None: - ... - __slots__ = ('__weakref__', '_mnff') - - def __init__(self) -> None: - ... - - @classmethod - def _init(cls, graph: driver.CUgraphExec) -> Graph: + def __init__(self): ... def close(self) -> None: @@ -457,5 +430,5 @@ class Graph: """ __all__ = ['Graph', 'GraphBuilder', 'GraphCompleteOptions', 'GraphDebugPrintOptions'] -def _instantiate_graph(h_graph, options: GraphCompleteOptions | None=None) -> 'Graph': +def _instantiate_graph(h_graph, options: GraphCompleteOptions | None=None) -> Graph: ... \ No newline at end of file diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyx b/cuda_core/cuda/core/graph/_graph_builder.pyx index ada2445114a..cea4a38ba24 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyx +++ b/cuda_core/cuda/core/graph/_graph_builder.pyx @@ -407,7 +407,7 @@ cdef class GraphBuilder: raise RuntimeError("Graph has not finished building.") cdef unsigned int c_flags = options._to_flags() if options else 0 cdef cydriver.CUgraph c_graph = as_cu(self._h_graph) - cdef bytes b_path = path.encode() if isinstance(path, str) else path + cdef bytes b_path = path.encode('utf-8') cdef const char* c_path = b_path with nogil: HANDLE_RETURN(cydriver.cuGraphDebugDotPrint(c_graph, c_path, c_flags))