From a5c9a0b21e49b0c2a1f0a4290311bac329fe1371 Mon Sep 17 00:00:00 2001
From: Bryce Adelstein Lelbach <brycelelbach@gmail.com>
Date: Fri, 26 Jun 2026 22:37:21 +0000
Subject: [PATCH] Wrap timed custom_kernel launches in a cudaProfiler capture
 range
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The benchmark timing loop in eval.py interleaves, per repeat, an L2-cache flush
and (in recheck/leaderboard modes) a cuSOLVER/cuBLAS reference correctness check
around the timed custom_kernel launches. Profiling eval.py directly therefore
captures all of that — warmup, the L2 flush, and the reference solver — not just
the submission's kernels, making nsys/ncu output hard to attribute.

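Wrap only the timed `custom_kernel` launches in `torch.cuda.profiler.profile()`,
which brackets them with cudaProfilerStart()/cudaProfilerStop(). A profiler
started with `nsys --capture-range=cudaProfilerApi` or
`ncu --profile-from-start off` then records exactly those launches and nothing
else. The range is re-entered on every repeat, and nsys by default stops
collecting after the first range; pass `--capture-range-end=repeat` to capture
the later ones as well. The start/stop calls do nothing when no profiler is
attached, so test, benchmark, and leaderboard runs and their reported timings
are unchanged.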

Applied to both linalg problems that ship a problem-local eval.py with this
timing loop: qr_v2 and eigh_py.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 problems/linalg/eigh_py/eval.py | 9 ++++++++-
 problems/linalg/qr_v2/eval.py   | 9 ++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/problems/linalg/eigh_py/eval.py b/problems/linalg/eigh_py/eval.py
index c0dd353a2..fc21186e3 100644
--- a/problems/linalg/eigh_py/eval.py
+++ b/problems/linalg/eigh_py/eval.py
@@ -198,7 +198,14 @@ def _run_single_benchmark(
         start_event = torch.cuda.Event(enable_timing=True)
         end_event = torch.cuda.Event(enable_timing=True)
         start_event.record()
-        outputs = [custom_kernel(data) for data in data_list]
+        # Capture range for external profilers: under nsys
+        # --capture-range=cudaProfilerApi or ncu --profile-from-start off,
+        # only the timed custom_kernel launches are recorded, not the warmup,
+        # clear_l2_cache, or the reference checker below. A no-op when no
+        # profiler is attached, so normal test/benchmark/leaderboard runs and
+        # their timing are unaffected.
+        with torch.cuda.profiler.profile():
+            outputs = [custom_kernel(data) for data in data_list]
         end_event.record()
         torch.cuda.synchronize()
         durations.append(start_event.elapsed_time(end_event) * 1e6 / len(data_list))
diff --git a/problems/linalg/qr_v2/eval.py b/problems/linalg/qr_v2/eval.py
index cd2c6bd36..64db88ada 100644
--- a/problems/linalg/qr_v2/eval.py
+++ b/problems/linalg/qr_v2/eval.py
@@ -200,7 +200,14 @@ def _run_single_benchmark(
         start_event = torch.cuda.Event(enable_timing=True)
         end_event = torch.cuda.Event(enable_timing=True)
         start_event.record()
-        outputs = [custom_kernel(data) for data in data_list]
+        # Capture range for external profilers: under nsys
+        # --capture-range=cudaProfilerApi or ncu --profile-from-start off,
+        # only the timed custom_kernel launches are recorded, not the warmup,
+        # clear_l2_cache, or the reference checker below. A no-op when no
+        # profiler is attached, so normal test/benchmark/leaderboard runs and
+        # their timing are unaffected.
+        with torch.cuda.profiler.profile():
+            outputs = [custom_kernel(data) for data in data_list]
         end_event.record()
         torch.cuda.synchronize()
         durations.append(start_event.elapsed_time(end_event) * 1e6 / len(data_list))