From a5c9a0b21e49b0c2a1f0a4290311bac329fe1371 Mon Sep 17 00:00:00 2001
From: Bryce Adelstein Lelbach
Date: Fri, 26 Jun 2026 22:37:21 +0000
Subject: [PATCH] Wrap timed custom_kernel launches in a cudaProfiler capture range
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The benchmark timing loop in eval.py interleaves, per repeat, an
L2-cache flush and (in recheck/leaderboard modes) a cuSOLVER/cuBLAS
reference correctness check around the timed custom_kernel launches.
Profiling eval.py directly therefore captures all of that — warmup, the
L2 flush, and the reference solver — not just the submission's kernels,
making nsys/ncu output hard to attribute.

Wrap only the timed `custom_kernel` launches in
`torch.cuda.profiler.profile()`. A profiler started with
`nsys --capture-range=cudaProfilerApi` or `ncu --profile-from-start off`
then records exactly those launches and nothing else. The context
manager is a no-op when no profiler is attached, so test, benchmark, and
leaderboard runs and their reported timings are unchanged.

Applied to both linalg problems that ship a problem-local eval.py with
this timing loop: qr_v2 and eigh_py.

Co-Authored-By: Claude Opus 4.8 (1M context) --- problems/linalg/eigh_py/eval.py | 9 ++++++++- problems/linalg/qr_v2/eval.py | 9 ++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/problems/linalg/eigh_py/eval.py b/problems/linalg/eigh_py/eval.py index c0dd353a2..fc21186e3 100644 --- a/problems/linalg/eigh_py/eval.py +++ b/problems/linalg/eigh_py/eval.py @@ -198,7 +198,14 @@ def _run_single_benchmark( start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) start_event.record() - outputs = [custom_kernel(data) for data in data_list] + # The capture range a profiler narrows to with nsys + # --capture-range=cudaProfilerApi / ncu --profile-from-start off: it + # records only the timed custom_kernel launches, not the warmup, + # clear_l2_cache, or the reference checker below. A no-op when no + # profiler is attached, so normal test/benchmark/leaderboard runs and + # their timing are unaffected. + with torch.cuda.profiler.profile(): + outputs = [custom_kernel(data) for data in data_list] end_event.record() torch.cuda.synchronize() durations.append(start_event.elapsed_time(end_event) * 1e6 / len(data_list)) diff --git a/problems/linalg/qr_v2/eval.py b/problems/linalg/qr_v2/eval.py index cd2c6bd36..64db88ada 100644 --- a/problems/linalg/qr_v2/eval.py +++ b/problems/linalg/qr_v2/eval.py @@ -200,7 +200,14 @@ def _run_single_benchmark( start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) start_event.record() - outputs = [custom_kernel(data) for data in data_list] + # The capture range a profiler narrows to with nsys + # --capture-range=cudaProfilerApi / ncu --profile-from-start off: it + # records only the timed custom_kernel launches, not the warmup, + # clear_l2_cache, or the reference checker below. A no-op when no + # profiler is attached, so normal test/benchmark/leaderboard runs and + # their timing are unaffected. 
+ with torch.cuda.profiler.profile(): + outputs = [custom_kernel(data) for data in data_list] end_event.record() torch.cuda.synchronize() durations.append(start_event.elapsed_time(end_event) * 1e6 / len(data_list))