diff --git a/problems/linalg/eigh_py/eval.py b/problems/linalg/eigh_py/eval.py
index c0dd353a2..fc21186e3 100644
--- a/problems/linalg/eigh_py/eval.py
+++ b/problems/linalg/eigh_py/eval.py
@@ -198,7 +198,14 @@ def _run_single_benchmark(
         start_event = torch.cuda.Event(enable_timing=True)
         end_event = torch.cuda.Event(enable_timing=True)
         start_event.record()
-        outputs = [custom_kernel(data) for data in data_list]
+        # Mark the timed launches as the profiler capture range (nsys
+        # --capture-range=cudaProfilerApi / ncu --profile-from-start off), so
+        # a profile records only the timed custom_kernel launches, not the
+        # warmup, clear_l2_cache, or the reference checker below. A no-op when
+        # no profiler is attached, so normal test/benchmark/leaderboard runs
+        # and their timing are unaffected.
+        with torch.cuda.profiler.profile():
+            outputs = [custom_kernel(data) for data in data_list]
         end_event.record()
         torch.cuda.synchronize()
         durations.append(start_event.elapsed_time(end_event) * 1e6 / len(data_list))
diff --git a/problems/linalg/qr_v2/eval.py b/problems/linalg/qr_v2/eval.py
index cd2c6bd36..64db88ada 100644
--- a/problems/linalg/qr_v2/eval.py
+++ b/problems/linalg/qr_v2/eval.py
@@ -200,7 +200,14 @@ def _run_single_benchmark(
         start_event = torch.cuda.Event(enable_timing=True)
         end_event = torch.cuda.Event(enable_timing=True)
         start_event.record()
-        outputs = [custom_kernel(data) for data in data_list]
+        # Mark the timed launches as the profiler capture range (nsys
+        # --capture-range=cudaProfilerApi / ncu --profile-from-start off), so
+        # a profile records only the timed custom_kernel launches, not the
+        # warmup, clear_l2_cache, or the reference checker below. A no-op when
+        # no profiler is attached, so normal test/benchmark/leaderboard runs
+        # and their timing are unaffected.
+        with torch.cuda.profiler.profile():
+            outputs = [custom_kernel(data) for data in data_list]
         end_event.record()
         torch.cuda.synchronize()
         durations.append(start_event.elapsed_time(end_event) * 1e6 / len(data_list))