From 4196dbf5e2cb8106dfcf29627a7e69541d30f9b1 Mon Sep 17 00:00:00 2001
From: chiro <chiro@local>
Date: Mon, 22 Jun 2026 19:30:38 +0000
Subject: [PATCH] eval: hand kernel a fresh input object each timed iteration
 (close result-replay vector in scored path)

---
 examples/eval.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/examples/eval.py b/examples/eval.py
index 187e11cd3..810407064 100644
--- a/examples/eval.py
+++ b/examples/eval.py
@@ -256,6 +256,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
     durations = []
     # generate input data once
     data = generate_input(**test.args)
+    bench_template = data  # kept as a template for fresh per-iteration copies (anti-replay)
     check_copy = _clone_data(data, 0)
     #  first, one obligatory correctness check
     output = custom_kernel(data)
@@ -268,6 +269,14 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
     # otherwise, we repeat until we either measure at least 10 full seconds,
     # or the relative error of the mean is below 1%.
 
+    # ANTI-REPLAY: even in the non-recheck (ranked/benchmark) path, hand the kernel a FRESH
+    # tensor object every timed iteration. Without this, the same `data` object is reused, so a
+    # submission can cache on the first call (keyed on object identity / .data_ptr() / ._version)
+    # and return the cached output with zero compute for all timed iterations -- the dominant
+    # "result replay" hack family. The clone happens OUTSIDE the timed region (before
+    # start_event.record()), so it does not inflate the measured kernel time. We retain the two
+    # most recent inputs so the caching allocator cannot immediately recycle a freed data_ptr.
+    _recent_inputs = []
     bm_start_time = time.perf_counter_ns()
     for i in range(max_repeats):
         if recheck:
@@ -277,6 +286,11 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
 
             data = generate_input(**test.args)
             check_copy = _clone_data(data, 0)
+        else:
+            data = _clone_data(bench_template, 0)
+            _recent_inputs.append(data)
+            if len(_recent_inputs) > 2:
+                _recent_inputs.pop(0)
         torch.cuda.synchronize()
         start_event = torch.cuda.Event(enable_timing=True)
         end_event = torch.cuda.Event(enable_timing=True)