From e870a6bd985bf1c088ebf247d032126017f02207 Mon Sep 17 00:00:00 2001 From: Luca Giordano Date: Fri, 3 Jul 2026 18:11:29 +0200 Subject: [PATCH] Retry transient agent-turn failures in the sandbox runner sandbox.run() calls occasionally fail on transient network blips (e.g. "API Error: Server disconnected") rather than real task failures. Extract withRetry into a standalone retry.ts (avoids a circular import between main.ts and sandbox-runner.ts, which main.ts already imports from), add a shouldRetry predicate so callers can skip retrying errors that retrying can't fix (session limits, auth), and wrap both sandbox.run() call sites in sandbox-runner.ts with a new isTransientAgentError classifier. Co-Authored-By: Claude Sonnet 5 --- .sandcastle/main.ts | 42 ++--------- .sandcastle/package.json | 2 +- .sandcastle/reduce.test.ts | 74 +------------------- .sandcastle/retry.test.ts | 108 +++++++++++++++++++++++++++++ .sandcastle/retry.ts | 45 ++++++++++++ .sandcastle/sandbox-runner.test.ts | 30 +++++++- .sandcastle/sandbox-runner.ts | 65 +++++++++++++---- 7 files changed, 239 insertions(+), 127 deletions(-) create mode 100644 .sandcastle/retry.test.ts create mode 100644 .sandcastle/retry.ts diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index 629b3b2..124e09d 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -36,6 +36,8 @@ import { SandboxRunner, SANDBOX_LABEL, PROJECT_LABEL_KEY, deriveProject } from " import { ReviewerAdapter } from "./reviewer-adapter.ts"; import { readMemoryContext, writeMemoryEntry } from "./memory-store.ts"; import { parseMemoryTags, buildOutcomeEntry, type OutcomeAttempt } from "./memory.ts"; +import { withRetry } from "./retry.ts"; +export { withRetry }; export { SANDBOX_LABEL }; @@ -190,43 +192,9 @@ export function parseConcurrency(): number { return Math.max(1, Number(process.env.AGENTIC_CONCURRENCY ?? "1") || 1); } -/** - * Retry `fn` up to `maxAttempts` times with exponential backoff. - * Throws the last error when all attempts are exhausted. - * `sleep` is injectable for unit tests (no real network or timers needed). - */ -export async function withRetry( - fn: () => Promise, - opts: { - maxAttempts?: number; - baseDelayMs?: number; - label?: string; - sleep?: (ms: number) => Promise; - } = {}, -): Promise { - const { - maxAttempts = 4, - baseDelayMs = 2_000, - label = "operation", - sleep = (ms) => new Promise((r) => setTimeout(r, ms)), - } = opts; - let lastErr: unknown; - for (let attempt = 1; attempt <= maxAttempts; attempt++) { - try { - return await fn(); - } catch (err) { - lastErr = err; - if (attempt < maxAttempts) { - const delay = baseDelayMs * 2 ** (attempt - 1); - console.warn( - `[retry] ${label} failed (attempt ${attempt}/${maxAttempts}), retrying in ${delay}ms`, - ); - await sleep(delay); - } - } - } - throw lastErr; -} +// withRetry lives in retry.ts (shared with sandbox-runner.ts, which main.ts +// itself imports — keeping it there avoids a circular import). Imported and +// re-exported at the top of this file for existing consumers. type ShellExec = (file: string, args: string[]) => Promise<{ stdout: string | Buffer }>; diff --git a/.sandcastle/package.json b/.sandcastle/package.json index 03007f8..c93da76 100644 --- a/.sandcastle/package.json +++ b/.sandcastle/package.json @@ -8,7 +8,7 @@ "postinstall": "patch-package", "start": "tsx main.ts", "typecheck": "tsc --noEmit", - "test": "node --import tsx --test reduce.test.ts sandbox-runner.test.ts reviewer-adapter.test.ts memory.test.ts memory-store.test.ts run-sh.test.ts init-sh.test.ts up-sh.test.ts afk-cmd.test.ts context-compressor.test.ts", + "test": "node --import tsx --test reduce.test.ts sandbox-runner.test.ts reviewer-adapter.test.ts memory.test.ts memory-store.test.ts run-sh.test.ts init-sh.test.ts up-sh.test.ts afk-cmd.test.ts context-compressor.test.ts retry.test.ts", "test:integration": "SANDCASTLE_INTEGRATION=1 node --import tsx --test integration.test.ts" }, "devDependencies": { diff --git a/.sandcastle/reduce.test.ts b/.sandcastle/reduce.test.ts index 711e49e..4bcca30 100644 --- a/.sandcastle/reduce.test.ts +++ b/.sandcastle/reduce.test.ts @@ -10,7 +10,7 @@ import { test } from "node:test"; import assert from "node:assert/strict"; import { reduce, READY_LABEL, IN_PROGRESS_LABEL, IN_REVIEW_LABEL, reviewVerdict, reconcileFromLabels, type State, type CiStatus, type Pr, type ReviewOutput } from "./reduce.ts"; import { parseBlockedBy, formatReviewAsComment } from "./issue-source.ts"; -import { sweepOrphanedSandboxes, ensureSandboxNetwork, parseConcurrency, withRetry, resetAgentBranch, refreshBase, validateSignature, classifyDelivery, parseSmeeEvent, parseOrchEnv, resolveCredentials, resolveRunMode, resolveDockerHost } from "./main.ts"; +import { sweepOrphanedSandboxes, ensureSandboxNetwork, parseConcurrency, resetAgentBranch, refreshBase, validateSignature, classifyDelivery, parseSmeeEvent, parseOrchEnv, resolveCredentials, resolveRunMode, resolveDockerHost } from "./main.ts"; import { createHmac } from "node:crypto"; import { SANDBOX_LABEL, PROJECT_LABEL_KEY, deriveProject } from "./sandbox-runner.ts"; @@ -638,78 +638,6 @@ test("parseConcurrency: invalid value falls back to 1", () => { } }); -// ─── withRetry ─────────────────────────────────────────────────────────────── - -test("withRetry: resolves immediately when fn succeeds on first attempt", async () => { - const sleeps: number[] = []; - const result = await withRetry(() => Promise.resolve(42), { - sleep: (ms) => { sleeps.push(ms); return Promise.resolve(); }, - }); - assert.equal(result, 42); - assert.deepEqual(sleeps, [], "no sleep when fn succeeds first time"); -}); - -test("withRetry: retries on failure and resolves when fn eventually succeeds", async () => { - let calls = 0; - const sleeps: number[] = []; - const result = await withRetry( - () => { - calls++; - if (calls < 3) return Promise.reject(new Error("transient")); - return Promise.resolve("ok"); - }, - { - maxAttempts: 4, - baseDelayMs: 100, - sleep: (ms) => { sleeps.push(ms); return Promise.resolve(); }, - }, - ); - assert.equal(result, "ok"); - assert.equal(calls, 3, "fn called exactly 3 times"); - assert.equal(sleeps.length, 2, "slept between each failed attempt"); -}); - -test("withRetry: throws last error after maxAttempts exhausted", async () => { - let calls = 0; - const err = new Error("persistent failure"); - await assert.rejects( - () => - withRetry(() => { calls++; return Promise.reject(err); }, { - maxAttempts: 3, - baseDelayMs: 10, - sleep: () => Promise.resolve(), - }), - (thrown: Error) => thrown === err, - ); - assert.equal(calls, 3, "fn tried exactly maxAttempts times"); -}); - -test("withRetry: sleeps with exponential backoff between attempts", async () => { - const sleeps: number[] = []; - await assert.rejects( - () => - withRetry(() => Promise.reject(new Error("fail")), { - maxAttempts: 4, - baseDelayMs: 50, - sleep: (ms) => { sleeps.push(ms); return Promise.resolve(); }, - }), - ); - assert.deepEqual(sleeps, [50, 100, 200], "delays double each retry (2^0, 2^1, 2^2 * baseDelayMs)"); -}); - -test("withRetry: does not sleep after the final failed attempt", async () => { - const sleeps: number[] = []; - await assert.rejects( - () => - withRetry(() => Promise.reject(new Error("fail")), { - maxAttempts: 2, - baseDelayMs: 100, - sleep: (ms) => { sleeps.push(ms); return Promise.resolve(); }, - }), - ); - assert.equal(sleeps.length, 1, "only sleeps between attempts, not after the last failure"); -}); - // ─── Conflicting PR (issue #23) ────────────────────────────────────────────── test("conflicting PR does not keep loop alive — Stop emitted when nothing else pending", () => { diff --git a/.sandcastle/retry.test.ts b/.sandcastle/retry.test.ts new file mode 100644 index 0000000..23e48f5 --- /dev/null +++ b/.sandcastle/retry.test.ts @@ -0,0 +1,108 @@ +/** + * Unit tests for withRetry — the generic backoff-retry wrapper used both by + * main.ts (transient gh/GitHub API failures) and sandbox-runner.ts (transient + * agent-turn failures, gated by isTransientAgentError). + * + * Run: npm test (picks up all *.test.ts files in the test script) + */ +import { test } from "node:test"; +import assert from "node:assert/strict"; +import { withRetry } from "./retry.ts"; + +test("withRetry: resolves immediately when fn succeeds on first attempt", async () => { + const sleeps: number[] = []; + const result = await withRetry(() => Promise.resolve(42), { + sleep: (ms) => { sleeps.push(ms); return Promise.resolve(); }, + }); + assert.equal(result, 42); + assert.deepEqual(sleeps, [], "no sleep when fn succeeds first time"); +}); + +test("withRetry: retries on failure and resolves when fn eventually succeeds", async () => { + let calls = 0; + const sleeps: number[] = []; + const result = await withRetry( + () => { + calls++; + if (calls < 3) return Promise.reject(new Error("transient")); + return Promise.resolve("ok"); + }, + { + maxAttempts: 4, + baseDelayMs: 100, + sleep: (ms) => { sleeps.push(ms); return Promise.resolve(); }, + }, + ); + assert.equal(result, "ok"); + assert.equal(calls, 3, "fn called exactly 3 times"); + assert.equal(sleeps.length, 2, "slept between each failed attempt"); +}); + +test("withRetry: throws last error after maxAttempts exhausted", async () => { + let calls = 0; + const err = new Error("persistent failure"); + await assert.rejects( + () => + withRetry(() => { calls++; return Promise.reject(err); }, { + maxAttempts: 3, + baseDelayMs: 10, + sleep: () => Promise.resolve(), + }), + (thrown: Error) => thrown === err, + ); + assert.equal(calls, 3, "fn tried exactly maxAttempts times"); +}); + +test("withRetry: sleeps with exponential backoff between attempts", async () => { + const sleeps: number[] = []; + await assert.rejects( + () => + withRetry(() => Promise.reject(new Error("fail")), { + maxAttempts: 4, + baseDelayMs: 50, + sleep: (ms) => { sleeps.push(ms); return Promise.resolve(); }, + }), + ); + assert.deepEqual(sleeps, [50, 100, 200], "delays double each retry (2^0, 2^1, 2^2 * baseDelayMs)"); +}); + +test("withRetry: does not sleep after the final failed attempt", async () => { + const sleeps: number[] = []; + await assert.rejects( + () => + withRetry(() => Promise.reject(new Error("fail")), { + maxAttempts: 2, + baseDelayMs: 100, + sleep: (ms) => { sleeps.push(ms); return Promise.resolve(); }, + }), + ); + assert.equal(sleeps.length, 1, "only sleeps between attempts, not after the last failure"); +}); + +// ─── shouldRetry predicate ─────────────────────────────────────────────────── + +test("withRetry: shouldRetry=false throws immediately without further attempts", async () => { + let calls = 0; + const err = new Error("not worth retrying"); + await assert.rejects( + () => + withRetry(() => { calls++; return Promise.reject(err); }, { + maxAttempts: 4, + baseDelayMs: 10, + sleep: () => Promise.resolve(), + shouldRetry: () => false, + }), + (thrown: Error) => thrown === err, + ); + assert.equal(calls, 1, "fn called exactly once — no retries when shouldRetry is false"); +}); + +test("withRetry: shouldRetry defaults to true (retries on any error) when omitted", async () => { + let calls = 0; + const result = await withRetry( + () => { calls++; return calls < 2 ? Promise.reject(new Error("x")) : Promise.resolve("ok"); }, + { maxAttempts: 2, baseDelayMs: 10, sleep: () => Promise.resolve() }, + ); + assert.equal(result, "ok"); + assert.equal(calls, 2); +}); diff --git a/.sandcastle/retry.ts b/.sandcastle/retry.ts new file mode 100644 index 0000000..390917b --- /dev/null +++ b/.sandcastle/retry.ts @@ -0,0 +1,45 @@ +/** + * Retry `fn` up to `maxAttempts` times with exponential backoff. + * Throws the last error when all attempts are exhausted, or immediately + * (no further attempts) when `shouldRetry` returns false for a given error — + * e.g. a session-limit or auth failure that retrying won't fix, vs. a + * transient network blip that will likely clear on its own. + * `sleep` is injectable for unit tests (no real network or timers needed). + */ +export async function withRetry( + fn: () => Promise, + opts: { + maxAttempts?: number; + baseDelayMs?: number; + label?: string; + sleep?: (ms: number) => Promise; + shouldRetry?: (err: unknown) => boolean; + } = {}, +): Promise { + const { + maxAttempts = 4, + baseDelayMs = 2_000, + label = "operation", + sleep = (ms) => new Promise((r) => setTimeout(r, ms)), + shouldRetry = () => true, + } = opts; + let lastErr: unknown; + for (let attempt = 1; attempt <= maxAttempts; attempt++) { + try { + return await fn(); + } catch (err) { + lastErr = err; + if (!shouldRetry(err)) { + throw err; + } + if (attempt < maxAttempts) { + const delay = baseDelayMs * 2 ** (attempt - 1); + console.warn( + `[retry] ${label} failed (attempt ${attempt}/${maxAttempts}), retrying in ${delay}ms`, + ); + await sleep(delay); + } + } + } + throw lastErr; +} diff --git a/.sandcastle/sandbox-runner.test.ts b/.sandcastle/sandbox-runner.test.ts index aecdaa5..5c09c51 100644 --- a/.sandcastle/sandbox-runner.test.ts +++ b/.sandcastle/sandbox-runner.test.ts @@ -8,7 +8,7 @@ */ import { test } from "node:test"; import assert from "node:assert/strict"; -import { buildAgentInput } from "./sandbox-runner.ts"; +import { buildAgentInput, isTransientAgentError } from "./sandbox-runner.ts"; const STUB_ISSUE = { number: 42, title: "Fix the bug", body: "Detailed description" }; @@ -161,3 +161,31 @@ test("local tier is never proxy-routed regardless of HEADROOM_MODE", () => { else process.env.HEADROOM_MODE = orig; } }); + +// ─── isTransientAgentError ─────────────────────────────────────────────────── + +test("isTransientAgentError: true for a network/server disconnect", () => { + assert.equal(isTransientAgentError(new Error("API Error: Server disconnected")), true); +}); + +test("isTransientAgentError: true for common connection-reset style messages", () => { + assert.equal(isTransientAgentError(new Error("read ECONNRESET")), true); + assert.equal(isTransientAgentError(new Error("connect ETIMEDOUT 1.2.3.4:443")), true); + assert.equal(isTransientAgentError(new Error("socket hang up")), true); + assert.equal(isTransientAgentError(new Error("fetch failed")), true); +}); + +test("isTransientAgentError: false for a session-limit failure — retrying now can't help", () => { + assert.equal( + isTransientAgentError(new Error("You've hit your session limit · resets 12:20pm (UTC)")), + false, + ); +}); + +test("isTransientAgentError: false for an unrelated/logic error", () => { + assert.equal(isTransientAgentError(new Error("claude-code exited with code 1: some other reason")), false); +}); + +test("isTransientAgentError: false for a non-Error thrown value", () => { + assert.equal(isTransientAgentError("just a string"), false); +}); diff --git a/.sandcastle/sandbox-runner.ts b/.sandcastle/sandbox-runner.ts index 3ac8e92..d628506 100644 --- a/.sandcastle/sandbox-runner.ts +++ b/.sandcastle/sandbox-runner.ts @@ -29,11 +29,38 @@ import { dirname, join } from "node:path"; import { readFile } from "node:fs/promises"; import { getCompressionCallback, isCompressionActive } from "./context-compressor.js"; import { fileURLToPath } from "node:url"; +import { withRetry } from "./retry.ts"; const _dir = dirname(fileURLToPath(import.meta.url)); const COMPLETION_SIGNAL = "ISSUE_COMPLETE"; +/** + * Substrings of known-transient agent/network failures — worth one quick + * retry on the SAME sandbox (cheaper than the orchestrator's own #76 + * requeue, which throws away the whole sandbox and starts a fresh one). + * Deliberately excludes session-limit / rate-limit / credit-balance style + * messages: retrying those immediately can't help (they clear on their own + * schedule, not ours), so they're left to fail fast and fall through to the + * coarser requeue-on-a-later-tick retry instead of wasting an attempt now. + */ +const TRANSIENT_AGENT_ERROR_PATTERNS = [ + "server disconnected", + "econnreset", + "etimedout", + "socket hang up", + "fetch failed", + "network error", + "econnrefused", +]; + +/** Pure predicate: does this sandbox.run() failure look worth retrying immediately? */ +export function isTransientAgentError(err: unknown): boolean { + const message = err instanceof Error ? err.message : String(err); + const lower = message.toLowerCase(); + return TRANSIENT_AGENT_ERROR_PATTERNS.some((p) => lower.includes(p)); +} + /** * Docker label applied to every inner sandbox image (via Dockerfile LABEL). * Used for label-scoped container sweeps on startup and shutdown. @@ -254,22 +281,30 @@ export class SandboxRunner { } const compressed = await compression(text); - runResult = await sandbox.run({ - agent: agentInput.agent, - name: `issue-${issue.number}`, - maxIterations: this.opts.maxIterations ?? 12, - completionSignal: COMPLETION_SIGNAL, - prompt: compressed, - }); + runResult = await withRetry( + () => + sandbox.run({ + agent: agentInput.agent, + name: `issue-${issue.number}`, + maxIterations: this.opts.maxIterations ?? 12, + completionSignal: COMPLETION_SIGNAL, + prompt: compressed, + }), + { maxAttempts: 2, baseDelayMs: 5_000, label: `issue-${issue.number} agent turn`, shouldRetry: isTransientAgentError }, + ); } else { - runResult = await sandbox.run({ - agent: agentInput.agent, - name: `issue-${issue.number}`, - maxIterations: this.opts.maxIterations ?? 12, - completionSignal: COMPLETION_SIGNAL, - promptFile: agentInput.promptFile, - promptArgs: agentInput.promptArgs, - }); + runResult = await withRetry( + () => + sandbox.run({ + agent: agentInput.agent, + name: `issue-${issue.number}`, + maxIterations: this.opts.maxIterations ?? 12, + completionSignal: COMPLETION_SIGNAL, + promptFile: agentInput.promptFile, + promptArgs: agentInput.promptArgs, + }), + { maxAttempts: 2, baseDelayMs: 5_000, label: `issue-${issue.number} agent turn`, shouldRetry: isTransientAgentError }, + ); } const result = runResult;