From 758393a2347855f4c4086774a43d06fcfca639d7 Mon Sep 17 00:00:00 2001 From: Elizabeth Worstell Date: Wed, 24 Jun 2026 15:59:56 -0700 Subject: [PATCH] fix(snapshot): make snapshot serving range-aware The git snapshot endpoint ignored client Range headers, so it always served the full object. Forward Range/If-Range to the cache on the cache-hit path, return 206/Content-Range (416 when not satisfiable), and advertise Accept-Ranges, so clients can fetch with bounded parallel range requests (client.ParallelGet) instead of one full GET. Snapshot freshen metadata (X-Cachew-Snapshot-Commit / X-Cachew-Bundle-Url) is emitted on 206 responses too, via a shared setSnapshotMetadataHeaders helper, so ranged clients still learn whether to apply a delta bundle. Co-authored-by: Amp Amp-Thread-ID: https://ampcode.com/threads/T-019ef6fe-55c6-71e8-96dc-ca3ef4301d36 --- internal/httputil/conditional.go | 17 ++++ internal/strategy/git/snapshot.go | 104 ++++++++++++++++++------ internal/strategy/git/snapshot_test.go | 106 +++++++++++++++++++++++++ 3 files changed, 202 insertions(+), 25 deletions(-) diff --git a/internal/httputil/conditional.go b/internal/httputil/conditional.go index e9bf42e..42c47a8 100644 --- a/internal/httputil/conditional.go +++ b/internal/httputil/conditional.go @@ -32,6 +32,23 @@ func ConditionalOptions(r *http.Request) []client.RequestOption { return opts } +// RangeOptions extracts only the Range/If-Range options from r, for callers +// that evaluate If-Match/If-None-Match separately (e.g. via CheckConditionals) +// and so must not double-handle them by forwarding the full set to Open. +// If-Range is only meaningful alongside Range, so it is dropped when Range is +// absent. +func RangeOptions(r *http.Request) []client.RequestOption { + v := r.Header.Get("Range") + if v == "" { + return nil + } + opts := []client.RequestOption{func(o *client.RequestOptions) { o.Range = v }} + if ir := r.Header.Get("If-Range"); ir != "" { + opts = append(opts, client.IfRange(ir)) + } + return opts +} + // CheckConditionals evaluates RFC 7232 If-Match and If-None-Match precondition // headers on r against etag. It returns 0 when all preconditions pass, // otherwise the HTTP status the caller should send: 412 Precondition Failed for diff --git a/internal/strategy/git/snapshot.go b/internal/strategy/git/snapshot.go index 7cd5c68..2901991 100644 --- a/internal/strategy/git/snapshot.go +++ b/internal/strategy/git/snapshot.go @@ -320,7 +320,24 @@ func (s *Strategy) handleSnapshotRequest(w http.ResponseWriter, r *http.Request, } s.maybeBackgroundFetch(repo) - reader, headers, err := s.cache.Open(ctx, cacheKey) + // Forward only Range/If-Range here; If-Match/If-None-Match are evaluated + // against the served body below via CheckConditionals. + reader, headers, err := s.cache.Open(ctx, cacheKey, httputil.RangeOptions(r)...) + if errors.Is(err, cache.ErrRangeNotSatisfiable) { + // A failed If-Match (412) or satisfied If-None-Match (304) takes + // precedence over an unsatisfiable range (RFC 7232 §3, RFC 7233 §3.1). + if status := httputil.CheckConditionals(r, headers.Get(cache.ETagKey)); status != 0 { + w.WriteHeader(status) + return + } + w.Header().Set("Content-Type", "application/zstd") + w.Header().Set("Accept-Ranges", "bytes") + if cr := headers.Get("Content-Range"); cr != "" { + w.Header().Set("Content-Range", cr) + } + w.WriteHeader(http.StatusRequestedRangeNotSatisfiable) + return + } if err != nil && !errors.Is(err, os.ErrNotExist) { logger.ErrorContext(ctx, "Failed to open snapshot from cache", "upstream", upstreamURL, "error", err) http.Error(w, "Internal server error", http.StatusInternalServerError) @@ -517,23 +534,42 @@ func (s *Strategy) handleBundleRequest(w http.ResponseWriter, r *http.Request, h } func (s *Strategy) serveSnapshotWithBundle(ctx context.Context, w http.ResponseWriter, r *http.Request, reader io.ReadCloser, headers http.Header, repo *gitclone.Repository, upstreamURL, repoName string, start time.Time) error { - snapshotCommit := headers.Get("X-Cachew-Snapshot-Commit") - mirrorHead := s.getMirrorHead(ctx, repo) - - // Forward the snapshot commit to the client so it knows whether the - // snapshot is fresh (no bundle URL = already at HEAD, skip freshen). - if snapshotCommit != "" { - w.Header().Set("X-Cachew-Snapshot-Commit", snapshotCommit) + // If-Match/If-None-Match are evaluated before serving any body: a satisfied + // If-None-Match (304) or a failed If-Match (412) takes precedence over a + // range response, so revalidating clients are not handed a 206 they would + // have to discard (RFC 7232 §3, RFC 7233 §3.1). + if status := httputil.CheckConditionals(r, headers.Get(cache.ETagKey)); status != 0 { + w.WriteHeader(status) + s.metrics.recordSnapshotServe(ctx, "cache", repoName, 0, time.Since(start)) + if span := trace.SpanFromContext(ctx); span.SpanContext().IsValid() { + span.SetAttributes(attribute.String("cachew.source", "cache"), attribute.Int64("cachew.bytes", 0)) + } + return nil } - if snapshotCommit != "" && mirrorHead != "" && snapshotCommit != mirrorHead { - repoPath, err := gitclone.RepoPathFromURL(upstreamURL) - if err == nil { - bundleURL := fmt.Sprintf("/git/%s/snapshot.bundle?base=%s", repoPath, snapshotCommit) - w.Header().Set("X-Cachew-Bundle-Url", bundleURL) + // A satisfied byte range is served as-is. Bundle negotiation applies only to + // whole-snapshot downloads, so a partial read (e.g. a client.ParallelGet + // chunk) skips it and returns 206 directly. + if cr := headers.Get("Content-Range"); cr != "" { + applySnapshotCacheHeaders(w, headers) + w.Header().Set("Accept-Ranges", "bytes") + w.Header().Set("Content-Range", cr) + // Ranged clients (client.ParallelGet) read the freshen metadata from the + // discovery chunk, so it must be present on partial responses too. + s.setSnapshotMetadataHeaders(ctx, w, headers, repo, upstreamURL) + w.WriteHeader(http.StatusPartialContent) + n, err := io.Copy(w, reader) + s.metrics.recordSnapshotServe(ctx, "cache_range", repoName, n, time.Since(start)) + if span := trace.SpanFromContext(ctx); span.SpanContext().IsValid() { + span.SetAttributes(attribute.String("cachew.source", "cache_range"), attribute.Int64("cachew.bytes", n)) } + return errors.Wrap(err, "stream snapshot range") + } + + snapshotCommit, bundleURL := s.setSnapshotMetadataHeaders(ctx, w, headers, repo, upstreamURL) - // Proactively generate and cache the bundle so any pod can serve it. + // Proactively generate and cache the advertised bundle so any pod can serve it. + if bundleURL != "" { go func() { bgCtx := context.WithoutCancel(ctx) logger := logging.FromContext(bgCtx) @@ -550,18 +586,9 @@ func (s *Strategy) serveSnapshotWithBundle(ctx context.Context, w http.ResponseW } applySnapshotCacheHeaders(w, headers) + w.Header().Set("Accept-Ranges", "bytes") - // Honour conditional GETs against the advertised ETag. ServeContent does this - // natively for *os.File readers, but cache backends returning non-file readers - // (S3, memory, remote) fall through to io.Copy, so revalidate explicitly to - // avoid streaming the full snapshot when the client already has it. - var n int64 - var err error - if status := httputil.CheckConditionals(r, headers.Get(cache.ETagKey)); status != 0 { - w.WriteHeader(status) - } else { - n, err = serveReaderFast(w, r, reader) - } + n, err := serveReaderFast(w, r, reader) s.metrics.recordSnapshotServe(ctx, "cache", repoName, n, time.Since(start)) if span := trace.SpanFromContext(ctx); span.SpanContext().IsValid() { span.SetAttributes(attribute.String("cachew.source", "cache"), attribute.Int64("cachew.bytes", n)) @@ -569,6 +596,33 @@ func (s *Strategy) serveSnapshotWithBundle(ctx context.Context, w http.ResponseW return errors.Wrap(err, "stream snapshot") } +// setSnapshotMetadataHeaders advertises the snapshot's commit and, when the +// snapshot trails the mirror's HEAD, the delta-bundle URL clients use to +// fast-forward. Shared by the full and ranged serve paths so ranged clients +// (client.ParallelGet) receive the same freshen metadata on the discovery +// chunk. It returns the snapshot commit and the bundle URL it set (empty when +// the snapshot is already at HEAD), so callers can decide whether to +// pre-generate the bundle. +func (s *Strategy) setSnapshotMetadataHeaders(ctx context.Context, w http.ResponseWriter, headers http.Header, repo *gitclone.Repository, upstreamURL string) (snapshotCommit, bundleURL string) { + snapshotCommit = headers.Get("X-Cachew-Snapshot-Commit") + if snapshotCommit == "" { + return "", "" + } + w.Header().Set("X-Cachew-Snapshot-Commit", snapshotCommit) + + mirrorHead := s.getMirrorHead(ctx, repo) + if mirrorHead == "" || snapshotCommit == mirrorHead { + return snapshotCommit, "" + } + repoPath, err := gitclone.RepoPathFromURL(upstreamURL) + if err != nil { + return snapshotCommit, "" + } + bundleURL = fmt.Sprintf("/git/%s/snapshot.bundle?base=%s", repoPath, snapshotCommit) + w.Header().Set("X-Cachew-Bundle-Url", bundleURL) + return snapshotCommit, bundleURL +} + // applySnapshotCacheHeaders forwards the cached snapshot's validators so clients // can revalidate (ETag) and size the transfer (Content-Length). Content-Type is // fixed for snapshots regardless of what the cache backend recorded. diff --git a/internal/strategy/git/snapshot_test.go b/internal/strategy/git/snapshot_test.go index 5977ea1..075aaaf 100644 --- a/internal/strategy/git/snapshot_test.go +++ b/internal/strategy/git/snapshot_test.go @@ -9,6 +9,7 @@ import ( "os" "os/exec" "path/filepath" + "strconv" "strings" "testing" "time" @@ -930,3 +931,108 @@ func TestSnapshotRemoteURLUsesUpstreamURL(t *testing.T) { assert.NoError(t, err, string(output)) assert.Equal(t, upstreamURL+"\n", string(output)) } + +func TestSnapshotGetHonorsRange(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not found in PATH") + } + + _, ctx := logging.Configure(context.Background(), logging.Config{}) + tmpDir := t.TempDir() + mirrorRoot := filepath.Join(tmpDir, "mirrors") + upstreamURL := "https://github.com/org/repo" + mirrorPath := filepath.Join(mirrorRoot, "github.com", "org", "repo") + createTestMirrorRepo(t, mirrorPath) + + // The memory cache returns a non-*os.File reader, so range handling must come + // from the strategy forwarding Range to Open rather than http.ServeContent. + memCache, err := cache.NewMemory(ctx, cache.MemoryConfig{MaxTTL: time.Hour}) + assert.NoError(t, err) + mux := newTestMux() + + cm := gitclone.NewManagerProvider(ctx, gitclone.Config{MirrorRoot: mirrorRoot}, nil) + s, err := git.New(ctx, git.Config{}, newTestScheduler(ctx, t), memCache, mux, cm, func() (*githubapp.TokenManager, error) { return nil, nil }) //nolint:nilnil + assert.NoError(t, err) + + manager, err := cm() + assert.NoError(t, err) + repo, err := manager.GetOrCreate(ctx, upstreamURL) + assert.NoError(t, err) + + waitForReady(t, s) + err = s.GenerateAndUploadSnapshot(ctx, repo) + assert.NoError(t, err) + + handler := mux.handlers["GET /git/{host}/{path...}"] + assert.NotZero(t, handler) + + get := func(rangeHeader string) *httptest.ResponseRecorder { + req := httptest.NewRequest(http.MethodGet, "/git/github.com/org/repo/snapshot.tar.zst", nil) + req = req.WithContext(ctx) + req.SetPathValue("host", "github.com") + req.SetPathValue("path", "org/repo/snapshot.tar.zst") + if rangeHeader != "" { + req.Header.Set("Range", rangeHeader) + } + w := httptest.NewRecorder() + handler.ServeHTTP(w, req) + return w + } + + // Full GET advertises range support and yields the whole body. + full := get("") + assert.Equal(t, 200, full.Code) + assert.Equal(t, "bytes", full.Header().Get("Accept-Ranges")) + body := full.Body.Bytes() + assert.True(t, len(body) > 4, "snapshot body should be larger than the test range") + commit := full.Header().Get("X-Cachew-Snapshot-Commit") + assert.NotZero(t, commit, "full GET should advertise the snapshot commit") + + // A satisfiable range returns 206 with the matching bytes and Content-Range. + partial := get("bytes=0-3") + assert.Equal(t, http.StatusPartialContent, partial.Code) + assert.Equal(t, "bytes", partial.Header().Get("Accept-Ranges")) + assert.Equal(t, "bytes 0-3/"+strconv.Itoa(len(body)), partial.Header().Get("Content-Range")) + assert.Equal(t, body[:4], partial.Body.Bytes()) + // Ranged clients must receive the same freshen metadata as a full GET so + // they can apply a delta bundle after a parallel download. + assert.Equal(t, commit, partial.Header().Get("X-Cachew-Snapshot-Commit")) + + // A range beyond the object is not satisfiable. + tooBig := get("bytes=" + strconv.Itoa(len(body)+10) + "-" + strconv.Itoa(len(body)+20)) + assert.Equal(t, http.StatusRequestedRangeNotSatisfiable, tooBig.Code) + assert.Equal(t, "bytes */"+strconv.Itoa(len(body)), tooBig.Header().Get("Content-Range")) + + etag := full.Header().Get("ETag") + assert.NotZero(t, etag, "snapshot should advertise an ETag") + + getCond := func(rangeHeader string, condHeaders map[string]string) *httptest.ResponseRecorder { + req := httptest.NewRequest(http.MethodGet, "/git/github.com/org/repo/snapshot.tar.zst", nil) + req = req.WithContext(ctx) + req.SetPathValue("host", "github.com") + req.SetPathValue("path", "org/repo/snapshot.tar.zst") + req.Header.Set("Range", rangeHeader) + for k, v := range condHeaders { + req.Header.Set(k, v) + } + w := httptest.NewRecorder() + handler.ServeHTTP(w, req) + return w + } + + // Conditional validators take precedence over the range: a matching + // If-None-Match revalidates to 304 and a stale If-Match fails with 412, + // rather than returning a 206 body the client would discard. + notModified := getCond("bytes=0-3", map[string]string{"If-None-Match": etag}) + assert.Equal(t, http.StatusNotModified, notModified.Code) + assert.Equal(t, 0, notModified.Body.Len()) + + preconditionFailed := getCond("bytes=0-3", map[string]string{"If-Match": `"stale-etag"`}) + assert.Equal(t, http.StatusPreconditionFailed, preconditionFailed.Code) + + // An unsatisfiable range with a stale If-Match is a precondition failure + // (412), not a 416. + beyond := strconv.Itoa(len(body)+10) + "-" + strconv.Itoa(len(body)+20) + rangeWithStaleIfMatch := getCond("bytes="+beyond, map[string]string{"If-Match": `"stale-etag"`}) + assert.Equal(t, http.StatusPreconditionFailed, rangeWithStaleIfMatch.Code) +}