diff --git a/Cargo.lock b/Cargo.lock index ab20718..74f50ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2420,6 +2420,7 @@ dependencies = [ "base64", "clap", "dialoguer", + "flate2", "fs2", "hex", "indicatif", @@ -2431,6 +2432,7 @@ dependencies = [ "serial_test", "sha2", "socket-patch-core", + "tar", "tempfile", "testcontainers", "tokio", diff --git a/README.md b/README.md index 72809b5..19fec6a 100644 --- a/README.md +++ b/README.md @@ -233,7 +233,10 @@ socket-patch scan -g # Scan + apply + emit an OpenVEX attestation in one pass socket-patch scan --json --sync --yes --vex socket.vex.json -# Vendor every patched dependency (committable; see the vendor command) +# Vendor every patched dependency (committable; see the vendor command). +# Works on a completely fresh clone: dependencies listed in the lockfile +# but not yet installed are fetched pristine from their registry and +# integrity-verified against the lockfile before vendoring. socket-patch scan --json --vendor --yes # Same, but keep the manifest out of it entirely diff --git a/crates/socket-patch-cli/CLI_CONTRACT.md b/crates/socket-patch-cli/CLI_CONTRACT.md index de9ef89..48103b0 100644 --- a/crates/socket-patch-cli/CLI_CONTRACT.md +++ b/crates/socket-patch-cli/CLI_CONTRACT.md @@ -75,6 +75,10 @@ Beyond the globals above, each subcommand defines a small set of local arguments `scan` queries the patch API in `--batch-size` chunks. Authenticated runs POST `/v0/orgs/{slug}/patches/batch`; token-less runs POST `{proxy}/patch/batch` on the public proxy and degrade to per-package `GET /patch/by-package/:purl` requests in two cases: the deployed proxy predates the batch endpoint (legacy proxies answer the POST with their `400 "Unsupported endpoint"` catch-all), or the all-or-nothing batch validation rejects the chunk (e.g. a crawled PURL type the server doesn't recognize, such as `pkg:jsr/…` — the per-package path tolerates those individually, preserving the pre-batch scan semantics). 
Rate limits and over-capacity 503s surface instead of silently degrading. +**Lockfile supplement (v3.4)**: `scan` discovery is no longer limited to installed trees. The project's lockfiles (`package-lock.json`/`npm-shrinkwrap.json`, `pnpm-lock.yaml` v9, `yarn.lock` classic + berry, `bun.lock`, `Cargo.lock`, `go.sum`, `composer.lock`, `Gemfile.lock`, `uv.lock`/`poetry.lock`/pinned `requirements.txt`) are inventoried and dependencies with NO installed copy join discovery — counts, the API lookup, the table (flagged ` [NOT INSTALLED]`, plus a stderr note), and the prune "scanned" set (a wiped node_modules no longer prunes lockfile-listed entries). JSON gains a top-level `lockfileOnlyPackages` count and an additive `notInstalled: true` on matching `packages[]` entries. `--apply` partitions lockfile-only patches out BEFORE download (calm `skipped`/`package_not_installed` records — never an error exit, never a manifest write); `--vendor` passes them through to the vendor engine's auto-fetch. Vendored-ledger entries likewise stay discoverable on a fresh clone (the committed artifact is the dependency). Global scans (`--global`) get no supplement. + +**Vendor auto-fetch (v3.4)**: `vendor`/`scan --vendor` no longer fail on lockfile-resolved packages with no installed copy. Already-vendored purls stage from their committed artifact (sha256-verified against the vendor ledger; offline-safe). Otherwise the pristine artifact is fetched per the lockfile resolution and verified against the lock's recorded integrity FAIL-CLOSED before any write: npm SRI (or yarn classic's sha1 fragment), yarn berry's cache-zip checksum (rebuilt from the fetched tarball; cacheKey 10c0 only), Cargo.lock sha256 over the .crate, go.sum `h1:` dirhash over the module zip, composer `dist.shasum` (sha1), Gemfile.lock `CHECKSUMS` sha256, uv.lock wheel sha256 (pure `py3-none-any` wheels only). 
Entries the lock cannot verify are NEVER fetched (`vendor_fetch_unverifiable` warning + the calm `package_not_installed` skip). Registry bases honor `SOCKET_NPM_REGISTRY`, `SOCKET_CRATES_REGISTRY`, `SOCKET_GOPROXY` (else `GOPROXY`); npm/yarn/composer/gem/uv lock-recorded URLs are used verbatim. `--offline` refuses the fetch with the calm skip (the detail names the lockfile resolution). The fetch stages into a private tempdir — the project tree is never touched. + `scan --sync` is sugar for `--apply --prune` — the canonical single-flag bot invocation. `scan --json --sync --yes` discovers, applies, and reconciles state in one pass. `scan --vendor` swaps the in-place apply for the vendor pipeline: discover → download (manifest written, as `--apply`) → vendor every patched dependency via the same engine as the `vendor` command (under the same lock). The whole manifest is vendored, so a package vendored at an older patch uuid is **re-vendored automatically** (its old uuid dir is removed — `vendor_stale_artifact_removed`); same-uuid re-runs are `already_vendored` skips. With `--prune`, GC runs **before** the vendor step so stale manifest entries don't fail vendoring with `package_not_installed`. JSON output gains a `download` sub-object (the download phase; no `applied` field — nothing is applied in place) and a `vendor` sub-object (a full vendor Envelope). `--dry-run` previews per-patch `would_vendor` | `would_revendor` (+`oldUuid`) | `already_vendored` without network downloads or disk writes. Interactive mode prompts "Download and vendor N patch(es)?". @@ -604,6 +608,9 @@ Every `--json` invocation emits a single JSON object that follows the **unified | `vendor_override_conflict` | `failed` | vendor (pnpm/yarn-berry): a user-authored override/resolution for the package already exists. | | `vendor_integrity_unverified` | `skipped` (warning) | vendor (pipenv): the lockfile format does not hash-check file entries; the committed wheel bytes are the protection. 
| | `vendor_content_mismatch_overwritten` | `skipped` (warning) | vendor: a staged file matched NEITHER beforeHash nor afterHash (patch built against different bytes, or local edits); the stage was overwritten with the verified patched content and the vendor succeeded. | +| `vendor_fetched_missing` | `skipped` (warning) | vendor: the package was not installed; its pristine artifact was fetched per the lockfile resolution (or staged from the committed vendor artifact), integrity-verified, and vendored — the project tree was not touched. | +| `vendor_fetch_failed` | `failed` | vendor: the lockfile-resolved fetch was attempted and failed (HTTP error, size cap, integrity mismatch, or a corrupt committed artifact). Suppresses the duplicate `package_not_installed` skip. | +| `vendor_fetch_unverifiable` | `skipped` (warning) | vendor: the lockfile records no usable integrity for the missing package; nothing was fetched (fail-closed) and the `package_not_installed` skip follows. | | `vendor_lock_checksums_unsupported` / `vendor_stale_lock_checksum` | `failed` | vendor (gem): an ambiguous/platform CHECKSUMS entry, or a v1-wired lock whose stale token blocks the hot path (run `vendor --revert` + re-vendor). | | `pypi_{poetry,pdm,pipenv}_no_lockfile` | `failed` | vendor (pypi): a lock-less tool marker with no `requirements.txt` fallback — run `<tool> lock`. | | `vendor_*` / `pypi_*` / `gemfile_*` / `lock_*` / `locked_version_mismatch` / `user_authored_*` / `native_extensions_unsupported` / `platform_gem_unsupported` | `failed`/`skipped` | vendor: per-ecosystem refusal + drift vocabulary; see the Vendor command contract section. New tags are additive (MINOR). 
| diff --git a/crates/socket-patch-cli/Cargo.toml b/crates/socket-patch-cli/Cargo.toml index cedba95..e983539 100644 --- a/crates/socket-patch-cli/Cargo.toml +++ b/crates/socket-patch-cli/Cargo.toml @@ -59,6 +59,9 @@ setup-e2e = [] [dev-dependencies] sha2 = { workspace = true } +# scan_vendor_e2e builds pristine registry tarballs for the auto-fetch tests. +tar = { workspace = true } +flate2 = { workspace = true } hex = { workspace = true } wiremock = { workspace = true } portable-pty = { workspace = true } diff --git a/crates/socket-patch-cli/src/commands/scan.rs b/crates/socket-patch-cli/src/commands/scan.rs index e9ebd7f..06a97b0 100644 --- a/crates/socket-patch-cli/src/commands/scan.rs +++ b/crates/socket-patch-cli/src/commands/scan.rs @@ -281,6 +281,113 @@ pub(crate) fn detect_prunable( .collect() } +/// Lockfile-only packages: dependencies the project's lockfile resolves +/// that have no crawled (installed) counterpart. +#[derive(Default)] +struct LockfileSupplement { + packages: Vec, + /// Literal crawler-form purls, for fast membership tests. + purls: HashSet, + /// The lockfile the entries came from, for messages. + source: &'static str, +} + +/// Inventory the project's lockfile(s) and fabricate crawl entries for +/// dependencies that are not installed. The fabricated `path` is the +/// WOULD-BE install dir — every consumer degrades safely on a nonexistent +/// path (hash verify → NotFound, apply → partitioned skip, vendor → +/// auto-fetch). Global scans target the machine's global tree, not this +/// project's lockfile, so they get no supplement. 
+async fn lockfile_supplement( + common: &GlobalArgs, + crawled: &[socket_patch_core::crawlers::types::CrawledPackage], +) -> LockfileSupplement { + use socket_patch_core::patch::vendor::lock_inventory; + + let mut out = LockfileSupplement { + source: "project lockfiles", + ..Default::default() + }; + if common.global || common.global_prefix.is_some() { + return out; + } + let entries = lock_inventory::inventory_project(&common.cwd).await; + if entries.is_empty() { + return out; + } + let crawled_purls: HashSet<&str> = crawled.iter().map(|p| p.purl.as_str()).collect(); + for entry in entries { + if crawled_purls.contains(entry.purl.as_str()) { + continue; + } + let Some(pkg) = crawled_from_purl(&entry.purl, &common.cwd) else { + continue; + }; + out.purls.insert(entry.purl.clone()); + out.packages.push(pkg); + } + out +} + +/// A displayable crawl entry fabricated from a purl (decoded form). The +/// path is a placeholder consumers degrade safely on. +fn crawled_from_purl( + purl: &str, + cwd: &std::path::Path, +) -> Option { + let decoded = normalize_purl(strip_purl_qualifiers(purl)).into_owned(); + let rest = decoded.strip_prefix("pkg:")?; + let (_eco, rest) = rest.split_once('/')?; + let at = rest.rfind('@').filter(|&i| i > 0)?; + let (name_part, version) = (&rest[..at], &rest[at + 1..]); + let (namespace, name) = match name_part.rsplit_once('/') { + Some((ns, n)) => (Some(ns.to_string()), n.to_string()), + None => (None, name_part.to_string()), + }; + Some(socket_patch_core::crawlers::types::CrawledPackage { + name, + version: version.to_string(), + namespace, + purl: decoded.clone(), + path: cwd.join("node_modules").join(name_part), + }) +} + +/// Vendored-ledger packages with no crawled counterpart: on a fresh clone +/// the committed artifact IS the dependency, so these stay discoverable +/// (updates[] detection, the table, and `scan --vendor` re-vendor/in-sync +/// runs all keep working before any install). 
They are NOT "lockfile-only" +/// — nothing needs installing; the artifact satisfies the lock. +async fn vendored_ledger_supplement( + common: &GlobalArgs, + crawled: &[socket_patch_core::crawlers::types::CrawledPackage], +) -> Vec { + if common.global || common.global_prefix.is_some() { + return Vec::new(); + } + let Ok(state) = socket_patch_core::patch::vendor::load_state(&common.cwd).await else { + return Vec::new(); + }; + let crawled_norm: HashSet = crawled + .iter() + .map(|p| normalize_purl(&p.purl).into_owned()) + .collect(); + let mut seen: HashSet = HashSet::new(); + let mut out = Vec::new(); + for entry in state.entries.values() { + let base = strip_purl_qualifiers(&entry.base_purl); + let norm = normalize_purl(base).into_owned(); + if crawled_norm.contains(&norm) || !seen.insert(norm) { + continue; + } + if let Some(pkg) = crawled_from_purl(base, &common.cwd) { + out.push(pkg); + } + } + out.sort_by(|a, b| a.purl.cmp(&b.purl)); + out +} + /// Vendor-mode pre-prompt check: uuids of selected patches whose installed /// files match NEITHER beforeHash nor afterHash — the patch was built /// against different bytes than the installed artifact. Vendoring still @@ -296,6 +403,7 @@ async fn preverify_vendor_baselines( org_slug: Option<&str>, selected: &[PatchSearchResult], crawled: &[socket_patch_core::crawlers::types::CrawledPackage], + lockfile_only: &HashSet, ) -> HashSet { use socket_patch_core::manifest::schema::PatchFileInfo; use socket_patch_core::patch::apply::{verify_file_patch, VerifyStatus}; @@ -306,6 +414,11 @@ async fn preverify_vendor_baselines( // API purls come percent-encoded, crawler purls literal — purl_eq // bridges the two spellings. let base = strip_purl_qualifiers(&patch.purl); + // Lockfile-only packages have no installed bytes to compare — the + // vendor engine fetches them pristine (nothing to annotate). 
+ if lockfile_only.contains(normalize_purl(base).as_ref()) { + continue; + } let Some(pkg) = crawled.iter().find(|c| purl_eq(&c.purl, base)) else { continue; }; @@ -951,6 +1064,39 @@ fn partition_vendored_selected( (kept, vendored_records) } +/// Lockfile-only patches are skipped BEFORE download in apply mode: the +/// package is not on disk to patch in place, and downloading its patch +/// into the manifest would create a not-yet-appliable entry (and flip the +/// apply path's exit code). `scan --vendor` is the route that handles them +/// (the vendor engine auto-fetches lockfile-resolved packages). Matching +/// bridges API purl encoding via `normalize_purl`. Same shape/mechanics as +/// [`partition_vendored_selected`]. +fn partition_not_installed_selected( + selected: Vec, + lockfile_only: &HashSet, +) -> (Vec, Vec) { + if lockfile_only.is_empty() { + return (selected, Vec::new()); + } + let is_lockfile_only = |p: &str| { + lockfile_only.contains(normalize_purl(strip_purl_qualifiers(p)).as_ref()) + }; + let (not_installed, kept): (Vec<_>, Vec<_>) = selected + .into_iter() + .partition(|p| is_lockfile_only(&p.purl)); + let mut records: Vec = not_installed + .iter() + .map(|p| { + serde_json::json!({ + "purl": p.purl, "uuid": p.uuid, + "action": "skipped", "errorCode": "package_not_installed", + }) + }) + .collect(); + records.sort_by(|a, b| a["purl"].as_str().cmp(&b["purl"].as_str())); + (kept, records) +} + /// Fold the pre-download vendored skips into the apply report returned by /// `download_and_apply_patches`: they were "found" by discovery and /// skipped here, never downloaded. 
Also strips the inner `status` (scan @@ -1184,7 +1330,28 @@ pub async fn run(args: ScanArgs) -> i32 { } // Crawl packages - let (all_crawled, eco_counts) = crawl_all_ecosystems(&crawler_options).await; + let (mut all_crawled, mut eco_counts) = crawl_all_ecosystems(&crawler_options).await; + + // Lockfile supplement: dependencies the project's lockfile resolves + // that have NO installed copy (fresh clone, partial install). They join + // discovery — counts, API lookup, table, the prune "scanned" set — and + // are flagged "not yet installed" everywhere a user could act on them. + let lockfile_only = lockfile_supplement(&args.common, &all_crawled).await; + if !lockfile_only.packages.is_empty() { + for pkg in &lockfile_only.packages { + if let Some(eco) = Ecosystem::from_purl(&pkg.purl) { + *eco_counts.entry(eco).or_insert(0) += 1; + } + } + all_crawled.extend(lockfile_only.packages.iter().cloned()); + } + let ledger_supplement = vendored_ledger_supplement(&args.common, &all_crawled).await; + for pkg in &ledger_supplement { + if let Some(eco) = Ecosystem::from_purl(&pkg.purl) { + *eco_counts.entry(eco).or_insert(0) += 1; + } + } + all_crawled.extend(ledger_supplement); // Every PURL the crawl found, captured BEFORE the `--ecosystems` // display/query filter is applied. Prune (below) must reference the @@ -1193,6 +1360,9 @@ pub async fn run(args: ScanArgs) -> i32 { // prune used the filtered set instead, `scan --ecosystems npm --prune` // would treat every cargo/go/pypi/gem manifest entry as "uninstalled" // and delete it (plus its blobs) — silent cross-ecosystem data loss. + // Lockfile-only purls are deliberately included: a dependency the + // lockfile still resolves must not be pruned just because node_modules + // is wiped or partially installed. 
let installed_purls: HashSet = all_crawled.iter().map(|p| p.purl.clone()).collect(); // Vendor-ledger purl keys, loaded once and shared by the prune @@ -1250,6 +1420,7 @@ pub async fn run(args: ScanArgs) -> i32 { let mut result = serde_json::json!({ "status": "success", "scannedPackages": 0, + "lockfileOnlyPackages": 0, "packagesWithPatches": 0, "totalPatches": 0, "freePatches": 0, @@ -1310,6 +1481,13 @@ pub async fn run(args: ScanArgs) -> i32 { } else { eprintln!("Found {package_count} packages{eco_summary}"); } + if !lockfile_only.purls.is_empty() { + eprintln!( + "Note: {} package(s) from {} are not yet installed (lockfile-only).", + lockfile_only.purls.len(), + lockfile_only.source, + ); + } } // Query API in batches @@ -1495,6 +1673,7 @@ pub async fn run(args: ScanArgs) -> i32 { let mut result = serde_json::json!({ "status": "success", "scannedPackages": package_count, + "lockfileOnlyPackages": lockfile_only.purls.len(), "packagesWithPatches": all_packages_with_patches.len(), "totalPatches": total_patches, "freePatches": free_patches, @@ -1507,6 +1686,19 @@ pub async fn run(args: ScanArgs) -> i32 { "newUuid": u.new_uuid, })).collect::>(), }); + // Flag lockfile-only packages so JSON consumers can tell "patch + // available but not installed" from the installed case. Additive + // field; absent means installed. + if let Some(packages) = result["packages"].as_array_mut() { + for pkg in packages { + let is_lockfile_only = pkg["purl"] + .as_str() + .is_some_and(|p| lockfile_only.purls.contains(p)); + if is_lockfile_only { + pkg["notInstalled"] = serde_json::json!(true); + } + } + } // `apply` and `prune` are computed once at the top of run() // (factoring in --sync, which implies both). They're independent @@ -1549,6 +1741,17 @@ pub async fn run(args: ScanArgs) -> i32 { // operator's signal to run `scan --vendor` (or `vendor`). 
let (selected, vendored_records) = partition_vendored_selected(selected, &vendored_purls); + // Lockfile-only purls leave the apply selection here (calm + // skip records, never an error); the union rides the same + // bookkeeping as the vendored skips. + let (selected, vendored_records) = { + let (kept, not_installed) = + partition_not_installed_selected(selected, &lockfile_only.purls); + let mut all = vendored_records; + all.extend(not_installed); + all.sort_by(|a, b| a["purl"].as_str().cmp(&b["purl"].as_str())); + (kept, all) + }; let mut apply_code = 0i32; if dry { @@ -1791,14 +1994,22 @@ pub async fn run(args: ScanArgs) -> i32 { } else { String::new() }; + // Lockfile-only packages can be patched by `scan --vendor` + // (which fetches them pristine) but not applied in place. + let not_installed_marker = if lockfile_only.purls.contains(pkg.purl.as_str()) { + color(" [NOT INSTALLED]", "33", use_color) + } else { + String::new() + }; println!( - "{:<40} {:>8} {:<16} {}{}", + "{:<40} {:>8} {:<16} {}{}{}", display_purl, count_str, format_severity(severity, use_color), vuln_str, update_marker, + not_installed_marker, ); } @@ -1930,6 +2141,29 @@ pub async fn run(args: ScanArgs) -> i32 { } } + // Lockfile-only purls leave the in-place apply selection (calm skip, + // mirrors the JSON path). In `--vendor` mode they stay: the vendor + // engine fetches lockfile-resolved packages pristine. 
+ let (selected, not_installed_selected): (Vec<_>, Vec) = if args.vendor { + (selected, Vec::new()) + } else { + let (kept, skipped) = partition_not_installed_selected(selected, &lockfile_only.purls); + let printed: Vec = skipped + .iter() + .filter_map(|r| r["purl"].as_str().map(str::to_string)) + .collect(); + (kept, printed) + }; + if !args.common.silent { + for purl in ¬_installed_selected { + println!( + " [skip] {} (not installed — run your package manager's install first, \ + or `scan --vendor` to vendor it from the lockfile)", + normalize_purl(purl) + ); + } + } + if selected.is_empty() && !args.vendor { if !args.common.silent { println!("No patches selected."); @@ -1946,6 +2180,7 @@ pub async fn run(args: ScanArgs) -> i32 { effective_org_slug, &selected, &filtered_crawled, + &lockfile_only.purls, ) .await } else { diff --git a/crates/socket-patch-cli/src/commands/vendor.rs b/crates/socket-patch-cli/src/commands/vendor.rs index d032439..0ca3037 100644 --- a/crates/socket-patch-cli/src/commands/vendor.rs +++ b/crates/socket-patch-cli/src/commands/vendor.rs @@ -525,13 +525,137 @@ pub(crate) async fn vendor_records( global_prefix: common.global_prefix.clone(), batch_size: 100, }; - let all_packages = find_packages_for_purls( + let mut all_packages = find_packages_for_purls( &vendorable_partition, &crawler_options, common.silent || common.json, ) .await; + // ── Auto-fetch: lockfile-resolved packages with no installed copy ──── + // A manifest patch whose package is not on disk but IS resolvable from + // the project's lockfile is fetched pristine from its registry (lock- + // recorded URL else the conventional one), verified against the lock's + // integrity FAIL-CLOSED, and staged from a private tempdir — the + // project tree is never touched, and the lock wiring works without an + // installed copy (it keys off lock entries). The holders keep the + // tempdirs alive until the dispatch loop below has staged from them. 
+ let mut fetched_holders: Vec = + Vec::new(); + // Fetch failures must keep their distinct Failed event; this set + // suppresses the later duplicate `package_not_installed` skip. + let mut fetch_failed: HashSet = HashSet::new(); + { + use socket_patch_core::patch::vendor::{lock_inventory, registry_fetch}; + let missing: Vec = vendorable + .iter() + .filter(|p| !all_packages.contains_key(*p)) + .cloned() + .collect(); + if !missing.is_empty() { + // The inventory is a local file read — fine offline; only the + // fetch itself needs the network. + let inventory = lock_inventory::inventory_project(&common.cwd).await; + let client = registry_fetch::build_registry_client(); + // Pre-loaded vendor ledger for the artifact-staging path: an + // already-vendored purl with no installed copy (fresh clone) + // stages from its own committed artifact, sha256-verified + // against the ledger — offline-safe, no registry traffic. + let ledger = load_state(&common.cwd).await.unwrap_or_default(); + for purl in &missing { + if let Some(entry) = ledger + .entries + .get(purl) + .or_else(|| ledger.entries.values().find(|e| &e.base_purl == purl)) + .filter(|e| e.ecosystem == "npm" && e.artifact.path.ends_with(".tgz")) + { + let tgz = common.cwd.join(&entry.artifact.path); + match registry_fetch::stage_local_artifact(&tgz, &entry.artifact.sha256) + .await + { + Ok(staged) => { + all_packages.insert(purl.clone(), staged.dir().to_path_buf()); + fetched_holders.push(staged); + continue; + } + Err(registry_fetch::FetchError::Failed(detail)) => { + // A corrupt committed artifact is worth a loud + // failure — re-vendoring over it would mask the + // corruption. 
+ fetch_failed.insert(purl.clone()); + env.record( + PatchEvent::new(PatchAction::Failed, purl.clone()) + .with_error("vendor_fetch_failed", detail.clone()), + ); + if !common.silent && !common.json { + eprintln!( + "Cannot vendor {}: {detail}", + normalize_purl(purl) + ); + } + continue; + } + Err(registry_fetch::FetchError::Unverifiable(_)) => { + // No recorded hash (legacy ledger) — fall + // through to the lockfile/registry path. + } + } + } + let Some(entry) = lock_inventory::lookup(&inventory, purl) else { + continue; // not lockfile-resolvable → package_not_installed + }; + if common.offline { + // The enriched skip detail lands below in the unmatched + // pass (the purl stays unmatched). + continue; + } + match registry_fetch::fetch_and_stage(entry, &client).await { + Ok(fetched) => { + record_warning( + env, + purl, + &VendorWarning::new( + "vendor_fetched_missing", + format!( + "{}@{} is not installed; fetched the pristine artifact \ + from {} (integrity verified against the lockfile) and \ + vendored from that copy — the project tree was not \ + touched", + entry.name, entry.version, fetched.url + ), + ), + common, + ); + all_packages.insert(purl.clone(), fetched.dir().to_path_buf()); + fetched_holders.push(fetched); + } + Err(registry_fetch::FetchError::Unverifiable(detail)) => { + record_warning( + env, + purl, + &VendorWarning::new("vendor_fetch_unverifiable", detail), + common, + ); + // Falls through to package_not_installed below. 
+ } + Err(registry_fetch::FetchError::Failed(detail)) => { + fetch_failed.insert(purl.clone()); + env.record( + PatchEvent::new(PatchAction::Failed, purl.clone()) + .with_error("vendor_fetch_failed", detail.clone()), + ); + if !common.silent && !common.json { + eprintln!( + "Cannot vendor {}: fetch failed: {detail}", + normalize_purl(purl) + ); + } + } + } + } + } + } + let vendored_at = now_rfc3339(); let mut state = match load_state(&common.cwd).await { Ok(s) => s, @@ -763,10 +887,10 @@ pub(crate) async fn vendor_records( } // Manifest entries that targeted in-scope ecosystems but had no - // installed package on disk. + // installed package on disk (and could not be auto-fetched). let mut unmatched: Vec = vendorable .iter() - .filter(|p| !matched.contains(*p)) + .filter(|p| !matched.contains(*p) && !fetch_failed.contains(*p)) .cloned() .collect(); unmatched.sort(); @@ -776,15 +900,39 @@ pub(crate) async fn vendor_records( .map(|p| strip_purl_qualifiers(p).to_string()) .collect(); unmatched.retain(|p| !vendored_bases.contains(strip_purl_qualifiers(p))); + has_errors |= !fetch_failed.is_empty(); if !unmatched.is_empty() { has_errors = true; + // Offline runs name the packages the lockfile COULD have fetched — + // the inventory is a local file read, allowed offline. 
+ let lock_resolvable: HashSet = if common.offline { + let entries = + socket_patch_core::patch::vendor::lock_inventory::inventory_project(&common.cwd) + .await; + unmatched + .iter() + .filter(|p| { + socket_patch_core::patch::vendor::lock_inventory::lookup(&entries, p) + .is_some() + }) + .cloned() + .collect() + } else { + HashSet::new() + }; for purl in &unmatched { + let detail = if lock_resolvable.contains(purl) { + "no installed package found; --offline prevents fetching it from the \ + registry (the lockfile resolves it)" + } else { + "no installed package found" + }; env.record( PatchEvent::new(PatchAction::Skipped, purl.clone()) - .with_reason("package_not_installed", "no installed package found"), + .with_reason("package_not_installed", detail), ); if !common.silent && !common.json { - eprintln!("Cannot vendor {}: package not installed", normalize_purl(purl)); + eprintln!("Cannot vendor {}: {detail}", normalize_purl(purl)); } } } diff --git a/crates/socket-patch-cli/tests/cli_parse_scan.rs b/crates/socket-patch-cli/tests/cli_parse_scan.rs index 359994f..b961eb4 100644 --- a/crates/socket-patch-cli/tests/cli_parse_scan.rs +++ b/crates/socket-patch-cli/tests/cli_parse_scan.rs @@ -523,6 +523,7 @@ fn scan_json_empty_cwd_emits_updates_key() { let expected = serde_json::json!({ "status": "success", "scannedPackages": 0, + "lockfileOnlyPackages": 0, "packagesWithPatches": 0, "totalPatches": 0, "freePatches": 0, diff --git a/crates/socket-patch-cli/tests/e2e_vendor_cargo_build.rs b/crates/socket-patch-cli/tests/e2e_vendor_cargo_build.rs index 12e1b1a..28c118d 100644 --- a/crates/socket-patch-cli/tests/e2e_vendor_cargo_build.rs +++ b/crates/socket-patch-cli/tests/e2e_vendor_cargo_build.rs @@ -81,6 +81,10 @@ fn cargo(cwd: &Path, args: &[&str], cargo_home: &Path) -> Output { .args(args) .current_dir(cwd) .env("CARGO_HOME", cargo_home) + // The assertions read `/target/debug/...`; an ambient + // CARGO_TARGET_DIR (shared-build-cache setups) would redirect the + // 
child build elsewhere and break them. + .env_remove("CARGO_TARGET_DIR") .output() .expect("failed to run cargo") } diff --git a/crates/socket-patch-cli/tests/scan_vendor_e2e.rs b/crates/socket-patch-cli/tests/scan_vendor_e2e.rs index 64f0b28..d3e8cb5 100644 --- a/crates/socket-patch-cli/tests/scan_vendor_e2e.rs +++ b/crates/socket-patch-cli/tests/scan_vendor_e2e.rs @@ -737,3 +737,408 @@ async fn scan_vendor_annotates_mismatched_baseline_and_vendors_anyway() { .join(format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz")) .is_file()); } + +// ───────────── lockfile auto-fetch + scan lockfile supplement ───────────── + +/// sha512 SRI of the given bytes (what an npm-family lock records). +fn sri_of(bytes: &[u8]) -> String { + use base64::Engine as _; + use sha2::Sha512; + format!( + "sha512-{}", + base64::engine::general_purpose::STANDARD.encode(Sha512::digest(bytes)) + ) +} + +/// A pristine registry tarball for left-pad@1.3.0 whose index.js carries +/// the patch's BEFORE bytes. +fn pristine_tgz() -> Vec { + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + for (path, bytes) in [ + ( + "package/package.json", + br#"{"name":"left-pad","version":"1.3.0"}"#.as_slice(), + ), + ("package/index.js", BEFORE), + ] { + let mut header = tar::Header::new_gnu(); + header.set_size(bytes.len() as u64); + header.set_mode(0o644); + header.set_cksum(); + builder.append_data(&mut header, path, bytes).unwrap(); + } + builder.into_inner().unwrap().finish().unwrap() +} + +/// Project fixture with a lockfile but NO node_modules: package.json + +/// package-lock.json whose left-pad entry resolves to `resolved_url` with +/// `integrity`. 
+fn write_lockfile_only_fixture(root: &Path, resolved_url: &str, integrity: &str) { + std::fs::write( + root.join("package.json"), + r#"{ "name": "scan-vendor-test", "version": "0.0.0", "dependencies": { "left-pad": "^1.3.0" } }"#, + ) + .unwrap(); + let lock = serde_json::json!({ + "name": "scan-vendor-test", + "version": "0.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "scan-vendor-test", + "version": "0.0.0", + "dependencies": { "left-pad": "^1.3.0" } + }, + "node_modules/left-pad": { + "version": "1.3.0", + "resolved": resolved_url, + "integrity": integrity, + "license": "WTFPL" + } + } + }); + let mut lock_bytes = serde_json::to_vec_pretty(&lock).unwrap(); + lock_bytes.push(b'\n'); + std::fs::write(root.join("package-lock.json"), lock_bytes).unwrap(); +} + +/// Pre-seed `.socket/manifest.json` + the after-blob so a standalone +/// `vendor` run has local patch sources (no patch-API traffic). +fn seed_manifest_and_blob(root: &Path) { + let socket = root.join(".socket"); + std::fs::create_dir_all(socket.join("blobs")).unwrap(); + let manifest = serde_json::json!({ + "patches": { + PURL: { + "uuid": UUID, + "exportedAt": "2026-01-01T00:00:00Z", + "files": { + "package/index.js": { + "beforeHash": git_sha256(BEFORE), + "afterHash": git_sha256(AFTER), + } + }, + "vulnerabilities": {}, + "description": "synthetic", + "license": "MIT", + "tier": "free" + } + } + }); + std::fs::write( + socket.join("manifest.json"), + serde_json::to_vec_pretty(&manifest).unwrap(), + ) + .unwrap(); + std::fs::write(socket.join("blobs").join(git_sha256(AFTER)), AFTER).unwrap(); +} + +async fn mount_registry_tarball(mock: &MockServer, tgz: Vec) { + Mock::given(method("GET")) + .and(path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz)) + .mount(mock) + .await; +} + +fn run_vendor(root: &Path, extra: &[&str]) -> (i32, serde_json::Value, String) { + let mut argv = vec!["vendor", "--json"]; + 
argv.extend_from_slice(extra); + let out = Command::new(binary()) + .args(&argv) + .current_dir(root) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .output() + .expect("run vendor"); + let stdout = String::from_utf8_lossy(&out.stdout).into_owned(); + let stderr = String::from_utf8_lossy(&out.stderr).into_owned(); + let v: serde_json::Value = serde_json::from_str(stdout.trim()) + .unwrap_or_else(|e| panic!("vendor --json must emit JSON: {e}\n{stdout}\n{stderr}")); + (out.status.code().unwrap_or(-1), v, stderr) +} + +/// A manifest patch whose package is NOT installed but IS lockfile-resolved +/// is fetched pristine from the registry (integrity-verified against the +/// lock) and vendored — node_modules never appears. +#[tokio::test] +async fn vendor_auto_fetches_missing_package_from_lockfile() { + let mock = MockServer::start().await; + let tgz = pristine_tgz(); + let integrity = sri_of(&tgz); + mount_registry_tarball(&mock, tgz).await; + + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + &format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri()), + &integrity, + ); + seed_manifest_and_blob(tmp.path()); + + let (code, v, _) = run_vendor(tmp.path(), &[]); + assert_eq!(code, 0, "{v:#}"); + let events = v["events"].as_array().unwrap(); + assert!( + events.iter().any(|e| e["action"] == "applied" && e["purl"] == PURL), + "{v:#}" + ); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "vendor_fetched_missing"), + "fetch surfaced as a warning event: {v:#}" + ); + assert!(tmp + .path() + .join(format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz")) + .is_file()); + let lock = std::fs::read_to_string(tmp.path().join("package-lock.json")).unwrap(); + assert!(lock.contains(&format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz"))); + assert!( + !tmp.path().join("node_modules").exists(), + "the project tree is never touched" + ); +} + +/// Integrity mismatch between the lock and the served bytes is a distinct +/// vendor_fetch_failed 
failure — and nothing is written. +#[tokio::test] +async fn vendor_fetch_integrity_mismatch_is_vendor_fetch_failed() { + let mock = MockServer::start().await; + mount_registry_tarball(&mock, pristine_tgz()).await; + + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + &format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri()), + &sri_of(b"the lock expects different bytes"), + ); + seed_manifest_and_blob(tmp.path()); + + let (code, v, _) = run_vendor(tmp.path(), &[]); + assert_ne!(code, 0, "{v:#}"); + let events = v["events"].as_array().unwrap(); + assert!( + events + .iter() + .any(|e| e["action"] == "failed" && e["errorCode"] == "vendor_fetch_failed"), + "{v:#}" + ); + assert!( + !events + .iter() + .any(|e| e["errorCode"] == "package_not_installed"), + "no duplicate not-installed skip: {v:#}" + ); + assert!(!tmp.path().join(".socket/vendor").exists()); +} + +/// --offline refuses the fetch with a calm package_not_installed skip that +/// names the lockfile as the would-be source. No HTTP traffic happens (no +/// registry route is mounted — a request would 404 and fail differently). 
+#[tokio::test] +async fn vendor_offline_refuses_fetch_with_calm_skip() { + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + "http://127.0.0.1:1/left-pad/-/left-pad-1.3.0.tgz", + &sri_of(b"irrelevant"), + ); + seed_manifest_and_blob(tmp.path()); + + let (code, v, _) = run_vendor(tmp.path(), &["--offline"]); + assert_ne!(code, 0, "not-installed stays a non-benign skip: {v:#}"); + let events = v["events"].as_array().unwrap(); + let skip = events + .iter() + .find(|e| e["errorCode"] == "package_not_installed") + .unwrap_or_else(|| panic!("{v:#}")); + assert!( + skip["reason"] + .as_str() + .unwrap_or("") + .contains("--offline prevents fetching"), + "offline detail names the lockfile resolution: {v:#}" + ); +} + +/// An entry whose lock records no integrity is never fetched (fail-closed) +/// and keeps the plain not-installed outcome plus an explanatory warning. +#[tokio::test] +async fn vendor_fetch_unverifiable_lock_entry_stays_not_installed() { + let tmp = tempfile::tempdir().unwrap(); + // Hand-write a lock whose entry has no integrity field. 
+ std::fs::write( + tmp.path().join("package.json"), + r#"{ "name": "x", "version": "0.0.0" }"#, + ) + .unwrap(); + std::fs::write( + tmp.path().join("package-lock.json"), + serde_json::to_vec_pretty(&serde_json::json!({ + "name": "x", "version": "0.0.0", "lockfileVersion": 3, + "packages": { + "": { "name": "x", "version": "0.0.0" }, + "node_modules/left-pad": { + "version": "1.3.0", + "resolved": "http://127.0.0.1:1/left-pad/-/left-pad-1.3.0.tgz" + } + } + })) + .unwrap(), + ) + .unwrap(); + seed_manifest_and_blob(tmp.path()); + + let (code, v, _) = run_vendor(tmp.path(), &[]); + assert_ne!(code, 0, "{v:#}"); + let events = v["events"].as_array().unwrap(); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "vendor_fetch_unverifiable"), + "{v:#}" + ); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "package_not_installed"), + "{v:#}" + ); +} + +/// The headline flow: a COMPLETELY fresh clone (lockfile, no node_modules, +/// no .socket) discovers from the lockfile and `scan --vendor` vendors +/// end-to-end via the registry fetch. 
+#[tokio::test] +async fn scan_vendor_works_on_a_completely_fresh_clone() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tgz = pristine_tgz(); + let integrity = sri_of(&tgz); + mount_registry_tarball(&mock, tgz).await; + + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + &format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri()), + &integrity, + ); + + let (code, stdout, stderr) = run_scan_vendor(tmp.path(), &mock.uri(), &[]); + assert_eq!(code, 0, "stdout={stdout}; stderr={stderr}"); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(v["lockfileOnlyPackages"], 1, "{v}"); + assert_eq!(v["vendor"]["summary"]["applied"], 1, "{v}"); + assert!(tmp + .path() + .join(format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz")) + .is_file()); + assert!(!tmp.path().join("node_modules").exists()); + + // Second run: in sync. + let (code, stdout, stderr) = run_scan_vendor(tmp.path(), &mock.uri(), &[]); + assert_eq!(code, 0, "stdout={stdout}; stderr={stderr}"); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + let events = v["vendor"]["events"].as_array().unwrap(); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "already_vendored"), + "{v}" + ); +} + +/// Read-only discovery flags lockfile-only packages in JSON and the human +/// table. +#[tokio::test] +async fn scan_discovers_lockfile_only_packages_with_warning() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + &sri_of(b"unused for discovery"), + ); + + // JSON shape. 
+ let out = Command::new(binary()) + .args([ + "scan", "--json", "--api-url", &mock.uri(), "--api-token", "fake-token", + "--org", ORG_SLUG, + ]) + .current_dir(tmp.path()) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(v["scannedPackages"], 1, "{v}"); + assert_eq!(v["lockfileOnlyPackages"], 1, "{v}"); + assert_eq!(v["packages"][0]["notInstalled"], true, "{v}"); + + // Human output: the table marker + the note. + let out = Command::new(binary()) + .args([ + "scan", "--api-url", &mock.uri(), "--api-token", "fake-token", + "--org", ORG_SLUG, "--dry-run", "--yes", + ]) + .current_dir(tmp.path()) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let stderr = String::from_utf8_lossy(&out.stderr); + assert!( + stdout.contains("[NOT INSTALLED]"), + "stdout={stdout}; stderr={stderr}" + ); + assert!( + stderr.contains("not yet installed (lockfile-only)"), + "stderr={stderr}" + ); +} + +/// `scan --apply` skips lockfile-only patches calmly: exit 0, a skipped +/// record with package_not_installed, and NO manifest entry written. 
+#[tokio::test] +async fn scan_apply_skips_lockfile_only_without_error() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + &sri_of(b"unused"), + ); + + let out = Command::new(binary()) + .args([ + "scan", "--json", "--apply", "--yes", "--api-url", &mock.uri(), + "--api-token", "fake-token", "--org", ORG_SLUG, + ]) + .current_dir(tmp.path()) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let code = out.status.code().unwrap_or(-1); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(code, 0, "lockfile-only must not flip the exit code: {v}"); + assert_eq!(v["status"], "success", "{v}"); + let patches = v["apply"]["patches"].as_array().unwrap(); + assert!( + patches.iter().any(|p| p["action"] == "skipped" + && p["errorCode"] == "package_not_installed"), + "{v}" + ); + assert!( + !tmp.path().join(".socket/manifest.json").exists(), + "no manifest entry is written for a not-installed package" + ); +} diff --git a/crates/socket-patch-core/src/patch/vendor/bun_lock.rs b/crates/socket-patch-core/src/patch/vendor/bun_lock.rs index f1b3c51..f35a2c2 100644 --- a/crates/socket-patch-core/src/patch/vendor/bun_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/bun_lock.rs @@ -423,7 +423,7 @@ fn revert_one_record( // ───────────────────────── conservative line grammar ────────────────────── /// One parsed single-line packages entry. -struct BunEntry { +pub(super) struct BunEntry { line_idx: usize, /// Leading whitespace, re-emitted verbatim. indent: String, @@ -432,7 +432,7 @@ struct BunEntry { /// The key token exactly as spelled (incl. quotes), re-emitted verbatim. key_raw: String, /// Verbatim top-level tuple elements (trimmed). 
- elems: Vec, + pub(super) elems: Vec, trailing_comma: bool, } @@ -472,14 +472,14 @@ fn classify(entry: &BunEntry, target_spec: &str, name: &str) -> Option Option<(&str, &str)> { +pub(super) fn split_name_spec(s: &str) -> Option<(&str, &str)> { let at = s.rfind('@').filter(|&i| i > 0)?; Some((&s[..at], &s[at + 1..])) } /// `"lockfileVersion": ` head check — only the fixture-pinned text /// lockfile version is spliced (fail-closed on anything newer/older). -fn check_lock_version(text: &str) -> Result<(), String> { +pub(super) fn check_lock_version(text: &str) -> Result<(), String> { let version = text.lines().take(5).find_map(|line| { line.trim() .strip_prefix("\"lockfileVersion\":") @@ -514,7 +514,7 @@ fn packages_bounds(lines: &[String]) -> Option<(usize, usize)> { /// Strictly parse every entry line of the packages section. Any line that /// is neither blank nor a single-line `"key": [tuple]` entry fails CLOSED. -fn parse_packages_section(lines: &[String]) -> Result, String> { +pub(super) fn parse_packages_section(lines: &[String]) -> Result, String> { let Some((start, end)) = packages_bounds(lines) else { // No (or unterminated) packages section: an empty lock simply has // no entries; an unterminated one is malformed. @@ -651,7 +651,7 @@ fn split_top_level(interior: &str) -> Result, String> { } /// Decode a verbatim JSON string token; `None` if it is not one. -fn decode_json_string(token: &str) -> Option { +pub(super) fn decode_json_string(token: &str) -> Option { if !token.starts_with('"') { return None; } diff --git a/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs new file mode 100644 index 0000000..476353c --- /dev/null +++ b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs @@ -0,0 +1,1576 @@ +//! Read-only lockfile inventories: the dependency set a project's lockfile +//! resolves, independent of what is installed on disk. +//! +//! Two consumers: +//! +//! 
* `scan` supplements its installed-tree crawl with lockfile-only entries +//! (discovery on fresh clones and partial installs), warning that those +//! packages are not yet installed; +//! * `vendor` fetches the pristine artifact for a lockfile-resolved package +//! with no installed copy ([`super::registry_fetch`]), verifying the bytes +//! against the integrity the lock records — FAIL-CLOSED: an entry whose +//! lock carries no content verifier is never fetched. +//! +//! Parsing is fail-soft per entry (a malformed entry is skipped, never an +//! error; a malformed file yields `None`) and fail-closed per value: +//! names/versions are path-safety-guarded before an entry is emitted — the +//! lockfile is committed, tamperable input that later feeds filesystem paths +//! and download URLs. + +use std::collections::HashMap; +use std::path::Path; + +use serde_json::Value; + +use crate::patch::path_safety; +use crate::utils::purl::strip_purl_qualifiers; + +use super::npm_common::is_safe_npm_name; +use super::npm_flavor::{detect_npm_lock_flavor, NpmLockFlavor}; +use super::path::parse_vendor_path; +use super::{bun_lock, pnpm_lock, yarn_berry_lock, yarn_classic_lock}; + +/// The content verifier a lockfile records for an entry. The fetch layer +/// refuses entries whose verifier is [`LockIntegrity::None`]. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum LockIntegrity { + /// SRI string (`sha512-`, possibly multi-hash space-separated) — + /// npm family; verified against the raw tarball bytes. + Sri(String), + /// yarn classic `resolved "...#"` fragment (40-hex) — verified + /// against the raw tarball bytes. + Sha1Hex(String), + /// yarn berry cache-zip checksum (`/`, e.g. `10c0/…`) — + /// verified by rebuilding the deterministic cache zip from the fetched + /// tarball and comparing (the lock never hashes the tarball itself). + BerryChecksum(String), + /// Hex sha256 of the artifact (Cargo.lock `checksum`, pypi file hashes, + /// Gemfile.lock `CHECKSUMS`). 
+ Sha256Hex(String), + /// go.sum module-zip dirhash (`h1:`). + GoH1(String), + /// The lock records no content verifier. + None, +} + +/// One lockfile-resolved package. +#[derive(Debug, Clone)] +pub struct LockfileEntry { + /// Vendor-ecosystem tag (`npm`, `cargo`, `golang`, `pypi`, `gem`, + /// `composer`) — matches `VendorEntry::ecosystem`. + pub ecosystem: &'static str, + /// Literal (percent-decoded) package name, e.g. `@scope/name`. + pub name: String, + /// Exact resolved version. + pub version: String, + /// Canonical literal purl (`pkg:npm/@scope/name@1.0.0`) — the same form + /// the crawlers emit. + pub purl: String, + /// Artifact URL when the lock records one (package-lock `resolved`, + /// yarn `resolved` minus its `#sha1` fragment, pnpm `tarball:`); `None` + /// means the fetcher constructs the conventional registry URL. + pub resolved: Option, + pub integrity: LockIntegrity, +} + +impl LockfileEntry { + fn npm( + name: impl Into, + version: impl Into, + resolved: Option, + integrity: LockIntegrity, + ) -> Self { + let (name, version) = (name.into(), version.into()); + let purl = format!("pkg:npm/{name}@{version}"); + LockfileEntry { + ecosystem: "npm", + name, + version, + purl, + resolved, + integrity, + } + } +} + +/// Inventory the project's npm-family lockfile. Routes by +/// [`detect_npm_lock_flavor`] (PnP markers, bun.lockb, unsupported lock +/// versions, and a missing lockfile all yield `None`). 
+pub async fn inventory_npm_lock( + project_root: &Path, +) -> Option<(NpmLockFlavor, Vec)> { + let (flavor, _warnings) = detect_npm_lock_flavor(project_root).await.ok()?; + let raw = match flavor { + NpmLockFlavor::PackageLock => inventory_package_lock(project_root).await, + NpmLockFlavor::Pnpm => inventory_pnpm_lock(project_root).await, + NpmLockFlavor::YarnClassic => inventory_yarn_classic(project_root).await, + NpmLockFlavor::YarnBerry => inventory_yarn_berry(project_root).await, + NpmLockFlavor::Bun => inventory_bun(project_root).await, + }?; + Some((flavor, finalize_npm(raw))) +} + +/// Match a manifest/API purl (possibly percent-encoded, possibly carrying +/// qualifiers) against the inventory: components decode via +/// [`crate::utils::purl::normalize_purl`], so `pkg:npm/%40scope/x@1` +/// matches the literal entry. +pub fn lookup<'a>(entries: &'a [LockfileEntry], purl: &str) -> Option<&'a LockfileEntry> { + let decoded = crate::utils::purl::normalize_purl(strip_purl_qualifiers(purl)).into_owned(); + let rest = decoded.strip_prefix("pkg:")?; + let (purl_type, rest) = rest.split_once('/')?; + // purl type → vendor-ecosystem tag (same mapping the dispatcher uses). + let eco = match purl_type { + "npm" => "npm", + "cargo" => "cargo", + "golang" => "golang", + "pypi" => "pypi", + "gem" => "gem", + "composer" => "composer", + _ => return None, + }; + let at = rest.rfind('@').filter(|&i| i > 0)?; + let (name, version) = (&rest[..at], &rest[at + 1..]); + // pypi names compare in PEP 503 normalized form. + let name = if eco == "pypi" { + pep503(name) + } else { + name.to_string() + }; + entries + .iter() + .find(|e| e.ecosystem == eco && e.name == name && e.version == version) +} + +/// Everything every recognized lockfile in the project resolves — the +/// union the scan supplement and the vendor auto-fetch consume. 
+pub async fn inventory_project(project_root: &Path) -> Vec { + let mut out: Vec = Vec::new(); + if let Some((_, entries)) = inventory_npm_lock(project_root).await { + out.extend(entries); + } + #[cfg(feature = "cargo")] + if let Some(entries) = inventory_cargo_lock(project_root).await { + out.extend(entries); + } + #[cfg(feature = "golang")] + if let Some(entries) = inventory_go_sum(project_root).await { + out.extend(entries); + } + #[cfg(feature = "composer")] + if let Some(entries) = inventory_composer_lock(project_root).await { + out.extend(entries); + } + if let Some(entries) = inventory_gemfile_lock(project_root).await { + out.extend(entries); + } + if let Some(entries) = inventory_pypi_locks(project_root).await { + out.extend(entries); + } + out +} + +/// Guard + dedup the raw npm entries: unsafe names/versions are dropped +/// fail-closed; duplicate (name, version) instances collapse to one, +/// preferring the instance that carries a verifier. +fn finalize_npm(raw: Vec) -> Vec { + dedup_prefer_integrity( + raw.into_iter() + .filter(|e| { + is_safe_npm_name(&e.name) && path_safety::is_safe_single_segment(&e.version) + }) + .collect(), + ) +} + +/// Collapse duplicate (name, version) instances, preferring one that +/// carries a verifier. +fn dedup_prefer_integrity(raw: Vec) -> Vec { + let mut seen: HashMap<(String, String), usize> = HashMap::new(); + let mut out: Vec = Vec::new(); + for entry in raw { + let key = (entry.name.clone(), entry.version.clone()); + match seen.get(&key) { + Some(&i) => { + if out[i].integrity == LockIntegrity::None + && entry.integrity != LockIntegrity::None + { + out[i] = entry; + } + } + None => { + seen.insert(key, out.len()); + out.push(entry); + } + } + } + out +} + +// ──────────────────────────────── Cargo.lock ──────────────────────────────── + +/// Inventory `Cargo.lock` `[[package]]` blocks. 
Only crates.io-sourced +/// entries are fetchable (their `checksum` is the sha256 of the `.crate` +/// file); workspace members (no `source`) are skipped, and git/custom- +/// registry sources stay listed for discovery without a verifier. +#[cfg(feature = "cargo")] +pub async fn inventory_cargo_lock(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("Cargo.lock")) + .await + .ok()?; + /// One in-flight `[[package]]` block: name, version, source, checksum. + type CargoBlock = (Option, Option, Option, Option); + let mut out = Vec::new(); + let mut cur: Option = None; + let flush = |cur: &mut Option, out: &mut Vec| { + if let Some((Some(name), Some(version), source, checksum)) = cur.take() { + let Some(source) = source else { + return; // workspace member + }; + if !path_safety::is_safe_single_segment(&name) + || !path_safety::is_safe_single_segment(&version) + { + return; + } + let crates_io = source.contains("github.com/rust-lang/crates.io-index") + || source.contains("index.crates.io"); + let integrity = match checksum { + Some(c) if crates_io && c.len() == 64 && c.bytes().all(|b| b.is_ascii_hexdigit()) => { + LockIntegrity::Sha256Hex(c) + } + _ => LockIntegrity::None, + }; + let purl = format!("pkg:cargo/{name}@{version}"); + out.push(LockfileEntry { + ecosystem: "cargo", + name, + version, + purl, + resolved: None, + integrity, + }); + } + }; + for line in text.lines() { + let line = line.trim(); + if line == "[[package]]" { + flush(&mut cur, &mut out); + cur = Some((None, None, None, None)); + continue; + } + if line.starts_with('[') { + flush(&mut cur, &mut out); + continue; + } + let Some(slot) = cur.as_mut() else { continue }; + let Some((key, value)) = line.split_once('=') else { + continue; + }; + let value = value.trim().trim_matches('"').to_string(); + match key.trim() { + "name" => slot.0 = Some(value), + "version" => slot.1 = Some(value), + "source" => slot.2 = Some(value), + "checksum" => slot.3 = Some(value), + 
_ => {} + } + } + flush(&mut cur, &mut out); + Some(dedup_prefer_integrity(out)) +} + +// ────────────────────────────────── go.sum ────────────────────────────────── + +/// Inventory `go.sum` module-zip lines (` h1:`); the +/// `/go.mod`-suffixed lines hash only the manifest and are skipped. go.sum +/// may list more modules than the final build graph — acceptable for +/// discovery, and the manifest decides what actually gets vendored. +#[cfg(feature = "golang")] +pub async fn inventory_go_sum(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("go.sum")) + .await + .ok()?; + let mut out = Vec::new(); + for line in text.lines() { + let mut parts = line.split_whitespace(); + let (Some(module), Some(version), Some(hash)) = + (parts.next(), parts.next(), parts.next()) + else { + continue; + }; + if version.ends_with("/go.mod") || !hash.starts_with("h1:") { + continue; + } + // SECURITY: module path segments and the version feed paths/URLs. + if !path_safety::is_safe_multi_segment(module) + || !path_safety::is_safe_single_segment(version) + { + continue; + } + out.push(LockfileEntry { + ecosystem: "golang", + name: module.to_string(), + version: version.to_string(), + purl: format!("pkg:golang/{module}@{version}"), + resolved: None, + integrity: LockIntegrity::GoH1(hash.to_string()), + }); + } + Some(dedup_prefer_integrity(out)) +} + +/// Keep a lock-recorded URL only when it is a plain http(s) artifact URL +/// (drops `git+…`, `file:…`, `link:…` — content the registry conventions +/// cannot reproduce; such entries stay listed for discovery but the fetch +/// layer's integrity rule decides fetchability). 
+fn http_url(raw: &str) -> Option { + (raw.starts_with("https://") || raw.starts_with("http://")).then(|| raw.to_string()) +} + +// ──────────────────── package-lock.json / npm-shrinkwrap ──────────────────── + +async fn inventory_package_lock(root: &Path) -> Option> { + // Shrinkwrap wins, mirroring `npm_lock::select_lockfile`. + let mut bytes = None; + for lock in ["npm-shrinkwrap.json", "package-lock.json"] { + if let Ok(b) = tokio::fs::read(root.join(lock)).await { + bytes = Some(b); + break; + } + } + let doc: Value = serde_json::from_slice(&bytes?).ok()?; + // v1 legacy locks have no `packages` map — no inventory (documented). + let packages = doc.get("packages")?.as_object()?; + + let mut out = Vec::new(); + for (key, node) in packages { + // "" is the root project; keys without node_modules/ are workspace + // members (mirrors npm_lock::scan_lock_matches' member rule). + let Some((_, key_name)) = key.rsplit_once("node_modules/") else { + continue; + }; + if node.get("link").and_then(Value::as_bool).unwrap_or(false) + || node.get("inBundle").and_then(Value::as_bool).unwrap_or(false) + { + continue; + } + let name = node + .get("name") + .and_then(Value::as_str) + .unwrap_or(key_name) + .to_string(); + let Some(version) = node.get("version").and_then(Value::as_str) else { + continue; + }; + let resolved_raw = node.get("resolved").and_then(Value::as_str); + // Our own vendored spec: not a registry dependency. 
+ if resolved_raw.is_some_and(|r| parse_vendor_path(r).is_some()) { + continue; + } + let integrity = node + .get("integrity") + .and_then(Value::as_str) + .map(|i| LockIntegrity::Sri(i.to_string())) + .unwrap_or(LockIntegrity::None); + out.push(LockfileEntry::npm( + name, + version, + resolved_raw.and_then(http_url), + integrity, + )); + } + Some(out) +} + +// ─────────────────────────── pnpm-lock.yaml v9 ─────────────────────────── + +/// Extract one value from an inline YAML map fragment like +/// `{integrity: sha512-…, tarball: file:…}` (values optionally quoted). +fn inline_map_value(fragment: &str, field: &str) -> Option { + let at = fragment.find(&format!("{field}:"))?; + let rest = fragment[at + field.len() + 1..].trim_start(); + let end = rest.find([',', '}']).unwrap_or(rest.len()); + let value = rest[..end].trim().trim_matches(['\'', '"']); + (!value.is_empty()).then(|| value.to_string()) +} + +async fn inventory_pnpm_lock(root: &Path) -> Option> { + let text = tokio::fs::read_to_string(root.join("pnpm-lock.yaml")) + .await + .ok()?; + let lines = pnpm_lock::split_lines(&text); + let (start, end) = pnpm_lock::section_bounds(&lines, "packages")?; + + let mut out = Vec::new(); + let mut i = start + 1; + while let Some(block) = pnpm_lock::next_block(&lines, i, end) { + i = block.end; + // Key grammar: `name@version` (name may be `@scope/name`), with + // optional peer-dep suffixes `(peer@1.2.3)…` after the version. + let base = match block.key.find('(') { + Some(p) => block.key[..p].trim_end(), + None => block.key.as_str(), + }; + let Some(at) = base.rfind('@').filter(|&p| p > 0) else { + continue; + }; + let (name, version) = (&base[..at], &base[at + 1..]); + // Only plain registry versions: `file:`/`link:`/`https:`/git specs + // are not registry-resolvable. 
+ if !version.chars().next().is_some_and(|c| c.is_ascii_digit()) { + continue; + } + let mut integrity = LockIntegrity::None; + let mut tarball: Option = None; + for line in &lines[block.header + 1..block.end] { + let t = line.trim(); + if let Some(rest) = t.strip_prefix("resolution:") { + if let Some(v) = inline_map_value(rest, "integrity") { + integrity = LockIntegrity::Sri(v); + } + tarball = inline_map_value(rest, "tarball"); + break; + } + } + // Our own vendored spec: not a registry dependency. + if tarball.as_deref().is_some_and(|t| parse_vendor_path(t).is_some()) { + continue; + } + out.push(LockfileEntry::npm( + name, + version, + tarball.as_deref().and_then(http_url), + integrity, + )); + } + Some(out) +} + +// ───────────────────────────── yarn.lock (classic) ───────────────────────────── + +async fn inventory_yarn_classic(root: &Path) -> Option> { + let text = tokio::fs::read_to_string(root.join("yarn.lock")).await.ok()?; + let mut out = Vec::new(); + for block in yarn_classic_lock::scan_blocks(&text) { + // Our own vendored block: not a registry dependency. + if yarn_classic_lock::block_points_into_vendor(&block.lines) { + continue; + } + let patterns = yarn_classic_lock::split_key_patterns(&block.key); + let Some(name) = patterns + .first() + .and_then(|p| yarn_classic_lock::pattern_real_name(p)) + else { + continue; + }; + let Some(version) = yarn_classic_lock::classic_field(&block.lines, "version") else { + continue; + }; + let resolved_raw = yarn_classic_lock::classic_field(&block.lines, "resolved"); + // `resolved "url#sha1hex"` — the fragment is the legacy verifier. 
+ let (resolved, sha1_hex) = match resolved_raw { + Some(raw) => match raw.split_once('#') { + Some((url, frag)) => ( + http_url(url), + (frag.len() == 40 && frag.bytes().all(|b| b.is_ascii_hexdigit())) + .then(|| frag.to_ascii_lowercase()), + ), + None => (http_url(raw), None), + }, + None => (None, None), + }; + let integrity = yarn_classic_lock::classic_field(&block.lines, "integrity") + .map(|i| LockIntegrity::Sri(i.to_string())) + .or(sha1_hex.map(LockIntegrity::Sha1Hex)) + .unwrap_or(LockIntegrity::None); + out.push(LockfileEntry::npm(name, version, resolved, integrity)); + } + Some(out) +} + +// ───────────────────────────── yarn.lock (berry) ───────────────────────────── + +async fn inventory_yarn_berry(root: &Path) -> Option> { + let text = tokio::fs::read_to_string(root.join("yarn.lock")).await.ok()?; + let mut out = Vec::new(); + // Berry reuses classic's block grammar (same scanner the berry backend + // imports); `__metadata` and workspace/patch/file resolutions are not + // registry packages. + for block in yarn_classic_lock::scan_blocks(&text) { + if block.key.starts_with("__metadata") { + continue; + } + let Some(resolution) = yarn_berry_lock::berry_field(&block.lines, "resolution") else { + continue; + }; + // Registry resolutions are `name@npm:` (a `::binding` + // suffix may follow). Anything else (workspace:/patch:/file:/link:) + // is skipped — including our own vendored file: resolutions. 
+ let Some((name, reference)) = yarn_classic_lock::split_pattern(resolution) else { + continue; + }; + let Some(reference) = reference.strip_prefix("npm:") else { + continue; + }; + let version_from_res = reference.split("::").next().unwrap_or(reference); + let version = yarn_berry_lock::berry_field(&block.lines, "version") + .unwrap_or(version_from_res); + let integrity = yarn_berry_lock::berry_field(&block.lines, "checksum") + .map(|c| LockIntegrity::BerryChecksum(c.to_string())) + .unwrap_or(LockIntegrity::None); + out.push(LockfileEntry::npm(name, version, None, integrity)); + } + Some(out) +} + +// ──────────────────────────────── bun.lock ──────────────────────────────── + +async fn inventory_bun(root: &Path) -> Option> { + let text = tokio::fs::read_to_string(root.join("bun.lock")).await.ok()?; + bun_lock::check_lock_version(&text).ok()?; + let lines: Vec = text.split('\n').map(str::to_string).collect(); + let entries = bun_lock::parse_packages_section(&lines).ok()?; + + let mut out = Vec::new(); + for entry in entries { + // Registry entries are 4-tuples `[spec, registry, {deps}, sha512]`; + // our vendored 3-tuples and other shapes are skipped. + if entry.elems.len() != 4 || !entry.elems[2].starts_with('{') { + continue; + } + let Some(spec) = entry.elems.first().and_then(|e| bun_lock::decode_json_string(e)) + else { + continue; + }; + let Some((name, version)) = bun_lock::split_name_spec(&spec) else { + continue; + }; + if !version.chars().next().is_some_and(|c| c.is_ascii_digit()) { + continue; + } + let Some(registry) = bun_lock::decode_json_string(&entry.elems[1]) else { + continue; + }; + let Some(integrity) = bun_lock::decode_json_string(&entry.elems[3]) else { + continue; + }; + // elem[1] is `""` for the default registry; a full `.tgz` URL is + // used verbatim; any other base falls back to conventional URL + // construction (the integrity check still gates the content). 
+ let resolved = (registry.ends_with(".tgz")) + .then(|| http_url(®istry)) + .flatten(); + out.push(LockfileEntry::npm( + name, + version, + resolved, + LockIntegrity::Sri(integrity), + )); + } + Some(out) +} + +// ────────────────────────────── composer.lock ────────────────────────────── + +/// Inventory `composer.lock` `packages`/`packages-dev`. The `dist.shasum` +/// (sha1 of the dist zip) is frequently empty — such entries stay +/// discovery-only. Names lowercase to the canonical packagist form; +/// versions drop the pretty leading `v`. +#[cfg(feature = "composer")] +pub async fn inventory_composer_lock(project_root: &Path) -> Option> { + let bytes = tokio::fs::read(project_root.join("composer.lock")).await.ok()?; + let doc: Value = serde_json::from_slice(&bytes).ok()?; + let mut out = Vec::new(); + for section in ["packages", "packages-dev"] { + let Some(list) = doc.get(section).and_then(Value::as_array) else { + continue; + }; + for pkg in list { + let Some(name) = pkg.get("name").and_then(Value::as_str) else { + continue; + }; + let Some(version) = pkg.get("version").and_then(Value::as_str) else { + continue; + }; + let name = name.to_ascii_lowercase(); + let version = version + .strip_prefix('v') + .filter(|r| r.chars().next().is_some_and(|c| c.is_ascii_digit())) + .unwrap_or(version) + .to_string(); + if !path_safety::is_safe_multi_segment(&name) + || name.split('/').count() != 2 + || !path_safety::is_safe_single_segment(&version) + { + continue; + } + let dist = pkg.get("dist"); + let dist_url = dist + .and_then(|d| d.get("url")) + .and_then(Value::as_str) + .unwrap_or(""); + // Our own vendored entries use a path dist — skip. 
+ if dist + .and_then(|d| d.get("type")) + .and_then(Value::as_str) + .is_some_and(|t| t == "path") + || parse_vendor_path(dist_url).is_some() + { + continue; + } + let is_zip = dist + .and_then(|d| d.get("type")) + .and_then(Value::as_str) + .is_some_and(|t| t == "zip"); + let shasum = dist + .and_then(|d| d.get("shasum")) + .and_then(Value::as_str) + .unwrap_or(""); + let integrity = if is_zip + && shasum.len() == 40 + && shasum.bytes().all(|b| b.is_ascii_hexdigit()) + { + LockIntegrity::Sha1Hex(shasum.to_ascii_lowercase()) + } else { + LockIntegrity::None + }; + let purl = format!("pkg:composer/{name}@{version}"); + out.push(LockfileEntry { + ecosystem: "composer", + name, + version, + purl, + resolved: is_zip.then(|| http_url(dist_url)).flatten(), + integrity, + }); + } + } + Some(dedup_prefer_integrity(out)) +} + +// ────────────────────────────── Gemfile.lock ────────────────────────────── + +/// Inventory `Gemfile.lock`: `GEM`-section `specs:` entries (4-space +/// indent; deeper lines are dependency ranges) plus the bundler ≥ 2.6 +/// `CHECKSUMS` section's sha256 values when present (older locks stay +/// discovery-only). Platform-suffixed specs (`nokogiri (1.16.5-arm64-…)`) +/// are skipped — platform gems are unsupported for vendoring anyway. 
+pub async fn inventory_gemfile_lock(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("Gemfile.lock")) + .await + .ok()?; + let mut remote: Option = None; + let mut checksums: HashMap<(String, String), String> = HashMap::new(); + let mut specs: Vec<(String, String)> = Vec::new(); + + let mut section = ""; + let mut in_specs = false; + for line in text.lines() { + if !line.starts_with(' ') { + section = line.trim(); + in_specs = false; + continue; + } + let trimmed = line.trim_start(); + let indent = line.len() - trimmed.len(); + match section { + "GEM" => { + if indent == 2 { + if let Some(r) = trimmed.strip_prefix("remote:") { + let r = r.trim().trim_end_matches('/'); + if remote.is_none() && !r.is_empty() { + remote = Some(r.to_string()); + } + } + in_specs = trimmed == "specs:"; + } else if in_specs && indent == 4 { + if let Some((name, version)) = parse_gem_spec_line(trimmed) { + specs.push((name, version)); + } + } + } + "CHECKSUMS" => { + // ` name (version) sha256=hex` + if let Some((spec_part, hash_part)) = + trimmed.rsplit_once(" sha256=").map(|(s, h)| (s, h.trim())) + { + if let Some((name, version)) = parse_gem_spec_line(spec_part) { + if hash_part.len() == 64 + && hash_part.bytes().all(|b| b.is_ascii_hexdigit()) + { + checksums + .insert((name, version), hash_part.to_ascii_lowercase()); + } + } + } + } + _ => {} + } + } + if specs.is_empty() { + return None; + } + let base = remote.unwrap_or_else(|| "https://rubygems.org".to_string()); + let mut out = Vec::new(); + for (name, version) in specs { + if !path_safety::is_safe_single_segment(&name) + || !path_safety::is_safe_single_segment(&version) + { + continue; + } + let integrity = checksums + .get(&(name.clone(), version.clone())) + .map(|h| LockIntegrity::Sha256Hex(h.clone())) + .unwrap_or(LockIntegrity::None); + out.push(LockfileEntry { + ecosystem: "gem", + purl: format!("pkg:gem/{name}@{version}"), + resolved: 
http_url(&format!("{base}/downloads/{name}-{version}.gem")), + name, + version, + integrity, + }); + } + Some(dedup_prefer_integrity(out)) +} + +/// `name (version)` → parts; platform-suffixed versions (`1.2.3-x86_64…`) +/// and dependency lines (no parens / range operators) yield `None`. +fn parse_gem_spec_line(line: &str) -> Option<(String, String)> { + let (name, rest) = line.split_once(" (")?; + let version = rest.strip_suffix(')')?; + if name.is_empty() + || version.is_empty() + || version.contains(' ') + || version.contains('-') + || !version.chars().next().is_some_and(|c| c.is_ascii_digit()) + { + return None; + } + Some((name.to_string(), version.to_string())) +} + +// ─────────────────────────────── pypi locks ─────────────────────────────── + +/// PEP 503 name normalization (`Foo._Bar` → `foo-bar`) — pypi purls and +/// lock entries must compare in this form. +fn pep503(name: &str) -> String { + let mut out = String::with_capacity(name.len()); + let mut last_dash = false; + for c in name.chars() { + let c = c.to_ascii_lowercase(); + if c == '-' || c == '_' || c == '.' { + if !last_dash { + out.push('-'); + last_dash = true; + } + } else { + out.push(c); + last_dash = false; + } + } + out +} + +/// Inventory the pypi lock the project carries. Fetchable resolution +/// (URL + sha256 of a pure `py3-none-any` wheel) comes from `uv.lock`; +/// `poetry.lock` and `--hash`-pinned `requirements.txt` contribute +/// DISCOVERY-only entries (no recorded URL; platform-independent wheel +/// choice is not derivable offline). Pipenv/pdm locks: not yet read. +pub async fn inventory_pypi_locks(project_root: &Path) -> Option> { + if let Some(out) = inventory_uv_lock(project_root).await { + return Some(out); + } + if let Some(out) = inventory_poetry_lock(project_root).await { + return Some(out); + } + inventory_requirements_txt(project_root).await +} + +/// uv.lock: TOML `[[package]]` blocks with `name`/`version` and +/// `wheels = [{ url, hash = "sha256:…" }, …]` entries. 
+async fn inventory_uv_lock(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("uv.lock")) + .await + .ok()?; + let mut out = Vec::new(); + // Line-oriented: uv emits `[[package]]` blocks; wheels live either as + // inline `{ url = "…", hash = "sha256:…" }` table rows or one-line + // arrays. A pure-python wheel ends `py3-none-any.whl`. + let mut name: Option = None; + let mut version: Option = None; + let mut sourced_registry = true; + let mut wheel: Option<(String, String)> = None; + let flush = |name: &mut Option, + version: &mut Option, + sourced_registry: &mut bool, + wheel: &mut Option<(String, String)>, + out: &mut Vec| { + if let (Some(n), Some(v)) = (name.take(), version.take()) { + let canonical = pep503(&n); + if *sourced_registry + && path_safety::is_safe_single_segment(&canonical) + && path_safety::is_safe_single_segment(&v) + { + let (resolved, integrity) = match wheel.take() { + Some((url, sha)) => (http_url(&url), LockIntegrity::Sha256Hex(sha)), + None => (None, LockIntegrity::None), + }; + out.push(LockfileEntry { + ecosystem: "pypi", + purl: format!("pkg:pypi/{canonical}@{v}"), + name: canonical, + version: v, + resolved, + integrity, + }); + } + } + *sourced_registry = true; + *wheel = None; + }; + for line in text.lines() { + let t = line.trim(); + if t == "[[package]]" { + flush(&mut name, &mut version, &mut sourced_registry, &mut wheel, &mut out); + continue; + } + if let Some(v) = t.strip_prefix("name = ") { + name = Some(v.trim_matches('"').to_string()); + } else if let Some(v) = t.strip_prefix("version = ") { + version = Some(v.trim_matches('"').to_string()); + } else if t.starts_with("source = ") { + // Registry packages: `source = { registry = "…" }`; editable/ + // virtual/path/git sources are not fetchable artifacts. 
+ sourced_registry = t.contains("registry"); + } else if wheel.is_none() && t.contains("py3-none-any.whl") { + // `{ url = "…py3-none-any.whl", hash = "sha256:…" }` + let url = t + .split("url = \"") + .nth(1) + .and_then(|r| r.split('"').next()) + .unwrap_or(""); + let sha = t + .split("hash = \"sha256:") + .nth(1) + .and_then(|r| r.split('"').next()) + .unwrap_or(""); + if !url.is_empty() && sha.len() == 64 && sha.bytes().all(|b| b.is_ascii_hexdigit()) { + wheel = Some((url.to_string(), sha.to_ascii_lowercase())); + } + } + } + flush(&mut name, &mut version, &mut sourced_registry, &mut wheel, &mut out); + Some(dedup_prefer_integrity(out)) +} + +/// poetry.lock: `[[package]]` blocks with `name`/`version` — discovery +/// only (file hashes exist but carry no URLs and no platform choice). +async fn inventory_poetry_lock(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("poetry.lock")) + .await + .ok()?; + let mut out = Vec::new(); + let mut in_package = false; + let mut name: Option = None; + for line in text.lines() { + let t = line.trim(); + if t == "[[package]]" { + in_package = true; + name = None; + continue; + } + if t.starts_with('[') && t != "[[package]]" { + in_package = false; + continue; + } + if !in_package { + continue; + } + if let Some(v) = t.strip_prefix("name = ") { + name = Some(pep503(v.trim_matches('"'))); + } else if let Some(v) = t.strip_prefix("version = ") { + if let Some(n) = name.take() { + let v = v.trim_matches('"').to_string(); + if path_safety::is_safe_single_segment(&n) + && path_safety::is_safe_single_segment(&v) + { + out.push(LockfileEntry { + ecosystem: "pypi", + purl: format!("pkg:pypi/{n}@{v}"), + name: n, + version: v, + resolved: None, + integrity: LockIntegrity::None, + }); + } + } + } + } + if out.is_empty() { + return None; + } + Some(dedup_prefer_integrity(out)) +} + +/// requirements.txt with exact `==` pins — discovery only. 
+async fn inventory_requirements_txt(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("requirements.txt")) + .await + .ok()?; + let mut out = Vec::new(); + for line in text.lines() { + let t = line.trim(); + if t.is_empty() || t.starts_with('#') || t.starts_with('-') { + continue; + } + // `name==version` (strip extras, env markers, hash continuations). + let spec = t.split(';').next().unwrap_or(t).trim(); + let spec = spec.split_whitespace().next().unwrap_or(spec); + let Some((raw_name, version)) = spec.split_once("==") else { + continue; + }; + let name = pep503(raw_name.split('[').next().unwrap_or(raw_name).trim()); + let version = version.trim().to_string(); + if name.is_empty() + || !path_safety::is_safe_single_segment(&name) + || !path_safety::is_safe_single_segment(&version) + || !version.chars().next().is_some_and(|c| c.is_ascii_digit()) + { + continue; + } + out.push(LockfileEntry { + ecosystem: "pypi", + purl: format!("pkg:pypi/{name}@{version}"), + name, + version, + resolved: None, + integrity: LockIntegrity::None, + }); + } + if out.is_empty() { + return None; + } + Some(dedup_prefer_integrity(out)) +} + +#[cfg(test)] +mod tests { + use super::*; + + async fn write(root: &Path, name: &str, content: &str) { + tokio::fs::write(root.join(name), content).await.unwrap(); + } + + fn entry<'a>(entries: &'a [LockfileEntry], name: &str) -> &'a LockfileEntry { + entries + .iter() + .find(|e| e.name == name) + .unwrap_or_else(|| panic!("no entry for {name}: {entries:?}")) + } + + // ── package-lock ────────────────────────────────────────────────────── + + const PACKAGE_LOCK: &str = r#"{ + "name": "fixture", + "version": "1.0.0", + "lockfileVersion": 3, + "packages": { + "": { "name": "fixture", "version": "1.0.0" }, + "packages/member": { "name": "member", "version": "0.0.1" }, + "node_modules/member": { "resolved": "packages/member", "link": true }, + "node_modules/left-pad": { + "version": "1.3.0", + "resolved": 
"https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + "integrity": "sha512-XI5MPz==" + }, + "node_modules/@scope/pkg": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/@scope/pkg/-/pkg-2.0.0.tgz", + "integrity": "sha512-scoped==" + }, + "node_modules/bundled-dep": { + "version": "1.0.0", + "inBundle": true + }, + "node_modules/git-dep": { + "version": "0.5.0", + "resolved": "git+ssh://git@github.com/x/git-dep.git#abc" + }, + "node_modules/vendored": { + "version": "3.0.0", + "resolved": "file:.socket/vendor/npm/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored-3.0.0.tgz", + "integrity": "sha512-ours==" + }, + "node_modules/evil": { + "version": "../../escape", + "resolved": "https://registry.npmjs.org/evil/-/evil-1.0.0.tgz", + "integrity": "sha512-evil==" + } + } +} +"#; + + #[tokio::test] + async fn package_lock_inventories_registry_entries() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "package-lock.json", PACKAGE_LOCK).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::PackageLock); + + let lp = entry(&entries, "left-pad"); + assert_eq!(lp.version, "1.3.0"); + assert_eq!(lp.purl, "pkg:npm/left-pad@1.3.0"); + assert_eq!( + lp.resolved.as_deref(), + Some("https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz") + ); + assert_eq!(lp.integrity, LockIntegrity::Sri("sha512-XI5MPz==".into())); + + let scoped = entry(&entries, "@scope/pkg"); + assert_eq!(scoped.purl, "pkg:npm/@scope/pkg@2.0.0"); + + // git deps stay listed (discovery) but carry no fetchable URL. + let git = entry(&entries, "git-dep"); + assert_eq!(git.resolved, None); + assert_eq!(git.integrity, LockIntegrity::None); + + // Workspace members, links, bundled deps, our vendored spec, and + // the unsafe-version entry are all absent. 
+ for absent in ["member", "fixture", "bundled-dep", "vendored", "evil"] { + assert!( + !entries.iter().any(|e| e.name == absent), + "{absent} must not be inventoried: {entries:?}" + ); + } + } + + #[tokio::test] + async fn shrinkwrap_wins_over_package_lock() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "package-lock.json", PACKAGE_LOCK).await; + write( + tmp.path(), + "npm-shrinkwrap.json", + r#"{ "lockfileVersion": 3, "packages": { + "node_modules/only-in-shrinkwrap": { "version": "9.9.9" } } }"#, + ) + .await; + + let (_, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert!(entries.iter().any(|e| e.name == "only-in-shrinkwrap")); + assert!(!entries.iter().any(|e| e.name == "left-pad")); + } + + #[tokio::test] + async fn legacy_v1_lock_without_packages_map_yields_none() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "package-lock.json", + r#"{ "lockfileVersion": 1, "dependencies": { "left-pad": { "version": "1.3.0" } } }"#, + ) + .await; + assert!(inventory_npm_lock(tmp.path()).await.is_none()); + } + + // ── pnpm ────────────────────────────────────────────────────────────── + + const PNPM_LOCK: &str = "lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + +importers: + + .: + dependencies: + left-pad: + specifier: 1.3.0 + version: 1.3.0 + +packages: + + left-pad@1.3.0: + resolution: {integrity: sha512-XI5MPz==} + + '@scope/pkg@2.0.0': + resolution: {integrity: sha512-scoped==} + + peer-user@4.0.0(left-pad@1.3.0): + resolution: {integrity: sha512-peer==} + + local-thing@file:packages/local: + resolution: {directory: packages/local, type: directory} + + vendored@file:.socket/vendor/npm/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored-3.0.0.tgz: + resolution: {integrity: sha512-ours==, tarball: file:.socket/vendor/npm/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored-3.0.0.tgz} + +snapshots: + + left-pad@1.3.0: {} +"; + + #[tokio::test] + async fn pnpm_v9_keys_parse_with_peer_suffix_and_scoped_quoting() { + 
let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "pnpm-lock.yaml", PNPM_LOCK).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::Pnpm); + + assert_eq!( + entry(&entries, "left-pad").integrity, + LockIntegrity::Sri("sha512-XI5MPz==".into()) + ); + assert_eq!(entry(&entries, "@scope/pkg").version, "2.0.0"); + assert_eq!(entry(&entries, "peer-user").version, "4.0.0"); + // registry entries carry no URL in v9 — constructed at fetch time. + assert_eq!(entry(&entries, "left-pad").resolved, None); + for absent in ["local-thing", "vendored"] { + assert!(!entries.iter().any(|e| e.name == absent), "{entries:?}"); + } + } + + // ── yarn classic ────────────────────────────────────────────────────── + + const YARN_CLASSIC: &str = "# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY. +# yarn lockfile v1 + + +\"@scope/pkg@^2.0.0\": + version \"2.0.0\" + resolved \"https://registry.yarnpkg.com/@scope/pkg/-/pkg-2.0.0.tgz#aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\" + integrity sha512-scoped== + +left-pad@1.3.0, left-pad@^1.3.0: + version \"1.3.0\" + resolved \"https://registry.yarnpkg.com/left-pad/-/left-pad-1.3.0.tgz#bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb\" + integrity sha512-XI5MPz== + +old-school@0.1.0: + version \"0.1.0\" + resolved \"https://registry.yarnpkg.com/old-school/-/old-school-0.1.0.tgz#cccccccccccccccccccccccccccccccccccccccc\" + +aliased@npm:real-name@^3.0.0: + version \"3.0.0\" + resolved \"https://registry.yarnpkg.com/real-name/-/real-name-3.0.0.tgz#dddddddddddddddddddddddddddddddddddddddd\" + integrity sha512-alias== +"; + + #[tokio::test] + async fn yarn_classic_blocks_yield_resolved_sha1_and_integrity() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "yarn.lock", YARN_CLASSIC).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::YarnClassic); + + let lp = entry(&entries, "left-pad"); + 
assert_eq!( + lp.resolved.as_deref(), + Some("https://registry.yarnpkg.com/left-pad/-/left-pad-1.3.0.tgz"), + "the #sha1 fragment is split off the URL" + ); + assert_eq!(lp.integrity, LockIntegrity::Sri("sha512-XI5MPz==".into())); + + // Integrity-less old locks fall back to the sha1 fragment. + assert_eq!( + entry(&entries, "old-school").integrity, + LockIntegrity::Sha1Hex("c".repeat(40)) + ); + + // `alias@npm:real@range` resolves to the real name. + assert!(entries.iter().any(|e| e.name == "real-name")); + assert_eq!(entry(&entries, "@scope/pkg").version, "2.0.0"); + } + + // ── yarn berry ──────────────────────────────────────────────────────── + + const YARN_BERRY: &str = "# This file is generated by running \"yarn install\" inside your project. +# Manifest files (package.json) are also used. + +__metadata: + version: 8 + cacheKey: 10c0 + +\"fixture@workspace:.\": + version: 0.0.0-use.local + resolution: \"fixture@workspace:.\" + languageName: unknown + linkType: soft + +\"left-pad@npm:1.3.0\": + version: 1.3.0 + resolution: \"left-pad@npm:1.3.0\" + checksum: 10c0/deadbeefcafe== + languageName: node + linkType: hard + +\"@scope/pkg@npm:^2.0.0\": + version: 2.0.0 + resolution: \"@scope/pkg@npm:2.0.0\" + checksum: 10c0/scopedchecksum== + languageName: node + linkType: hard +"; + + #[tokio::test] + async fn yarn_berry_registry_resolutions_inventory_with_checksums() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "yarn.lock", YARN_BERRY).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::YarnBerry); + + let lp = entry(&entries, "left-pad"); + assert_eq!(lp.version, "1.3.0"); + assert_eq!( + lp.integrity, + LockIntegrity::BerryChecksum("10c0/deadbeefcafe==".into()) + ); + assert_eq!(entry(&entries, "@scope/pkg").version, "2.0.0"); + // The workspace root is not a registry package. 
+ assert!(!entries.iter().any(|e| e.name == "fixture"), "{entries:?}"); + } + + // ── bun ─────────────────────────────────────────────────────────────── + + const BUN_LOCK: &str = r#"{ + "lockfileVersion": 1, + "workspaces": { + "": { "name": "fixture", "dependencies": { "left-pad": "1.3.0" } }, + }, + "packages": { + "left-pad": ["left-pad@1.3.0", "", {}, "sha512-XI5MPz=="], + "@scope/pkg": ["@scope/pkg@2.0.0", "", {}, "sha512-scoped=="], + "vendored": ["vendored@file:.socket/vendor/npm/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored-3.0.0.tgz", {}], + "linked": ["linked@workspace:packages/linked", {}], + } +} +"#; + + #[tokio::test] + async fn bun_registry_tuples_parse_and_locals_are_skipped() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "bun.lock", BUN_LOCK).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::Bun); + + assert_eq!( + entry(&entries, "left-pad").integrity, + LockIntegrity::Sri("sha512-XI5MPz==".into()) + ); + assert_eq!(entry(&entries, "left-pad").resolved, None); + assert_eq!(entry(&entries, "@scope/pkg").version, "2.0.0"); + for absent in ["vendored", "linked"] { + assert!(!entries.iter().any(|e| e.name == absent), "{entries:?}"); + } + } + + // ── shared semantics ────────────────────────────────────────────────── + + #[tokio::test] + async fn lookup_bridges_percent_encoded_purls() { + let entries = vec![ + LockfileEntry::npm("@scope/pkg", "2.0.0", None, LockIntegrity::None), + LockfileEntry::npm("left-pad", "1.3.0", None, LockIntegrity::None), + ]; + assert!(lookup(&entries, "pkg:npm/%40scope/pkg@2.0.0").is_some()); + assert!(lookup(&entries, "pkg:npm/@scope/pkg@2.0.0").is_some()); + assert!(lookup(&entries, "pkg:npm/left-pad@1.3.0?artifact_id=x").is_some()); + assert!(lookup(&entries, "pkg:npm/left-pad@9.9.9").is_none()); + assert!(lookup(&entries, "pkg:pypi/left-pad@1.3.0").is_none()); + } + + #[tokio::test] + async fn 
dedup_prefers_integrity_bearing_instance() { + let raw = vec![ + LockfileEntry::npm("dup", "1.0.0", None, LockIntegrity::None), + LockfileEntry::npm("dup", "1.0.0", None, LockIntegrity::Sri("sha512-x==".into())), + LockfileEntry::npm("dup", "1.0.0", None, LockIntegrity::None), + ]; + let out = finalize_npm(raw); + assert_eq!(out.len(), 1); + assert_eq!(out[0].integrity, LockIntegrity::Sri("sha512-x==".into())); + } + + #[cfg(feature = "cargo")] + #[tokio::test] + async fn cargo_lock_inventories_crates_io_entries() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "Cargo.lock", + r#"# This file is automatically @generated by Cargo. +version = 4 + +[[package]] +name = "fixture" +version = "0.1.0" + +[[package]] +name = "serde" +version = "1.0.200" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddc6f9cc94d67c0e21aaf7eda3a010fd3af78ebf6e096aa6e2e13c79749cce4f" + +[[package]] +name = "git-dep" +version = "0.5.0" +source = "git+https://github.com/x/git-dep?rev=abc#abc" + +[[package]] +name = "sparse-crate" +version = "2.0.0" +source = "sparse+https://index.crates.io/" +checksum = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" +"#, + ) + .await; + + let entries = inventory_cargo_lock(tmp.path()).await.unwrap(); + let serde_entry = entry(&entries, "serde"); + assert_eq!(serde_entry.version, "1.0.200"); + assert_eq!(serde_entry.purl, "pkg:cargo/serde@1.0.200"); + assert_eq!( + serde_entry.integrity, + LockIntegrity::Sha256Hex( + "ddc6f9cc94d67c0e21aaf7eda3a010fd3af78ebf6e096aa6e2e13c79749cce4f".into() + ) + ); + assert!(matches!( + entry(&entries, "sparse-crate").integrity, + LockIntegrity::Sha256Hex(_) + )); + // Workspace member (no source) excluded; git source unverifiable. 
+ assert!(!entries.iter().any(|e| e.name == "fixture")); + assert_eq!(entry(&entries, "git-dep").integrity, LockIntegrity::None); + } + + #[cfg(feature = "golang")] + #[tokio::test] + async fn go_sum_inventories_module_zip_lines() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "go.sum", + "github.com/gin-gonic/gin v1.9.1 h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=\n\ + github.com/gin-gonic/gin v1.9.1/go.mod h1:hPrL7YrpYKXt5YId3A/Tnip5kqbEAP+KLuI3SUcPTeU=\n\ + golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=\n", + ) + .await; + + let entries = inventory_go_sum(tmp.path()).await.unwrap(); + assert_eq!(entries.len(), 2, "the /go.mod line is skipped: {entries:?}"); + let gin = entry(&entries, "github.com/gin-gonic/gin"); + assert_eq!(gin.version, "v1.9.1"); + assert_eq!(gin.purl, "pkg:golang/github.com/gin-gonic/gin@v1.9.1"); + assert_eq!( + gin.integrity, + LockIntegrity::GoH1("h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=".into()) + ); + } + + #[tokio::test] + async fn lookup_matches_cargo_and_golang_purls() { + let entries = vec![ + LockfileEntry { + ecosystem: "cargo", + name: "serde".into(), + version: "1.0.200".into(), + purl: "pkg:cargo/serde@1.0.200".into(), + resolved: None, + integrity: LockIntegrity::None, + }, + LockfileEntry { + ecosystem: "golang", + name: "github.com/x/y".into(), + version: "v1.0.0".into(), + purl: "pkg:golang/github.com/x/y@v1.0.0".into(), + resolved: None, + integrity: LockIntegrity::None, + }, + ]; + assert!(lookup(&entries, "pkg:cargo/serde@1.0.200").is_some()); + assert!(lookup(&entries, "pkg:golang/github.com/x/y@v1.0.0").is_some()); + assert!(lookup(&entries, "pkg:cargo/serde@9.9.9").is_none()); + assert!( + lookup(&entries, "pkg:npm/serde@1.0.200").is_none(), + "ecosystem tags must match, not just name@version" + ); + } + + #[cfg(feature = "composer")] + #[tokio::test] + async fn composer_lock_inventories_dist_entries() { + let tmp = tempfile::tempdir().unwrap(); + write( + 
tmp.path(), + "composer.lock", + r#"{ + "packages": [ + { + "name": "Monolog/Monolog", + "version": "v3.5.0", + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/Seldaek/monolog/zipball/abc", + "shasum": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + } + }, + { + "name": "vendored/pkg", + "version": "1.0.0", + "dist": { "type": "path", "url": ".socket/vendor/composer/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored/pkg@1.0.0" } + } + ], + "packages-dev": [ + { + "name": "symfony/console", + "version": "v6.4.1", + "dist": { "type": "zip", "url": "https://example.com/console.zip", "shasum": "" } + } + ] +}"#, + ) + .await; + + let entries = inventory_composer_lock(tmp.path()).await.unwrap(); + let monolog = entry(&entries, "monolog/monolog"); + assert_eq!(monolog.version, "3.5.0", "leading v dropped, name lowercased"); + assert_eq!(monolog.purl, "pkg:composer/monolog/monolog@3.5.0"); + assert!(matches!(monolog.integrity, LockIntegrity::Sha1Hex(_))); + assert!(monolog.resolved.as_deref().unwrap().contains("zipball")); + // Empty shasum → discovery-only; path dist (ours) excluded. 
+ assert_eq!( + entry(&entries, "symfony/console").integrity, + LockIntegrity::None + ); + assert!(!entries.iter().any(|e| e.name == "vendored/pkg")); + } + + #[tokio::test] + async fn gemfile_lock_inventories_specs_and_checksums() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "Gemfile.lock", + "GEM\n remote: https://rubygems.org/\n specs:\n rails (7.1.0)\n \ + actionpack (= 7.1.0)\n rack (3.0.8)\n nokogiri (1.16.5-arm64-darwin)\n\n\ + PLATFORMS\n ruby\n\nDEPENDENCIES\n rails\n\nCHECKSUMS\n \ + rails (7.1.0) sha256=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n\n\ + BUNDLED WITH\n 2.6.0\n", + ) + .await; + + let entries = inventory_gemfile_lock(tmp.path()).await.unwrap(); + let rails = entry(&entries, "rails"); + assert_eq!(rails.version, "7.1.0"); + assert_eq!(rails.purl, "pkg:gem/rails@7.1.0"); + assert!(matches!(rails.integrity, LockIntegrity::Sha256Hex(_))); + assert_eq!( + rails.resolved.as_deref(), + Some("https://rubygems.org/downloads/rails-7.1.0.gem") + ); + // No CHECKSUMS entry → discovery-only; platform gem skipped; + // dependency range lines never parse as specs. 
+ assert_eq!(entry(&entries, "rack").integrity, LockIntegrity::None); + assert!(!entries.iter().any(|e| e.name == "nokogiri")); + assert!(!entries.iter().any(|e| e.name == "actionpack")); + } + + #[tokio::test] + async fn uv_lock_inventories_pure_wheels() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "uv.lock", + r#"version = 1 + +[[package]] +name = "Requests" +version = "2.28.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/requests-2.28.0-py3-none-any.whl", hash = "sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" }, +] + +[[package]] +name = "native-only" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/native_only-1.0.0-cp312-macosx.whl", hash = "sha256:bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" }, +] + +[[package]] +name = "local-proj" +version = "0.0.1" +source = { editable = "." } +"#, + ) + .await; + + let entries = inventory_pypi_locks(tmp.path()).await.unwrap(); + let requests = entry(&entries, "requests"); + assert_eq!(requests.purl, "pkg:pypi/requests@2.28.0", "PEP 503 name"); + assert!(matches!(requests.integrity, LockIntegrity::Sha256Hex(_))); + assert!(requests + .resolved + .as_deref() + .unwrap() + .ends_with("py3-none-any.whl")); + // Platform-only wheels → discovery-only; editable sources excluded. 
+ assert_eq!( + entry(&entries, "native-only").integrity, + LockIntegrity::None + ); + assert!(!entries.iter().any(|e| e.name == "local-proj")); + } + + #[tokio::test] + async fn poetry_and_requirements_are_discovery_only() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "poetry.lock", + "[[package]]\nname = \"Flask_Login\"\nversion = \"0.6.3\"\n\n[metadata]\nlock-version = \"2.0\"\n", + ) + .await; + let entries = inventory_pypi_locks(tmp.path()).await.unwrap(); + let fl = entry(&entries, "flask-login"); + assert_eq!(fl.purl, "pkg:pypi/flask-login@0.6.3"); + assert_eq!(fl.integrity, LockIntegrity::None); + + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "requirements.txt", + "# pinned\nrequests[security]==2.28.0 --hash=sha256:abc \\\n --hash=sha256:def\nflask>=2.0\n-e .\n", + ) + .await; + let entries = inventory_pypi_locks(tmp.path()).await.unwrap(); + assert_eq!(entries.len(), 1, "{entries:?}"); + assert_eq!(entries[0].purl, "pkg:pypi/requests@2.28.0"); + } + + #[tokio::test] + async fn unsupported_flavors_yield_none() { + // PnP marker wins over any lockfile. + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), ".pnp.cjs", "/* pnp */").await; + write(tmp.path(), "package-lock.json", PACKAGE_LOCK).await; + assert!(inventory_npm_lock(tmp.path()).await.is_none()); + + // pnpm v6. + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "pnpm-lock.yaml", "lockfileVersion: '6.0'\n").await; + assert!(inventory_npm_lock(tmp.path()).await.is_none()); + + // No lockfile at all. 
+ let tmp = tempfile::tempdir().unwrap(); + assert!(inventory_npm_lock(tmp.path()).await.is_none()); + } +} diff --git a/crates/socket-patch-core/src/patch/vendor/mod.rs b/crates/socket-patch-core/src/patch/vendor/mod.rs index 1aa70cc..05fddbf 100644 --- a/crates/socket-patch-core/src/patch/vendor/mod.rs +++ b/crates/socket-patch-core/src/patch/vendor/mod.rs @@ -53,6 +53,8 @@ pub mod cargo_lock; #[cfg(feature = "composer")] pub mod composer_lock; pub mod gem; +pub mod lock_inventory; +pub mod registry_fetch; #[cfg(feature = "golang")] pub mod golang; mod npm_common; diff --git a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs index 7b21bac..0174f6c 100644 --- a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs @@ -1581,7 +1581,7 @@ async fn commit_pair( // pnpm-lock.yaml is machine-emitted with a fixed 2/4/6/8-space shape; these // helpers splice line blocks and never interpret YAML generically. -fn split_lines(text: &str) -> Vec { +pub(super) fn split_lines(text: &str) -> Vec { text.split('\n').map(str::to_string).collect() } @@ -1592,7 +1592,7 @@ fn join_lines(lines: &[String]) -> String { /// `(header_idx, end_idx)` of a top-level `name:` section; `end` is the /// first following column-0 line (exclusive), so trailing blank separator /// lines belong to the section. -fn section_bounds(lines: &[String], name: &str) -> Option<(usize, usize)> { +pub(super) fn section_bounds(lines: &[String], name: &str) -> Option<(usize, usize)> { let header = format!("{name}:"); let start = lines.iter().position(|l| l == &header)?; let end = lines @@ -1608,10 +1608,10 @@ fn section_bounds(lines: &[String], name: &str) -> Option<(usize, usize)> { /// One 2-space-keyed block inside a section (`[header, end)`; `end` stops at /// the blank separator / next block header, so the captured fragment is the /// verbatim entry without surrounding blanks). 
-struct YamlBlock { - header: usize, - end: usize, - key: String, +pub(super) struct YamlBlock { + pub(super) header: usize, + pub(super) end: usize, + pub(super) key: String, /// The key exactly as spelled in the file (incl. quotes) — rekeys /// preserve the file's quoting style. repr: String, @@ -1631,7 +1631,7 @@ impl YamlBlock { } /// The next block at or after line `i` (within `[i, end)`). -fn next_block(lines: &[String], mut i: usize, end: usize) -> Option { +pub(super) fn next_block(lines: &[String], mut i: usize, end: usize) -> Option { while i < end { if let Some((key, repr, rest)) = parse_key_line(&lines[i], 2) { let mut j = i + 1; diff --git a/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs new file mode 100644 index 0000000..3cef46b --- /dev/null +++ b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs @@ -0,0 +1,1435 @@ +//! Pristine-artifact fetching for lockfile-resolved packages with no +//! installed copy. +//! +//! `vendor` needs an installed package dir to stage from; on a fresh clone +//! there is none. This module downloads the pristine artifact the lockfile +//! resolves (the lock-recorded URL when present, the conventional registry +//! URL otherwise), verifies it against the integrity the lock records +//! **FAIL-CLOSED and before anything is written to the staging dir**, and +//! extracts it into a private tempdir the vendor pipeline then treats as +//! the installed dir. The project tree — node_modules included — is never +//! touched. +//! +//! Trust model: the URL comes from the user's own committed lockfile (or a +//! conventional construction from it); content trust comes from the +//! lock-recorded hash, not the transport — which is also why an entry with +//! no verifier ([`LockIntegrity::None`]) is refused outright +//! ([`FetchError::Unverifiable`]) without any network I/O. 
+ +use std::path::{Path, PathBuf}; +use std::time::Duration; + +use base64::Engine as _; +use sha1::Sha1; +use sha2::{Digest, Sha256, Sha384, Sha512}; + +use crate::constants::USER_AGENT; +use crate::patch::apply::is_safe_relative_subpath; + +use super::lock_inventory::{LockIntegrity, LockfileEntry}; + +/// The default npm registry; override with `SOCKET_NPM_REGISTRY` (the +/// enterprise-mirror / test escape hatch — `.npmrc` parsing is out of +/// scope, but lock-recorded `resolved` URLs already carry custom hosts). +pub const DEFAULT_NPM_REGISTRY: &str = "https://registry.npmjs.org"; + +/// Whole-package caps — wider than `patch/package.rs`'s patch-archive caps +/// because these are full upstream packages, but still bounded so a +/// poisoned lockfile cannot turn the fetch into a disk/memory bomb. +const MAX_DOWNLOAD_BYTES: u64 = 128 * 1024 * 1024; +const MAX_TOTAL_DECOMPRESSED_BYTES: u64 = 512 * 1024 * 1024; +const MAX_ENTRY_BYTES: u64 = 128 * 1024 * 1024; +const MAX_ENTRIES: usize = 60_000; + +/// A fetched, verified, extracted package. The tempdir lives exactly as +/// long as this value — callers must hold it until the vendor pipeline has +/// finished staging from [`FetchedPackage::dir`]. +#[derive(Debug)] +pub struct FetchedPackage { + dir: PathBuf, + /// Where the bytes came from (surfaced in the fetch warning event). + pub url: String, + _tmp: tempfile::TempDir, +} + +impl FetchedPackage { + /// The extracted package root (`package.json` at the top for npm). + pub fn dir(&self) -> &Path { + &self.dir + } +} + +#[derive(Debug)] +pub enum FetchError { + /// The entry cannot be verified against the lockfile (no integrity + /// recorded, or no fetcher for its ecosystem) — decided BEFORE any + /// network I/O; the caller keeps its `package_not_installed` outcome. + Unverifiable(String), + /// The fetch was attempted and failed (HTTP error, size cap, integrity + /// mismatch, extraction failure). User-facing message. 
+ Failed(String), +} + +/// One shared client for all fetches in a run. +pub fn build_registry_client() -> reqwest::Client { + reqwest::Client::builder() + .user_agent(USER_AGENT) + .timeout(Duration::from_secs(60)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()) +} + +/// The npm registry base after the env override. +pub fn npm_registry_base() -> String { + std::env::var("SOCKET_NPM_REGISTRY") + .ok() + .map(|v| v.trim_end_matches('/').to_string()) + .filter(|v| !v.is_empty()) + .unwrap_or_else(|| DEFAULT_NPM_REGISTRY.to_string()) +} + +/// Conventional npm tarball URL: the scope stays in the package path, the +/// tarball leaf uses the bare name — +/// `{base}/@scope/name/-/name-1.0.0.tgz` / `{base}/name/-/name-1.0.0.tgz`. +pub fn npm_tarball_url(base: &str, name: &str, version: &str) -> String { + let leaf = name.rsplit('/').next().unwrap_or(name); + format!("{base}/{name}/-/{leaf}-{version}.tgz") +} + +/// Fetch + verify + extract one lockfile entry. Ecosystems without a +/// fetcher yet return [`FetchError::Unverifiable`] (callers keep their +/// not-installed outcome). +pub async fn fetch_and_stage( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + if entry.integrity == LockIntegrity::None { + return Err(FetchError::Unverifiable(format!( + "the lockfile records no integrity hash for {}@{}; refusing to fetch \ + unverifiable content", + entry.name, entry.version + ))); + } + match entry.ecosystem { + "npm" => fetch_npm(entry, client).await, + #[cfg(feature = "cargo")] + "cargo" => fetch_cargo(entry, client).await, + #[cfg(feature = "golang")] + "golang" => fetch_golang(entry, client).await, + #[cfg(feature = "composer")] + "composer" => fetch_composer(entry, client).await, + "gem" => fetch_gem(entry, client).await, + "pypi" => fetch_pypi(entry, client).await, + other => Err(FetchError::Unverifiable(format!( + "no registry fetcher for ecosystem `{other}`" + ))), + } +} + +/// Traversal-guarded zip extraction. 
`strip_first` mirrors the tar +/// behavior (composer dist zips carry a variable top dir; wheels carry +/// content at the root). +fn extract_zip(bytes: &[u8], dest: &Path, strip_first: bool) -> Result<(), String> { + let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes)) + .map_err(|e| format!("unreadable zip: {e}"))?; + if archive.len() > MAX_ENTRIES { + return Err(format!("zip exceeds {MAX_ENTRIES} entries")); + } + let mut total: u64 = 0; + for i in 0..archive.len() { + let mut file = archive + .by_index(i) + .map_err(|e| format!("unreadable zip entry: {e}"))?; + if file.is_dir() { + continue; + } + let raw = PathBuf::from(file.name()); + let rel = if strip_first { + match strip_first_component(&raw) { + Some(rel) => rel, + None => continue, + } + } else { + raw.clone() + }; + let rel_str = rel.to_string_lossy().into_owned(); + if !is_safe_relative_subpath(&rel_str) { + return Err(format!( + "zip entry `{}` escapes the extraction dir — refusing the artifact", + raw.display() + )); + } + if file.size() > MAX_ENTRY_BYTES { + return Err(format!( + "zip entry `{rel_str}` is {} bytes (cap {MAX_ENTRY_BYTES})", + file.size() + )); + } + total += file.size(); + if total > MAX_TOTAL_DECOMPRESSED_BYTES { + return Err(format!( + "zip decompresses past the {MAX_TOTAL_DECOMPRESSED_BYTES}-byte cap" + )); + } + let target = dest.join(&rel); + if let Some(parent) = target.parent() { + std::fs::create_dir_all(parent) + .map_err(|e| format!("cannot create {}: {e}", parent.display()))?; + } + let mut out = std::fs::File::create(&target) + .map_err(|e| format!("cannot create {}: {e}", target.display()))?; + std::io::copy(&mut file, &mut out) + .map_err(|e| format!("cannot extract `{rel_str}`: {e}"))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let exec = file.unix_mode().is_some_and(|m| m & 0o111 != 0); + let perms = if exec { 0o755 } else { 0o644 }; + let _ = std::fs::set_permissions(&target, std::fs::Permissions::from_mode(perms)); + } + } + 
Ok(()) +} + +/// Composer dist zips (packagist/GitHub zipballs): sha1-verified, variable +/// top dir stripped. The extracted dir plays the installed package dir. +#[cfg(feature = "composer")] +async fn fetch_composer( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let Some(url) = entry.resolved.clone() else { + return Err(FetchError::Unverifiable(format!( + "composer.lock records no dist URL for {}@{}", + entry.name, entry.version + ))); + }; + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + verify_integrity(&bytes, &entry.integrity)?; + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("package"); + extract_zip(&bytes, &dir, /*strip_first=*/ true).map_err(FetchError::Failed)?; + if tokio::fs::metadata(dir.join("composer.json")).await.is_err() { + return Err(FetchError::Failed(format!( + "fetched dist for {}@{} carries no composer.json", + entry.name, entry.version + ))); + } + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// `.gem` files are plain tar containers holding `data.tar.gz` (the +/// package content, no prefix dir) + metadata. The whole `.gem` is +/// sha256-verified against the Gemfile.lock CHECKSUMS entry first. +async fn fetch_gem( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let Some(url) = entry.resolved.clone() else { + return Err(FetchError::Unverifiable(format!( + "no download URL for {}@{}", + entry.name, entry.version + ))); + }; + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + verify_integrity(&bytes, &entry.integrity)?; + + // Locate data.tar.gz inside the (uncompressed) outer tar. + let mut archive = tar::Archive::new(bytes.as_slice()); + let mut data: Option> = None; + for e in archive + .entries() + .map_err(|e| FetchError::Failed(format!("unreadable .gem: {e}")))? 
+ { + use std::io::Read as _; + let mut e = e.map_err(|err| FetchError::Failed(format!("unreadable .gem entry: {err}")))?; + let is_data = e + .path() + .ok() + .is_some_and(|p| p.as_os_str() == "data.tar.gz"); + if !is_data { + continue; + } + if e.header().size().unwrap_or(u64::MAX) > MAX_DOWNLOAD_BYTES { + return Err(FetchError::Failed("data.tar.gz exceeds the size cap".into())); + } + let mut buf = Vec::new(); + e.read_to_end(&mut buf) + .map_err(|err| FetchError::Failed(format!("cannot read data.tar.gz: {err}")))?; + data = Some(buf); + break; + } + let Some(data) = data else { + return Err(FetchError::Failed(format!( + "fetched .gem for {}@{} carries no data.tar.gz", + entry.name, entry.version + ))); + }; + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("gem"); + extract_tgz_no_strip(&data, &dir).map_err(FetchError::Failed)?; + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// Pure-python wheels recorded by uv.lock (URL + sha256): the unzipped +/// wheel IS a site-packages layout (package dirs + `.dist-info/RECORD` at +/// the root), which is exactly the shape the pypi vendor backend stages +/// from. 
+async fn fetch_pypi( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let Some(url) = entry.resolved.clone() else { + return Err(FetchError::Unverifiable(format!( + "the lockfile records no platform-independent wheel URL for {}@{} (only uv.lock carries fetchable wheel resolutions today)", + entry.name, entry.version + ))); + }; + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + verify_integrity(&bytes, &entry.integrity)?; + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("site-packages"); + extract_zip(&bytes, &dir, /*strip_first=*/ false).map_err(FetchError::Failed)?; + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// crates.io static download host; override with `SOCKET_CRATES_REGISTRY`. +#[cfg(feature = "cargo")] +pub const DEFAULT_CRATES_REGISTRY: &str = "https://static.crates.io/crates"; + +#[cfg(feature = "cargo")] +fn crates_registry_base() -> String { + std::env::var("SOCKET_CRATES_REGISTRY") + .ok() + .map(|v| v.trim_end_matches('/').to_string()) + .filter(|v| !v.is_empty()) + .unwrap_or_else(|| DEFAULT_CRATES_REGISTRY.to_string()) +} + +/// `.crate` files are tar.gz with a `{name}-{version}/` top dir — the same +/// extraction path as npm tarballs. The Cargo.lock `checksum` is the sha256 +/// of the `.crate` bytes. 
+#[cfg(feature = "cargo")] +async fn fetch_cargo( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let url = entry.resolved.clone().unwrap_or_else(|| { + format!( + "{}/{}/{}-{}.crate", + crates_registry_base(), + entry.name, + entry.name, + entry.version + ) + }); + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + verify_integrity(&bytes, &entry.integrity)?; + + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("crate"); + extract_tgz(&bytes, &dir).map_err(FetchError::Failed)?; + if tokio::fs::metadata(dir.join("Cargo.toml")).await.is_err() { + return Err(FetchError::Failed(format!( + "fetched .crate for {}@{} carries no Cargo.toml — not a crate", + entry.name, entry.version + ))); + } + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// Default Go module proxy; `SOCKET_GOPROXY` wins, else the standard +/// `GOPROXY` env (first element that isn't `direct`/`off`). +#[cfg(feature = "golang")] +pub const DEFAULT_GOPROXY: &str = "https://proxy.golang.org"; + +#[cfg(feature = "golang")] +fn goproxy_base() -> String { + if let Ok(v) = std::env::var("SOCKET_GOPROXY") { + let v = v.trim_end_matches('/').to_string(); + if !v.is_empty() { + return v; + } + } + if let Ok(v) = std::env::var("GOPROXY") { + for part in v.split(',') { + let part = part.trim().trim_end_matches('/'); + if !part.is_empty() && part != "direct" && part != "off" { + return part.to_string(); + } + } + } + DEFAULT_GOPROXY.to_string() +} + +/// Go's module-path case encoding for proxy URLs: an uppercase letter `X` +/// becomes `!x` (applies to the module path and the version). 
+#[cfg(feature = "golang")] +fn go_escape(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + for c in s.chars() { + if c.is_ascii_uppercase() { + out.push('!'); + out.push(c.to_ascii_lowercase()); + } else { + out.push(c); + } + } + out +} + +/// go.sum's `h1:` dirhash over a module zip: sha256 of the sorted +/// `"{sha256hex(content)} {entry name}\n"` lines, base64-encoded +/// (golang.org/x/mod/sumdb/dirhash Hash1/HashZip). Computed in memory +/// BEFORE extraction. +#[cfg(feature = "golang")] +fn go_h1_of_zip(bytes: &[u8]) -> Result { + use std::io::Read as _; + let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes)) + .map_err(|e| format!("unreadable module zip: {e}"))?; + if archive.len() > MAX_ENTRIES { + return Err(format!("module zip exceeds {MAX_ENTRIES} entries")); + } + let mut files: Vec<(String, String)> = Vec::new(); + let mut total: u64 = 0; + for i in 0..archive.len() { + let mut file = archive + .by_index(i) + .map_err(|e| format!("unreadable module zip entry: {e}"))?; + if file.is_dir() { + continue; // go module zips carry files only + } + let name = file.name().to_string(); + if name.contains('\n') { + return Err("module zip entry name contains a newline".to_string()); + } + if file.size() > MAX_ENTRY_BYTES { + return Err(format!( + "module zip entry `{name}` is {} bytes (cap {MAX_ENTRY_BYTES})", + file.size() + )); + } + total += file.size(); + if total > MAX_TOTAL_DECOMPRESSED_BYTES { + return Err(format!( + "module zip decompresses past the {MAX_TOTAL_DECOMPRESSED_BYTES}-byte cap" + )); + } + let mut hasher = Sha256::new(); + let mut buf = [0u8; 64 * 1024]; + loop { + let n = file + .read(&mut buf) + .map_err(|e| format!("cannot read module zip entry `{name}`: {e}"))?; + if n == 0 { + break; + } + hasher.update(&buf[..n]); + } + files.push((name, hex::encode(hasher.finalize()))); + } + files.sort_by(|a, b| a.0.cmp(&b.0)); + let mut h = Sha256::new(); + for (name, content_hex) in &files { + 
h.update(format!("{content_hex} {name}\n").as_bytes()); + } + Ok(format!( + "h1:{}", + base64::engine::general_purpose::STANDARD.encode(h.finalize()) + )) +} + +/// Traversal-guarded zip extraction with an EXPLICIT required prefix +/// (`@/` — go module paths contain slashes, so a +/// first-component strip would be wrong). Same guard family as +/// [`extract_tgz`]; an entry outside the prefix fails the whole artifact. +#[cfg(feature = "golang")] +fn extract_zip_with_prefix(bytes: &[u8], dest: &Path, prefix: &str) -> Result<(), String> { + let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes)) + .map_err(|e| format!("unreadable module zip: {e}"))?; + for i in 0..archive.len() { + let mut file = archive + .by_index(i) + .map_err(|e| format!("unreadable module zip entry: {e}"))?; + if file.is_dir() { + continue; + } + let name = file.name().to_string(); + let Some(rel) = name.strip_prefix(prefix) else { + return Err(format!( + "module zip entry `{name}` lies outside `{prefix}` — refusing the artifact" + )); + }; + if !is_safe_relative_subpath(rel) { + return Err(format!( + "module zip entry `{name}` escapes the extraction dir — refusing the artifact" + )); + } + let target = dest.join(rel); + if let Some(parent) = target.parent() { + std::fs::create_dir_all(parent) + .map_err(|e| format!("cannot create {}: {e}", parent.display()))?; + } + let mut out = std::fs::File::create(&target) + .map_err(|e| format!("cannot create {}: {e}", target.display()))?; + std::io::copy(&mut file, &mut out).map_err(|e| format!("cannot extract `{rel}`: {e}"))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let exec = file.unix_mode().is_some_and(|m| m & 0o111 != 0); + let perms = if exec { 0o755 } else { 0o644 }; + let _ = std::fs::set_permissions(&target, std::fs::Permissions::from_mode(perms)); + } + } + Ok(()) +} + +#[cfg(feature = "golang")] +async fn fetch_golang( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let 
LockIntegrity::GoH1(expected) = &entry.integrity else { + return Err(FetchError::Unverifiable( + "go module entries verify via the go.sum h1 dirhash only".to_string(), + )); + }; + let url = entry.resolved.clone().unwrap_or_else(|| { + format!( + "{}/{}/@v/{}.zip", + goproxy_base(), + go_escape(&entry.name), + go_escape(&entry.version) + ) + }); + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + let actual = go_h1_of_zip(&bytes).map_err(FetchError::Failed)?; + if &actual != expected { + return Err(FetchError::Failed(format!( + "go.sum dirhash mismatch: lockfile records {expected}, the fetched module zip \ + hashes to {actual}" + ))); + } + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("module"); + let prefix = format!("{}@{}/", entry.name, entry.version); + extract_zip_with_prefix(&bytes, &dir, &prefix).map_err(FetchError::Failed)?; + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +async fn fetch_npm( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let url = entry.resolved.clone().unwrap_or_else(|| { + npm_tarball_url(&npm_registry_base(), &entry.name, &entry.version) + }); + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + match &entry.integrity { + // yarn berry locks never hash the tarball itself — the checksum is + // sha512 of the deterministic cache zip. Rebuild it from the fetched + // bytes (the same spike-pinned recipe the berry wiring uses) and + // compare. Only cacheKey 10c0 (yarn 4 default) is reproducible. 
+ LockIntegrity::BerryChecksum(expected) => { + if !expected.starts_with("10c0/") { + return Err(FetchError::Unverifiable(format!( + "yarn berry checksum `{expected}` uses a cacheKey other than 10c0; the \ + cache-zip recipe is not reproducible for it" + ))); + } + let actual = super::berry_zip::berry_cache_checksum_10c0(&bytes, &entry.name) + .map_err(FetchError::Failed)?; + if &actual != expected { + return Err(FetchError::Failed(format!( + "yarn berry cache checksum mismatch: lockfile records {expected}, the \ + fetched tarball rebuilds to {actual}" + ))); + } + } + other => verify_integrity(&bytes, other)?, + } + + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("package"); + extract_tgz(&bytes, &dir).map_err(FetchError::Failed)?; + if tokio::fs::metadata(dir.join("package.json")).await.is_err() { + return Err(FetchError::Failed(format!( + "fetched tarball for {}@{} carries no package.json — not an npm package", + entry.name, entry.version + ))); + } + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// Stage a package from an on-disk vendored tarball (the fresh-clone +/// re-vendor path: the project has our committed artifact but no installed +/// copy). The bytes are verified against the LEDGER-recorded sha256 before +/// extraction — same fail-closed posture as the registry path; an entry +/// with no recorded hash is refused. 
+pub async fn stage_local_artifact( + tgz_path: &Path, + expected_sha256_hex: &str, +) -> Result { + if expected_sha256_hex.is_empty() { + return Err(FetchError::Unverifiable( + "the vendor ledger records no sha256 for the artifact".to_string(), + )); + } + let bytes = tokio::fs::read(tgz_path) + .await + .map_err(|e| FetchError::Failed(format!("cannot read {}: {e}", tgz_path.display())))?; + if bytes.len() as u64 > MAX_DOWNLOAD_BYTES { + return Err(FetchError::Failed(format!( + "{}: artifact exceeds the {MAX_DOWNLOAD_BYTES}-byte cap", + tgz_path.display() + ))); + } + let actual = hex::encode(Sha256::digest(&bytes)); + if !actual.eq_ignore_ascii_case(expected_sha256_hex) { + return Err(FetchError::Failed(format!( + "{}: sha256 mismatch against the vendor ledger (recorded {expected_sha256_hex}, \ + on-disk bytes hash to {actual})", + tgz_path.display() + ))); + } + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create staging tempdir: {e}")))?; + let dir = tmp.path().join("package"); + extract_tgz(&bytes, &dir).map_err(FetchError::Failed)?; + Ok(FetchedPackage { + dir, + url: format!("file:{}", tgz_path.display()), + _tmp: tmp, + }) +} + +/// Capped download. http(s) only; the cap is enforced on the declared +/// Content-Length AND the actual stream (a lying server cannot blow past +/// it). 
+async fn download(client: &reqwest::Client, url: &str) -> Result, String> { + if !(url.starts_with("https://") || url.starts_with("http://")) { + return Err(format!("refusing non-http(s) artifact URL `{url}`")); + } + let mut resp = client + .get(url) + .send() + .await + .map_err(|e| format!("GET {url}: {e}"))?; + let status = resp.status(); + if !status.is_success() { + return Err(format!("GET {url}: HTTP {status}")); + } + if let Some(len) = resp.content_length() { + if len > MAX_DOWNLOAD_BYTES { + return Err(format!( + "{url}: artifact is {len} bytes (cap {MAX_DOWNLOAD_BYTES})" + )); + } + } + let mut bytes: Vec = Vec::new(); + while let Some(chunk) = resp + .chunk() + .await + .map_err(|e| format!("reading {url}: {e}"))? + { + if bytes.len() as u64 + chunk.len() as u64 > MAX_DOWNLOAD_BYTES { + return Err(format!( + "{url}: artifact exceeds the {MAX_DOWNLOAD_BYTES}-byte cap" + )); + } + bytes.extend_from_slice(&chunk); + } + Ok(bytes) +} + +/// Verify downloaded bytes against the lock-recorded verifier. Runs BEFORE +/// any disk write. Berry cache-zip checksums and go.sum dirhashes have +/// dedicated verifiers in their ecosystems' fetchers. 
+fn verify_integrity(bytes: &[u8], integrity: &LockIntegrity) -> Result<(), FetchError> { + match integrity { + LockIntegrity::Sri(sri) => verify_sri(bytes, sri).map_err(FetchError::Failed), + LockIntegrity::Sha1Hex(expect) => { + let actual = hex::encode(Sha1::digest(bytes)); + if &actual == expect { + Ok(()) + } else { + Err(FetchError::Failed(format!( + "sha1 mismatch: lockfile records {expect}, downloaded bytes hash to {actual}" + ))) + } + } + LockIntegrity::Sha256Hex(expect) => { + let actual = hex::encode(Sha256::digest(bytes)); + if actual.eq_ignore_ascii_case(expect) { + Ok(()) + } else { + Err(FetchError::Failed(format!( + "sha256 mismatch: lockfile records {expect}, downloaded bytes hash to {actual}" + ))) + } + } + LockIntegrity::BerryChecksum(_) | LockIntegrity::GoH1(_) => { + Err(FetchError::Unverifiable( + "verifier handled by a dedicated ecosystem fetcher".to_string(), + )) + } + LockIntegrity::None => Err(FetchError::Unverifiable( + "no integrity recorded".to_string(), + )), + } +} + +/// SRI verification: pick the strongest hash of a (possibly multi-hash, +/// whitespace-separated) SRI string and compare base64 digests. 
+fn verify_sri(bytes: &[u8], sri: &str) -> Result<(), String> { + let mut best: Option<(u8, &str, &str)> = None; + for token in sri.split_whitespace() { + let Some((algo, b64)) = token.split_once('-') else { + continue; + }; + let rank = match algo { + "sha512" => 3, + "sha384" => 2, + "sha256" => 1, + _ => continue, + }; + if best.map(|(r, _, _)| rank > r).unwrap_or(true) { + best = Some((rank, algo, b64)); + } + } + let Some((_, algo, expect)) = best else { + return Err(format!("no usable hash in SRI `{sri}`")); + }; + let b64 = base64::engine::general_purpose::STANDARD; + let actual = match algo { + "sha512" => b64.encode(Sha512::digest(bytes)), + "sha384" => b64.encode(Sha384::digest(bytes)), + _ => b64.encode(Sha256::digest(bytes)), + }; + if actual == expect { + Ok(()) + } else { + Err(format!( + "{algo} integrity mismatch: lockfile records {expect}, downloaded bytes hash to \ + {actual}" + )) + } +} + +/// Strip the FIRST path component (npm's tarball semantics — usually +/// `package/`, but registry tarballs may use any prefix dir). +fn strip_first_component(path: &Path) -> Option { + let mut components = path.components(); + components.next()?; + let rest = components.as_path(); + (!rest.as_os_str().is_empty()).then(|| rest.to_path_buf()) +} + +/// Traversal-guarded, mode-preserving tgz extraction (the same guard +/// family as `patch/package.rs::read_archive_to_map`, plus exec-bit +/// preservation: the deterministic re-pack reads modes from disk, so a +/// bytes-only extraction would silently strip bin scripts' exec bits). +/// Fails CLOSED on any traversal-shaped entry — a malicious tarball must +/// not half-extract. +fn extract_tgz(bytes: &[u8], dest: &Path) -> Result<(), String> { + extract_tar_gz(bytes, dest, /*strip_first=*/ true) +} + +/// Like [`extract_tgz`] but keeps entry paths verbatim (gem `data.tar.gz` +/// archives carry package content at the root, no prefix dir). 
#[allow(dead_code)] // used by the gem fetcher (feature-independent helper)
fn extract_tgz_no_strip(bytes: &[u8], dest: &Path) -> Result<(), String> {
    extract_tar_gz(bytes, dest, /*strip_first=*/ false)
}

/// Shared gzip-tar extractor behind [`extract_tgz`] and
/// [`extract_tgz_no_strip`].
///
/// Only regular files are extracted. Every entry path must be a safe
/// relative subpath (after the optional first-component strip) or the whole
/// artifact is refused. Enforces the entry-count, per-entry and total
/// decompressed caps, and on unix normalizes modes to 0755/0644 from the
/// header's exec bits.
fn extract_tar_gz(bytes: &[u8], dest: &Path, strip_first: bool) -> Result<(), String> {
    use std::io::Read as _;
    // Hard backstop on the decompressed stream (headers included).
    let gz = flate2::read::GzDecoder::new(bytes).take(MAX_TOTAL_DECOMPRESSED_BYTES);
    let mut archive = tar::Archive::new(gz);
    let mut count = 0usize;
    let mut total: u64 = 0;
    for entry in archive
        .entries()
        .map_err(|e| format!("unreadable tarball: {e}"))?
    {
        let mut entry = entry.map_err(|e| format!("unreadable tarball entry: {e}"))?;
        count += 1;
        if count > MAX_ENTRIES {
            return Err(format!("tarball exceeds {MAX_ENTRIES} entries"));
        }
        // Regular files only: symlinks/hardlinks/devices never extract
        // (a symlink could redirect later entries out of the stage).
        if !entry.header().entry_type().is_file() {
            continue;
        }
        let raw = entry
            .path()
            .map_err(|e| format!("tarball entry has an undecodable path: {e}"))?
            .into_owned();
        let rel = if strip_first {
            match strip_first_component(&raw) {
                Some(rel) => rel,
                None => continue, // a bare prefix-level file — not package content
            }
        } else {
            raw.clone()
        };
        let rel_str = rel.to_string_lossy();
        if !is_safe_relative_subpath(&rel_str) {
            return Err(format!(
                "tarball entry `{}` escapes the extraction dir — refusing the artifact",
                raw.display()
            ));
        }
        // An unreadable size header counts as oversized (fail closed).
        let size = entry.header().size().unwrap_or(u64::MAX);
        if size > MAX_ENTRY_BYTES {
            return Err(format!(
                "tarball entry `{rel_str}` is {size} bytes (cap {MAX_ENTRY_BYTES})"
            ));
        }
        // Explicit total accounting (mirrors the zip extractor) so an
        // over-cap archive is refused with a clear message instead of
        // surfacing as a truncated-stream read error part-way through.
        total = total.saturating_add(size);
        if total > MAX_TOTAL_DECOMPRESSED_BYTES {
            return Err(format!(
                "tarball decompresses past the {MAX_TOTAL_DECOMPRESSED_BYTES}-byte cap"
            ));
        }
        let target = dest.join(&rel);
        if let Some(parent) = target.parent() {
            std::fs::create_dir_all(parent)
                .map_err(|e| format!("cannot create {}: {e}", parent.display()))?;
        }
        let mut out = std::fs::File::create(&target)
            .map_err(|e| format!("cannot create {}: {e}", target.display()))?;
        std::io::copy(&mut entry, &mut out)
            .map_err(|e| format!("cannot extract `{rel_str}`: {e}"))?;
        #[cfg(unix)]
        {
            use std::os::unix::fs::PermissionsExt;
            let mode = entry.header().mode().unwrap_or(0o644);
            let perms = if mode & 0o111 != 0 { 0o755 } else { 0o644 };
            let _ = std::fs::set_permissions(&target, std::fs::Permissions::from_mode(perms));
        }
    }
    // If the `take` limit was exhausted the stream was cut at the cap; a cut
    // landing on a header boundary reads as a clean end-of-archive, so treat
    // a fully-consumed budget as over-cap rather than accept a possibly
    // truncated extraction.
    if archive.into_inner().limit() == 0 {
        return Err(format!(
            "tarball decompresses past the {MAX_TOTAL_DECOMPRESSED_BYTES}-byte cap"
        ));
    }
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;
    use wiremock::matchers::{method, path as url_path};
    use wiremock::{Mock, MockServer, ResponseTemplate};

    /// Build a gzipped tarball with the given `(path, bytes, exec)` entries.
+ fn make_tgz(entries: &[(&str, &[u8], bool)]) -> Vec { + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + for (path, bytes, exec) in entries { + let mut header = tar::Header::new_gnu(); + header.set_size(bytes.len() as u64); + header.set_mode(if *exec { 0o755 } else { 0o644 }); + header.set_cksum(); + builder.append_data(&mut header, path, *bytes).unwrap(); + } + builder.into_inner().unwrap().finish().unwrap() + } + + fn sri_of(bytes: &[u8]) -> String { + format!( + "sha512-{}", + base64::engine::general_purpose::STANDARD.encode(Sha512::digest(bytes)) + ) + } + + fn npm_entry(resolved: Option, integrity: LockIntegrity) -> LockfileEntry { + LockfileEntry { + ecosystem: "npm", + name: "left-pad".into(), + version: "1.3.0".into(), + purl: "pkg:npm/left-pad@1.3.0".into(), + resolved, + integrity, + } + } + + #[test] + fn tarball_url_forms() { + assert_eq!( + npm_tarball_url(DEFAULT_NPM_REGISTRY, "left-pad", "1.3.0"), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz" + ); + assert_eq!( + npm_tarball_url(DEFAULT_NPM_REGISTRY, "@scope/pkg", "2.0.0"), + "https://registry.npmjs.org/@scope/pkg/-/pkg-2.0.0.tgz", + "the scope stays in the path; the leaf uses the bare name" + ); + } + + #[test] + fn sri_picks_strongest_hash_and_compares() { + let bytes = b"hello"; + let good = sri_of(bytes); + assert!(verify_sri(bytes, &good).is_ok()); + // Multi-hash: a wrong sha256 alongside the right sha512 still passes + // (strongest wins), and vice versa fails. 
+ let multi = format!("sha256-WRONG= {good}"); + assert!(verify_sri(bytes, &multi).is_ok()); + let bad = sri_of(b"other"); + assert!(verify_sri(bytes, &bad).is_err()); + assert!(verify_sri(bytes, "md5-abc=").is_err(), "unknown algos refuse"); + } + + #[tokio::test] + async fn fetch_verifies_sri_and_extracts_with_modes() { + let tgz = make_tgz(&[ + ("package/package.json", br#"{"name":"left-pad"}"#, false), + ("package/bin/cli.js", b"#!/usr/bin/env node\n", true), + ("package/index.js", b"module.exports = 1;\n", false), + ]); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz.clone())) + .mount(&mock) + .await; + + let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::Sri(sri_of(&tgz)), + ); + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!(fetched.dir().join("package.json").is_file()); + assert_eq!( + std::fs::read(fetched.dir().join("index.js")).unwrap(), + b"module.exports = 1;\n" + ); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mode = std::fs::metadata(fetched.dir().join("bin/cli.js")) + .unwrap() + .permissions() + .mode(); + assert_eq!(mode & 0o111, 0o111, "exec bit preserved"); + } + // The tempdir dies with the holder. 
+ let dir = fetched.dir().to_path_buf(); + drop(fetched); + assert!(!dir.exists()); + } + + #[tokio::test] + async fn integrity_mismatch_fails_before_extraction() { + let tgz = make_tgz(&[("package/package.json", b"{}", false)]); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz)) + .mount(&mock) + .await; + + let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::Sri(sri_of(b"the lock expects different bytes")), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => { + assert!(msg.contains("mismatch"), "{msg}") + } + other => panic!("expected integrity failure, got {other:?}"), + } + } + + #[tokio::test] + async fn unverifiable_entry_refuses_without_network() { + // A URL that would hard-fail if contacted — Unverifiable proves the + // decision happened before any I/O. + let entry = npm_entry( + Some("http://127.0.0.1:1/nope.tgz".into()), + LockIntegrity::None, + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Unverifiable(msg)) => { + assert!(msg.contains("no integrity"), "{msg}") + } + other => panic!("expected Unverifiable, got {other:?}"), + } + } + + #[tokio::test] + async fn http_error_and_scheme_guard_fail_closed() { + let mock = MockServer::start().await; + // No mounted route → 404. 
+ let entry = npm_entry( + Some(format!("{}/missing.tgz", mock.uri())), + LockIntegrity::Sri(sri_of(b"x")), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("404"), "{msg}"), + other => panic!("expected HTTP failure, got {other:?}"), + } + + let entry = npm_entry( + Some("ftp://example.com/x.tgz".into()), + LockIntegrity::Sri(sri_of(b"x")), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("non-http"), "{msg}"), + other => panic!("expected scheme refusal, got {other:?}"), + } + } + + #[test] + fn extraction_strips_first_component_whatever_its_name() { + let tgz = make_tgz(&[("weird-prefix/package.json", b"{}", false)]); + let tmp = tempfile::tempdir().unwrap(); + extract_tgz(&tgz, tmp.path()).unwrap(); + assert!(tmp.path().join("package.json").is_file()); + } + + #[test] + fn traversal_entries_fail_closed() { + // The tar crate refuses to WRITE `..` paths, so craft the header + // name bytes directly — exactly what a hostile tarball would carry. 
+ for evil in ["package/../../escape.js", "package/x/../../../up.js"] { + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + let mut header = tar::Header::new_gnu(); + { + let name = &mut header.as_gnu_mut().unwrap().name; + name[..evil.len()].copy_from_slice(evil.as_bytes()); + } + header.set_size(4); + header.set_mode(0o644); + header.set_cksum(); + builder.append(&header, &b"evil"[..]).unwrap(); + let tgz = builder.into_inner().unwrap().finish().unwrap(); + + let tmp = tempfile::tempdir().unwrap(); + let err = extract_tgz(&tgz, tmp.path()).unwrap_err(); + assert!(err.contains("escapes"), "{evil}: {err}"); + assert!( + std::fs::read_dir(tmp.path()).unwrap().next().is_none(), + "nothing may extract from a traversal-bearing tarball" + ); + } + } + + #[tokio::test] + async fn berry_checksum_verifies_via_cache_zip_rebuild() { + let tgz = make_tgz(&[ + ("package/package.json", br#"{"name":"left-pad"}"#, false), + ("package/index.js", b"module.exports = 1;\n", false), + ]); + let expected = + super::super::berry_zip::berry_cache_checksum_10c0(&tgz, "left-pad").unwrap(); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz)) + .mount(&mock) + .await; + + let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::BerryChecksum(expected), + ); + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!(fetched.dir().join("package.json").is_file()); + + // Tampered checksum → Failed; foreign cacheKey → Unverifiable. 
+ let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::BerryChecksum(format!("10c0/{}", "0".repeat(128))), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected mismatch, got {other:?}"), + } + let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::BerryChecksum(format!("9/{}", "0".repeat(128))), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Unverifiable(msg)) => assert!(msg.contains("cacheKey"), "{msg}"), + other => panic!("expected Unverifiable, got {other:?}"), + } + } + + #[tokio::test] + async fn stage_local_artifact_verifies_ledger_sha256() { + let tgz = make_tgz(&[("package/package.json", b"{}", false)]); + let tmp = tempfile::tempdir().unwrap(); + let tgz_path = tmp.path().join("left-pad-1.3.0.tgz"); + std::fs::write(&tgz_path, &tgz).unwrap(); + let sha = hex::encode(Sha256::digest(&tgz)); + + let staged = stage_local_artifact(&tgz_path, &sha).await.unwrap(); + assert!(staged.dir().join("package.json").is_file()); + + match stage_local_artifact(&tgz_path, &"0".repeat(64)).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected ledger mismatch, got {other:?}"), + } + match stage_local_artifact(&tgz_path, "").await { + Err(FetchError::Unverifiable(_)) => {} + other => panic!("expected Unverifiable for empty hash, got {other:?}"), + } + } + + #[cfg(feature = "cargo")] + #[tokio::test] + async fn cargo_crate_fetch_verifies_sha256_and_extracts() { + // .crate = tar.gz with a {name}-{version}/ top dir. 
+ let crate_bytes = make_tgz(&[ + ("left-pad-1.3.0/Cargo.toml", b"[package]\nname = \"left-pad\"\n", false), + ("left-pad-1.3.0/src/lib.rs", b"pub fn pad() {}\n", false), + ]); + let sha = hex::encode(Sha256::digest(&crate_bytes)); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/left-pad/left-pad-1.3.0.crate")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(crate_bytes)) + .mount(&mock) + .await; + + let entry = LockfileEntry { + ecosystem: "cargo", + name: "left-pad".into(), + version: "1.3.0".into(), + purl: "pkg:cargo/left-pad@1.3.0".into(), + resolved: Some(format!("{}/left-pad/left-pad-1.3.0.crate", mock.uri())), + integrity: LockIntegrity::Sha256Hex(sha), + }; + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!(fetched.dir().join("Cargo.toml").is_file()); + assert!(fetched.dir().join("src/lib.rs").is_file()); + + // Tampered checksum fails closed. + let entry = LockfileEntry { + integrity: LockIntegrity::Sha256Hex("0".repeat(64)), + ..entry + }; + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected mismatch, got {other:?}"), + } + } + + /// Build a go module zip in memory (files only, `module@version/` + /// prefix — the go zip layout). + #[cfg(feature = "golang")] + fn make_module_zip(prefix: &str, files: &[(&str, &[u8])]) -> Vec { + use std::io::Write as _; + let mut writer = zip::ZipWriter::new(std::io::Cursor::new(Vec::new())); + for (name, bytes) in files { + writer + .start_file( + format!("{prefix}{name}"), + zip::write::SimpleFileOptions::default() + .compression_method(zip::CompressionMethod::Deflated), + ) + .unwrap(); + writer.write_all(bytes).unwrap(); + } + writer.finish().unwrap().into_inner() + } + + /// Independent spec-mirror of dirhash Hash1/HashZip, structured + /// differently from the production fn to catch encoding slips. 
+ #[cfg(feature = "golang")] + fn spec_h1(files: &[(&str, &[u8])], prefix: &str) -> String { + // dirhash.Hash1 sorts the FILE NAMES, then emits one line per file. + let mut named: Vec<(String, &[u8])> = files + .iter() + .map(|(name, bytes)| (format!("{prefix}{name}"), *bytes)) + .collect(); + named.sort_by(|a, b| a.0.cmp(&b.0)); + let lines: Vec = named + .iter() + .map(|(name, bytes)| format!("{} {name}\n", hex::encode(Sha256::digest(bytes)))) + .collect(); + let digest = Sha256::digest(lines.concat().as_bytes()); + format!( + "h1:{}", + base64::engine::general_purpose::STANDARD.encode(digest) + ) + } + + #[cfg(feature = "golang")] + #[tokio::test] + async fn golang_module_fetch_verifies_h1_dirhash_and_extracts() { + // Out-of-order files prove the sort; nested module path proves the + // explicit-prefix strip (a first-component strip would be wrong). + let prefix = "github.com/x/y@v1.0.0/"; + let files: [(&str, &[u8]); 3] = [ + ("go.mod", b"module github.com/x/y\n"), + ("a/b.go", b"package a\n"), + ("README.md", b"# y\n"), + ]; + let zip_bytes = make_module_zip(prefix, &files); + let expected = spec_h1(&files, prefix); + assert_eq!( + go_h1_of_zip(&zip_bytes).unwrap(), + expected, + "production dirhash matches the spec mirror" + ); + + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/github.com/x/y/@v/v1.0.0.zip")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(zip_bytes)) + .mount(&mock) + .await; + + let entry = LockfileEntry { + ecosystem: "golang", + name: "github.com/x/y".into(), + version: "v1.0.0".into(), + purl: "pkg:golang/github.com/x/y@v1.0.0".into(), + resolved: Some(format!("{}/github.com/x/y/@v/v1.0.0.zip", mock.uri())), + integrity: LockIntegrity::GoH1(expected), + }; + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!(fetched.dir().join("go.mod").is_file()); + assert!(fetched.dir().join("a/b.go").is_file()); + + // Tampered h1 fails closed. 
+ let entry = LockfileEntry { + integrity: LockIntegrity::GoH1("h1:AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=".into()), + ..entry + }; + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected mismatch, got {other:?}"), + } + } + + #[cfg(feature = "golang")] + #[test] + fn go_escape_uppercase_and_zip_prefix_guards() { + assert_eq!(go_escape("github.com/Azure/azure-sdk"), "github.com/!azure/azure-sdk"); + assert_eq!(go_escape("v1.0.0-RC1"), "v1.0.0-!r!c1"); + + // An entry outside the module prefix fails the whole artifact. + let zip_bytes = make_module_zip("github.com/x/y@v1.0.0/", &[("go.mod", b"m\n")]); + let tmp = tempfile::tempdir().unwrap(); + let err = extract_zip_with_prefix(&zip_bytes, tmp.path(), "github.com/OTHER@v1/") + .unwrap_err(); + assert!(err.contains("outside"), "{err}"); + } + + /// Build a zip with the given `(path, bytes)` entries. + fn make_zip(files: &[(&str, &[u8])]) -> Vec { + use std::io::Write as _; + let mut writer = zip::ZipWriter::new(std::io::Cursor::new(Vec::new())); + for (name, bytes) in files { + writer + .start_file( + name.to_string(), + zip::write::SimpleFileOptions::default() + .compression_method(zip::CompressionMethod::Deflated), + ) + .unwrap(); + writer.write_all(bytes).unwrap(); + } + writer.finish().unwrap().into_inner() + } + + #[cfg(feature = "composer")] + #[tokio::test] + async fn composer_dist_fetch_verifies_sha1_and_strips_top_dir() { + // GitHub zipballs carry an `owner-repo-sha/` top dir. 
+ let zip_bytes = make_zip(&[ + ("Seldaek-monolog-abc123/composer.json", br#"{"name":"monolog/monolog"}"#), + ("Seldaek-monolog-abc123/src/Logger.php", b" assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected mismatch, got {other:?}"), + } + } + + #[tokio::test] + async fn gem_fetch_verifies_sha256_and_extracts_data_tar() { + // .gem = plain tar holding data.tar.gz (content at the ROOT — no + // prefix dir) + metadata.gz. + let data_tgz = make_tgz(&[ + ("lib/rails.rb", b"module Rails; end\n", false), + ("README.md", b"# rails\n", false), + ]); + let mut outer = tar::Builder::new(Vec::new()); + for (name, bytes) in [("metadata.gz", b"meta".as_slice()), ("data.tar.gz", &data_tgz)] { + let mut header = tar::Header::new_gnu(); + header.set_size(bytes.len() as u64); + header.set_mode(0o644); + header.set_cksum(); + outer.append_data(&mut header, name, bytes).unwrap(); + } + let gem_bytes = outer.into_inner().unwrap(); + let sha = hex::encode(Sha256::digest(&gem_bytes)); + + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/downloads/rails-7.1.0.gem")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(gem_bytes)) + .mount(&mock) + .await; + + let entry = LockfileEntry { + ecosystem: "gem", + name: "rails".into(), + version: "7.1.0".into(), + purl: "pkg:gem/rails@7.1.0".into(), + resolved: Some(format!("{}/downloads/rails-7.1.0.gem", mock.uri())), + integrity: LockIntegrity::Sha256Hex(sha), + }; + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!( + fetched.dir().join("lib/rails.rb").is_file(), + "data.tar.gz content extracts at the root (no strip)" + ); + assert!(fetched.dir().join("README.md").is_file()); + } + + #[tokio::test] + async fn pypi_wheel_fetch_extracts_site_packages_layout() { + let wheel = make_zip(&[ + ("requests/__init__.py", b"__version__ = '2.28.0'\n"), + ( + "requests-2.28.0.dist-info/RECORD", + b"requests/__init__.py,sha256=abc,24\n", + ), + 
("requests-2.28.0.dist-info/WHEEL", b"Wheel-Version: 1.0\n"), + ]); + let sha = hex::encode(Sha256::digest(&wheel)); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/packages/requests-2.28.0-py3-none-any.whl")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(wheel)) + .mount(&mock) + .await; + + let entry = LockfileEntry { + ecosystem: "pypi", + name: "requests".into(), + version: "2.28.0".into(), + purl: "pkg:pypi/requests@2.28.0".into(), + resolved: Some(format!( + "{}/packages/requests-2.28.0-py3-none-any.whl", + mock.uri() + )), + integrity: LockIntegrity::Sha256Hex(sha), + }; + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + // Wheel content at the root: a site-packages-shaped dir with the + // dist-info RECORD the pypi vendor backend stages from. + assert!(fetched.dir().join("requests/__init__.py").is_file()); + assert!(fetched + .dir() + .join("requests-2.28.0.dist-info/RECORD") + .is_file()); + + // No recorded wheel URL (poetry/requirements) → Unverifiable. + let entry = LockfileEntry { + resolved: None, + integrity: LockIntegrity::Sha256Hex("0".repeat(64)), + ..entry + }; + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Unverifiable(msg)) => assert!(msg.contains("wheel"), "{msg}"), + other => panic!("expected Unverifiable, got {other:?}"), + } + } + + #[test] + fn oversized_entry_header_fails_closed() { + // A header CLAIMING more than the per-entry cap fails before any + // attempt to read that much data. + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + let mut header = tar::Header::new_gnu(); + header.set_path("package/huge.bin").unwrap(); + header.set_size(MAX_ENTRY_BYTES + 1); + header.set_mode(0o644); + header.set_cksum(); + // Intentionally append no data: the size check fires first. 
+ let inner = { + use std::io::Write as _; + builder.get_mut().write_all(&header.as_bytes()[..]).unwrap(); + builder.into_inner().unwrap().finish().unwrap() + }; + let tmp = tempfile::tempdir().unwrap(); + let err = extract_tgz(&inner, tmp.path()).unwrap_err(); + assert!( + err.contains("cap") || err.contains("unreadable"), + "oversize header fails closed: {err}" + ); + } +} diff --git a/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs b/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs index f4c7ecc..2dd101b 100644 --- a/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs @@ -898,7 +898,7 @@ fn carried_sections(lines: &[String]) -> Vec { } /// Read a berry scalar field (`: `, value possibly quoted). -fn berry_field<'a>(lines: &'a [String], field: &str) -> Option<&'a str> { +pub(super) fn berry_field<'a>(lines: &'a [String], field: &str) -> Option<&'a str> { for line in lines.iter().skip(1) { let Some(rest) = body_field_line(line) else { continue; diff --git a/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs b/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs index fb25126..6278bb8 100644 --- a/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs @@ -555,7 +555,7 @@ fn rewrite_classic_block( /// Does this block's `resolved` already point into `.socket/vendor/npm/` /// (ours — current or stale uuid)? -fn block_points_into_vendor(lines: &[String]) -> bool { +pub(super) fn block_points_into_vendor(lines: &[String]) -> bool { classic_field(lines, "resolved") .and_then(parse_vendor_path) .is_some_and(|p| p.eco == "npm")