// #293: build one "(<dim>_mask & <mask>) <> 0" predicate per active tree dim from
// the concept_uri→bit map stashed on window.__nodeBits ({ dimKey: Map(uri → bit) }).
//
//   activeTree  array of { key, sel }: `key` is the tree dim (also the prefix of
//               its `<key>_mask` column), `sel` the selected concept URIs.
//   returns     one SQL predicate string per entry of activeTree, in order, or []
//               when the mask path cannot be used. The caller compares the length
//               with activeTree.length and otherwise runs the membership scan.
//
// The mask is a BigInt OR of the selected nodes' bits (OR within a dim); the
// caller AND-s the per-dim predicates. Selecting a parent matches its subtree
// because the masks are built from membership rows, which the caller documents
// as already carrying the ancestor closure.
//
// Returns [] — never throws, never silently under-filters — when:
//   * the bit map is not loaded, or has no usable Map for a dim;
//   * a selected node has no bit, or its bit is not an integer in 0..62
//     (BigInt(NaN) / BigInt(1.5) throw; a negative shift contributes 0n and would
//     drop the node from the filter; bit 63+ does not fit a signed BIGINT mask);
//   * a dim's selection is empty (mask 0 would match nothing).
function treeMaskClauses(activeTree) {
  // Highest usable bit of a signed 64-bit mask; the builder caps each dim at
  // MASK_MAX_BITS = 63 nodes, i.e. bits 0..62.
  const MAX_MASK_BIT = 62;

  const bits = (typeof window !== 'undefined') ? window.__nodeBits : null;
  if (!bits) return [];

  const clauses = [];
  for (const { key, sel } of activeTree) {
    const dimBits = bits[key];
    // Duck-type the Map so an inherited property (e.g. key "constructor")
    // falls back instead of throwing on `.get`.
    if (!dimBits || typeof dimBits.get !== 'function') return [];

    let mask = 0n;
    for (const uri of sel) {
      const idx = dimBits.get(uri);
      // Number.isInteger also rejects undefined (unknown node) and NaN.
      if (!Number.isInteger(idx) || idx < 0 || idx > MAX_MASK_BIT) return [];
      mask |= 1n << BigInt(idx);
    }
    if (mask === 0n) return [];

    // A BigInt interpolates as a plain decimal literal (no "n" suffix).
    clauses.push(`(${key}_mask & ${mask}) <> 0`);
  }
  return clauses;
}
- if (treeClauses.length > 0) { - parts.push(`pid IN (SELECT pid FROM read_parquet('${membership_url}') WHERE ${treeClauses.join(' OR ')} GROUP BY pid HAVING COUNT(DISTINCT facet_type) = ${treeClauses.length})`); + // #293: prefer the bitmask predicate over sample_facet_masks — a single + // columnar scan with no GROUP BY — which is set-identical to the membership + // form but avoids the 39M-row scan that stalls DuckDB-WASM on broad multi-tree + // selections. Fall back to the collapsed membership scan when the bit map + // isn't available or a selected node has no bit (treeMaskClauses → []), so + // results are always correct even before the mask artifacts are published. + if (activeTree.length > 0) { + const maskClauses = treeMaskClauses(activeTree); + if (maskClauses.length === activeTree.length) { + parts.push(`pid IN (SELECT pid FROM read_parquet('${masks_url}') WHERE ${maskClauses.join(' AND ')})`); + } else { + // Fallback: collapse the tree-dim selections into ONE membership scan + // (OR within, HAVING COUNT(DISTINCT facet_type)=N across). + parts.push(`pid IN (SELECT pid FROM read_parquet('${membership_url}') WHERE ${treeClauses.join(' OR ')} GROUP BY pid HAVING COUNT(DISTINCT facet_type) = ${treeClauses.length})`); + } } if (facetsConds.length > 0) { parts.push(`pid IN (SELECT DISTINCT pid FROM read_parquet('${facets_url}') WHERE ${facetsConds.join(' AND ')})`); @@ -1634,6 +1678,56 @@ db = { } ``` +```{ojs} +//| echo: false +//| output: false + +// #293: load the concept_uri→bit map once at startup so facetFilterSQL can turn +// a tree-node selection into a bitmask and filter sample_facet_masks with a cheap +// columnar predicate (instead of the 39M-row membership GROUP BY). Stashed on +// window for synchronous access from facetFilterSQL (same pattern as +// window.conceptLabelForUri). 
Best-effort: on ANY failure window.__nodeBits stays +// null and facetFilterSQL falls back to the membership scan — so this is safe to +// ship before sample_facet_masks / facet_node_bits are published. +nodeBitsReady = { + window.__nodeBits = null; + try { + const rows = await db.query( + `SELECT facet_type, concept_uri, bit_index, build_id FROM read_parquet('${node_bits_url}')`); + const map = { material: new Map(), context: new Map(), object_type: new Map() }; + const nbBuilds = new Set(); + for (const r of rows) { + if (map[r.facet_type]) map[r.facet_type].set(r.concept_uri, Number(r.bit_index)); + nbBuilds.add(r.build_id); + } + const haveBits = ['material', 'context', 'object_type'].every(k => map[k].size > 0); + // node_bits must carry exactly ONE build_id (a mixed file is corrupt; don't + // let a last-row-wins value coincidentally match masks — Codex r2). + if (!haveBits || nbBuilds.size !== 1) return false; + const nbBuild = [...nbBuilds][0]; + // Codex P1.1 + P1.2: PREFLIGHT the masks file AND require its build_id to + // match node_bits. This proves masks is present/readable (else the mask + // query path would fail with no retry) and that both artifacts are from the + // SAME generation (else a stale-cached masks file would map bits wrong). + // Only then advertise readiness; otherwise facetFilterSQL uses membership. 
+ const mrows = await db.query( + `SELECT DISTINCT build_id FROM read_parquet('${masks_url}')`); + const maskBuilds = mrows.map(r => r.build_id); + if (maskBuilds.length !== 1 || maskBuilds[0] !== nbBuild) { + console.warn('masks/node_bits build_id mismatch or missing; using membership fallback', + { nbBuild, maskBuilds }); + return false; + } + window.__nodeBits = map; + return true; + } catch (err) { + console.warn('node_bits/masks preflight failed; facetFilterSQL will use the membership fallback:', err); + window.__nodeBits = null; + return false; + } +} +``` + ```{ojs} //| echo: false diff --git a/scripts/build_frontend_derived.py b/scripts/build_frontend_derived.py index 2b63321..6101d72 100755 --- a/scripts/build_frontend_derived.py +++ b/scripts/build_frontend_derived.py @@ -64,7 +64,11 @@ ARTIFACTS = ["sample_facets_v2", "samples_map_lite", "h3_summaries", "facet_summaries", "facet_cross_filter", "wide_h3", "sample_facet_membership", "facet_tree_summaries", - "facet_tree_cross_filter"] + "facet_tree_cross_filter", "facet_node_bits", "sample_facet_masks"] +# #293: max tree nodes per dim that fit in a signed BIGINT mask (bits 0..62). +# Live max is 22 (context); guard so a future vocab explosion fails loudly +# instead of silently overflowing a mask bit. +MASK_MAX_BITS = 63 # Shared SQL expression for sample_facets_v2.description (#277 part 2). # Appends space-joined concept labels (IC labels across all 4 concept dims) @@ -481,6 +485,72 @@ def build_facet_tree_cross_filter(con, out): ) TO '{out}' (FORMAT PARQUET, COMPRESSION ZSTD)""") +def membership_build_id(con): + # #293 (Codex P1, r2): a fingerprint of the FULL membership generation — not + # just the node set. Both node_bits (positional bit assignment) and masks are + # pure functions of `membership`, so hashing membership content captures every + # change that would alter either artifact (new/dropped pids, re-mapped concepts, + # AND node-set changes). 
# #293: the ONE definition of the concept_uri -> bit_index assignment. bit_index is
# 0-based and deterministic: ROW_NUMBER over the distinct concept_uri values of each
# facet_type in `membership`, ordered by URI. Both builders below interpolate this
# text, so the published bit map and the masks cannot drift apart.
_NODE_BITS_SQL = """
        SELECT facet_type, concept_uri,
               (ROW_NUMBER() OVER (PARTITION BY facet_type ORDER BY concept_uri) - 1) AS bit_index
        FROM (SELECT DISTINCT facet_type, concept_uri FROM membership)"""


def _require_mask_capacity(con):
    # HARD-fail (SystemExit) if any tree dim has more than MASK_MAX_BITS nodes: a
    # signed-BIGINT mask only has bits 0..MASK_MAX_BITS-1 to give out.
    over = con.sql(f"""
        SELECT facet_type, MAX(bit_index)+1 AS n FROM ({_NODE_BITS_SQL})
        GROUP BY facet_type HAVING MAX(bit_index)+1 > ?""", params=[MASK_MAX_BITS]).fetchall()
    if over:
        raise SystemExit(f"FATAL: tree dim(s) exceed {MASK_MAX_BITS} nodes — bitmask overflow: {over}")


def build_facet_node_bits(con, out, build_id):
    # #293: authoritative concept_uri -> bit_index assignment per tree dim, written
    # to `out` as parquet with columns (facet_type, concept_uri, bit_index INTEGER,
    # build_id). The explorer loads this to turn a node selection into a bitmask
    # and filter sample_facet_masks with a bitwise predicate instead of the
    # membership GROUP BY.
    #   con       DuckDB connection in which `membership` is queryable
    #   out       output parquet path (interpolated into the COPY statement)
    #   build_id  generation fingerprint stamped on every row
    # Raises SystemExit if a dim exceeds MASK_MAX_BITS nodes.
    _require_mask_capacity(con)
    con.execute(f"""COPY (
        SELECT facet_type, concept_uri,
               bit_index::INTEGER AS bit_index,
               '{build_id}' AS build_id
        FROM ({_NODE_BITS_SQL})
        ORDER BY facet_type, concept_uri
    ) TO '{out}' (FORMAT PARQUET, COMPRESSION ZSTD)""")


def build_sample_facet_masks(con, out, build_id):
    # #293: one row per pid present in `membership`, with a BIGINT mask per tree
    # dim (material_mask, context_mask, object_type_mask) plus build_id. Bit
    # (1 << bit_index) of a dim's mask is set when the pid has a membership row
    # for that node; a dim with no rows for the pid gets mask 0. The explorer then
    # filters with
    #   WHERE (material_mask & <m>) <> 0 AND (context_mask & <c>) <> 0 ...
    # which is intended to select the same pids as the membership pid-subquery
    # without a GROUP BY pid. The bit assignment is _NODE_BITS_SQL, i.e. the same
    # text build_facet_node_bits writes out.
    #   con / out / build_id  as for build_facet_node_bits
    # Raises SystemExit if a dim exceeds MASK_MAX_BITS nodes. The check is repeated
    # here so this builder is guarded even when facet_node_bits is not built in
    # the same run. NOTE(review): presumably `--only sample_facet_masks` skips the
    # node_bits builder — confirm against emit().
    _require_mask_capacity(con)
    con.execute(f"""COPY (
        WITH nb AS (
            SELECT facet_type, concept_uri, (1::BIGINT << bit_index) AS bitval
            FROM ({_NODE_BITS_SQL})
        )
        SELECT m.pid,
               COALESCE(bit_or(CASE WHEN m.facet_type='material' THEN nb.bitval END), 0)::BIGINT AS material_mask,
               COALESCE(bit_or(CASE WHEN m.facet_type='context' THEN nb.bitval END), 0)::BIGINT AS context_mask,
               COALESCE(bit_or(CASE WHEN m.facet_type='object_type' THEN nb.bitval END), 0)::BIGINT AS object_type_mask,
               '{build_id}' AS build_id
        FROM membership m JOIN nb ON nb.facet_type=m.facet_type AND nb.concept_uri=m.concept_uri
        GROUP BY m.pid
        ORDER BY m.pid
    ) TO '{out}' (FORMAT PARQUET, COMPRESSION ZSTD)""")
- explicit = only & {"sample_facet_membership", "facet_tree_summaries", "facet_tree_cross_filter"} + explicit = only & HIER_ARTIFACTS if explicit: sys.exit(f"FATAL: --only {sorted(explicit)} requires --vocab-labels ") log("SKIP hierarchy artifacts: pass --vocab-labels ", t0) @@ -558,6 +630,13 @@ def emit(name, fn): emit("facet_tree_summaries", lambda o: build_facet_tree_summaries(con, o)) # #290/#293 cross-filter cube — needs membership (above) + samp_geo (source). emit("facet_tree_cross_filter", lambda o: build_facet_tree_cross_filter(con, o)) + # #293 bitmask filter artifacts — needs membership (above). node_bits + # and masks share a node-set build_id so the explorer only uses the mask + # path when the two are from the same generation (Codex P1). + if want("facet_node_bits") or want("sample_facet_masks"): + _bid = membership_build_id(con) + emit("facet_node_bits", lambda o: build_facet_node_bits(con, o, _bid)) + emit("sample_facet_masks", lambda o: build_sample_facet_masks(con, o, _bid)) if not args.no_manifest: log("hashing inputs/outputs for manifest…", t0) diff --git a/scripts/validate_frontend_derived.py b/scripts/validate_frontend_derived.py index b2d041a..c629206 100755 --- a/scripts/validate_frontend_derived.py +++ b/scripts/validate_frontend_derived.py @@ -53,6 +53,8 @@ def main(): ap.add_argument("--tree-summaries", help="facet_tree_summaries parquet (#281/#282); optional") ap.add_argument("--membership", help="sample_facet_membership parquet (#281/#282); optional") ap.add_argument("--tree-cross-filter", help="facet_tree_cross_filter parquet (#290/#293); optional") + ap.add_argument("--node-bits", help="facet_node_bits parquet (#293); optional") + ap.add_argument("--masks", help="sample_facet_masks parquet (#293); optional") ap.add_argument("--wide", help="source wide parquet — enables the SEMANTIC gate " "(re-derive and diff the written files against a fresh build)") ap.add_argument("--min-rows", type=int, default=1_000_000, @@ -295,6 +297,8 @@ def 
_opt(name, attr): tree = _opt("facet_tree_summaries", "tree_summaries") mem = _opt("sample_facet_membership", "membership") treexf = _opt("facet_tree_cross_filter", "tree_cross_filter") + nodebits = _opt("facet_node_bits", "node_bits") + masks = _opt("sample_facet_masks", "masks") if tree: T = f"read_parquet('{tree}')" # parent ≥ child for every edge, every dim (distinct-pid UNION semantics — @@ -396,6 +400,82 @@ def _opt(name, attr): check("tree_cross_filter baseline == tree_summaries", bmm == 0, f"{bmm} baseline tree rows disagree with facet_tree_summaries") + # --- #293 bitmask filter artifacts (facet_node_bits + sample_facet_masks) --- + # The explorer filters broad multi-tree selections with a columnar bitwise + # predicate over sample_facet_masks instead of a 39M-row membership GROUP BY. + # These checks prove the masks are SET-IDENTICAL to membership (so the filter + # results can't differ from the old path) and node_bits is a clean assignment. + if nodebits: + NB = f"read_parquet('{nodebits}')" + if mem: + M = f"read_parquet('{mem}')" + # node_bits covers EXACTLY the distinct (facet_type, concept_uri) in membership + cov = scalar(f"""SELECT + (SELECT COUNT(*) FROM (SELECT DISTINCT facet_type, concept_uri FROM {M} + EXCEPT SELECT facet_type, concept_uri FROM {NB})) + + (SELECT COUNT(*) FROM (SELECT facet_type, concept_uri FROM {NB} + EXCEPT SELECT DISTINCT facet_type, concept_uri FROM {M}))""") + check("node_bits covers exactly membership nodes", cov == 0, f"{cov} node(s) differ from membership") + # bit_index dense 0..N-1, unique, and within signed-BIGINT range per dim + bad = scalar(f"""SELECT COUNT(*) FROM ( + SELECT facet_type, COUNT(*) AS n, MIN(bit_index) AS lo, MAX(bit_index) AS hi, + COUNT(DISTINCT bit_index) AS d + FROM {NB} GROUP BY facet_type + HAVING lo<>0 OR hi<>n-1 OR d<>n OR hi>62)""") + check("node_bits: dense unique 0..N-1 per dim (<=62)", bad == 0, f"{bad} dim(s) with a bad bit range") + if masks: + X = f"read_parquet('{masks}')" + mdup = 
scalar(f"SELECT COUNT(*) FROM (SELECT pid FROM {X} GROUP BY pid HAVING COUNT(*)>1)") + check("masks: one row per pid", mdup == 0, f"{mdup} duplicate pids in masks") + # build_id must be a single value AND match node_bits (Codex P1): the + # explorer enables the mask path only when these agree, so a mismatch here + # is a build error that would silently disable the fast path in prod. + mbids = scalar(f"SELECT COUNT(DISTINCT build_id) FROM {X}") + check("masks: single build_id", mbids == 1, f"{mbids} distinct build_ids in masks (want 1)") + if nodebits: + nb_bid = scalar(f"SELECT COUNT(DISTINCT build_id) FROM read_parquet('{nodebits}')") + check("node_bits: single build_id", nb_bid == 1, f"{nb_bid} distinct build_ids in node_bits") + # Only compare when both are single-valued, else the scalar subqueries + # below return multiple rows and throw (Codex r2). Use MIN/MAX agg so a + # multi-valued file degrades to a clean FAIL, not an exception. + if nb_bid == 1 and mbids == 1: + same = scalar(f"SELECT (SELECT MIN(build_id) FROM read_parquet('{nodebits}')) " + f"= (SELECT MIN(build_id) FROM {X})") + check("node_bits/masks build_id match", bool(same), + "build_id differs between node_bits and masks (mixed generations)") + else: + check("node_bits/masks build_id match", False, + "cannot compare — an artifact has multiple build_ids") + if mem and nodebits: + M = f"read_parquet('{mem}')" + NB = f"read_parquet('{nodebits}')" + # SEMANTIC gate: re-derive masks from the WRITTEN membership + node_bits + # (independent of the builder's internal ROW_NUMBER) and diff symmetric. 
+ ref = (f"WITH nb AS (SELECT facet_type, concept_uri, (1::BIGINT << bit_index) AS bitval FROM {NB}) " + f"SELECT m.pid, " + f"COALESCE(bit_or(CASE WHEN m.facet_type='material' THEN nb.bitval END),0)::BIGINT material_mask, " + f"COALESCE(bit_or(CASE WHEN m.facet_type='context' THEN nb.bitval END),0)::BIGINT context_mask, " + f"COALESCE(bit_or(CASE WHEN m.facet_type='object_type' THEN nb.bitval END),0)::BIGINT object_type_mask " + f"FROM {M} m JOIN nb ON nb.facet_type=m.facet_type AND nb.concept_uri=m.concept_uri GROUP BY m.pid") + fil = f"SELECT pid, material_mask, context_mask, object_type_mask FROM {X}" + mm = scalar(f"SELECT (SELECT COUNT(*) FROM (({ref}) EXCEPT ({fil}))) + (SELECT COUNT(*) FROM (({fil}) EXCEPT ({ref})))") + check("masks == re-derived from membership+node_bits", mm == 0, f"{mm} mask rows disagree") + # CROSS-CHECK the actual filter semantics on a real node per dim: the + # bitwise predicate must return the SAME pid set as the membership + # subquery. Proves the masks are usable as a drop-in, not just equal blobs. 
def _masks_fixture(tmp_path):
    """Build the tree fixture artifacts (tag "t") under tmp_path.

    Returns the path of the generated sample_facet_masks parquet as a str.
    """
    wide = str(tmp_path / "wide.parquet")
    vocab = str(tmp_path / "vocab.parquet")
    build_tree_fixture(wide, vocab)
    assert _build_tree(tmp_path, wide, vocab).returncode == 0
    return str(tmp_path / "t_sample_facet_masks.parquet")


def _overwrite_masks(masks, select_list):
    """Replace the masks parquet with `SELECT <select_list>` over its own rows.

    Writes to a sibling .tmp file and swaps it in with os.replace; the DuckDB
    connection is closed even if the COPY fails.
    """
    tmp_m = masks + ".tmp"
    con = duckdb.connect()
    try:
        con.execute(f"""COPY (SELECT {select_list} FROM read_parquet('{masks}'))
                        TO '{tmp_m}' (FORMAT PARQUET)""")
    finally:
        con.close()
    os.replace(tmp_m, masks)


def _run_masks_validator(tmp_path):
    """Run the validator script on tmp_path (tag "t"); return the CompletedProcess."""
    return subprocess.run([sys.executable, VALIDATE, "--dir", str(tmp_path), "--tag", "t", "--min-rows", "1"],
                          capture_output=True, text=True)


def test_facet_masks_filter_equals_membership(tmp_path):
    """#293: the bitmask filter must be SET-IDENTICAL to the membership subquery
    for every tree node, and node_bits must be a dense unique assignment."""
    _masks_fixture(tmp_path)
    mem = f"read_parquet('{tmp_path / 't_sample_facet_membership.parquet'}')"
    nb = f"read_parquet('{tmp_path / 't_facet_node_bits.parquet'}')"
    masks = f"read_parquet('{tmp_path / 't_sample_facet_masks.parquet'}')"
    con = duckdb.connect()
    try:
        # node_bits dense 0..N-1 per dim, unique
        bad = con.sql(f"""SELECT COUNT(*) FROM (
            SELECT facet_type, COUNT(*) n, MIN(bit_index) lo, MAX(bit_index) hi, COUNT(DISTINCT bit_index) d
            FROM {nb} GROUP BY facet_type HAVING lo<>0 OR hi<>n-1 OR d<>n)""").fetchone()[0]
        assert bad == 0
        # for every tree node, bitmask predicate == membership pid set
        for dim, node in con.sql(f"SELECT DISTINCT facet_type, concept_uri FROM {mem}").fetchall():
            bitval = con.sql(f"SELECT (1::BIGINT << bit_index) FROM {nb} "
                             f"WHERE facet_type='{dim}' AND concept_uri='{node}'").fetchone()[0]
            col = f"{dim}_mask"
            d = con.sql(f"""WITH a AS (SELECT pid FROM {mem} WHERE facet_type='{dim}' AND concept_uri='{node}'),
                b AS (SELECT pid FROM {masks} WHERE ({col} & {bitval})<>0)
                SELECT (SELECT COUNT(*) FROM (SELECT * FROM a EXCEPT SELECT * FROM b))
                     + (SELECT COUNT(*) FROM (SELECT * FROM b EXCEPT SELECT * FROM a))""").fetchone()[0]
            assert d == 0, f"{dim} node {node}: bitmask filter != membership ({d} pids differ)"
    finally:
        con.close()


def test_facet_masks_validator_gate_bites(tmp_path):
    """Corrupting a mask must fail the validator's re-derivation gate."""
    masks = _masks_fixture(tmp_path)
    # Shift every material mask by one so it no longer matches membership.
    _overwrite_masks(masks, "pid, material_mask + 1 AS material_mask, context_mask, object_type_mask, build_id")
    v = _run_masks_validator(tmp_path)
    assert v.returncode != 0 and "masks ==" in v.stdout, f"masks gate failed to catch corruption:\n{v.stdout}"


def test_facet_masks_build_id_mismatch_caught(tmp_path):
    """Codex P1.1: a masks file from a different generation than node_bits (different
    build_id) must fail validation — that mismatch silently disables the fast path."""
    masks = _masks_fixture(tmp_path)
    # Keep the masks intact but stamp them with a build_id node_bits does not have.
    _overwrite_masks(masks, "pid, material_mask, context_mask, object_type_mask, 'STALE_GENERATION' AS build_id")
    v = _run_masks_validator(tmp_path)
    assert v.returncode != 0 and "build_id match" in v.stdout, \
        f"build_id mismatch gate failed to catch a stale-generation masks file:\n{v.stdout}"


def test_facet_masks_only_builds(tmp_path):
    """--only sample_facet_masks (and facet_node_bits) must produce the files."""
    wide = str(tmp_path / "wide.parquet")
    vocab = str(tmp_path / "vocab.parquet")
    build_tree_fixture(wide, vocab)
    r = subprocess.run([sys.executable, BUILD, "--wide", wide, "--outdir", str(tmp_path), "--tag", "t",
                        "--no-manifest", "--vocab-labels", vocab,
                        "--only", "facet_node_bits,sample_facet_masks"], capture_output=True, text=True)
    assert r.returncode == 0, f"{r.stdout}\n{r.stderr}"
    assert (tmp_path / "t_facet_node_bits.parquet").exists()
    assert (tmp_path / "t_sample_facet_masks.parquet").exists()