Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 100 additions & 6 deletions explorer.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -802,6 +802,14 @@ membership_url = `${R2_BASE}/isamples_202608_sample_facet_membership.parquet`
// so global-view cross-filtered tree counts are instant instead of a live
// membership near-full-scan. ~1k rows; same schema as facet_cross_filter.
tree_cross_filter_url = `${R2_BASE}/isamples_202608_facet_tree_cross_filter.parquet`
// #293 bitmask filter: per-pid tree-membership masks + the concept_uri→bit map.
// facetFilterSQL filters broad multi-tree selections with a columnar bitwise
// predicate over sample_facet_masks (~10 MB, one row/pid) instead of the
// 39M-row membership GROUP BY that stalls DuckDB-WASM. node_bits (~56 rows)
// maps each selected node to its bit. Best-effort: if either fails to load,
// facetFilterSQL falls back to the membership scan (no regression).
masks_url = `${R2_BASE}/isamples_202608_sample_facet_masks.parquet`
node_bits_url = `${R2_BASE}/isamples_202608_facet_node_bits.parquet`

// Canonical palette — see issue #113. Path-relative so this works under
// both isamples.org (custom domain at root) and project-pages fork
Expand Down Expand Up @@ -1114,6 +1122,31 @@ function syncFacetNote() {
// without a JOIN and avoids duplicate rows from multi-valued facets
// (a sample with two materials would appear twice via JOIN). Required
// for Phase 4's table mode and any non-JOIN caller. See issue #156.
// #293: build one "(<dim>_mask & <selected>) <> 0" predicate per active tree dim
// from the concept_uri→bit map (window.__nodeBits). The mask is a BigInt OR of the
// selected nodes' bits; sample_facet_masks already encodes the ancestor closure, so
// selecting a parent matches its whole subtree. Returns [] (→ caller uses the
// membership fallback) if the bit map isn't loaded, a dim has no bit map, a
// selected node has no known bit, a bit index is malformed or out of range, or a
// dim's mask is empty — never silently under-filter and never throw.
/**
 * @param {Array<{key: string, sel: string[]}>} activeTree - one entry per
 *   tree-active dim: the dim key and the selected concept URIs.
 * @returns {string[]} one SQL predicate per entry of `activeTree`, or [] when
 *   ANY entry cannot be expressed as a mask (all-or-nothing, so the caller can
 *   compare the result length against `activeTree.length`).
 */
function treeMaskClauses(activeTree) {
  // Mirrors MASK_MAX_BITS in scripts/build_frontend_derived.py: the mask columns
  // are signed BIGINT, so only bits 0..62 are usable. A larger index would emit
  // a literal that no longer fits the column type.
  const MAX_MASK_BITS = 63;
  const bits = (typeof window !== 'undefined') ? window.__nodeBits : null;
  if (!bits) return [];
  const out = [];
  for (const { key, sel } of activeTree) {
    const dimBits = bits[key];
    if (!dimBits) return [];
    let mask = 0n;
    for (const uri of sel) {
      const raw = dimBits.get(uri);
      if (raw === undefined) return []; // unknown node → fall back
      // Accept a bigint index too, then require a plain in-range integer.
      // BigInt(NaN) / BigInt(1.5) / BigInt(null) would throw, so a corrupt
      // bit map must take the fallback path instead of breaking the filter.
      const idx = (typeof raw === 'bigint') ? Number(raw) : raw;
      if (!Number.isInteger(idx) || idx < 0 || idx >= MAX_MASK_BITS) return [];
      mask |= (1n << BigInt(idx));
    }
    if (mask === 0n) return []; // empty → avoid matching nothing
    out.push(`(${key}_mask & ${mask}) <> 0`); // BigInt → decimal BIGINT literal
  }
  return out;
}

function facetFilterSQL() {
// Each entry is a standalone `pid IN (...)` predicate; multiple are AND-ed.
const parts = [];
Expand All @@ -1123,22 +1156,33 @@ function facetFilterSQL() {
// under every ancestor), so a selected parent matches its whole subtree (no
// client-side descendant expansion). A flat (non-tree) dim filters on facets_v3.
const treeClauses = []; // one "(facet_type='X' AND concept_uri IN(...))" per tree dim
const activeTree = []; // [{key, sel}] for tree-active dims (for the bitmask path)
for (const key of TREE_DIM_KEYS) {
const sel = treeSelection(key);
if (sel.length === 0) continue;
const list = sel.map(s => `'${escSql(s)}'`).join(',');
if (treeActive(key)) {
treeClauses.push(`(facet_type='${key}' AND concept_uri IN (${list}))`);
activeTree.push({ key, sel });
} else {
facetsConds.push(`${key} IN (${list})`);
}
}
// #293: collapse the tree-dim selections into ONE membership scan rather than
// one AND-ed subquery per dim (N full scans → 1 in DuckDB-WASM). AND across dims
// is `HAVING COUNT(DISTINCT facet_type) = <#dims>`; OR within a dim is the
// `concept_uri IN (...)`. Single-dim collapses to the same one scan as before.
if (treeClauses.length > 0) {
parts.push(`pid IN (SELECT pid FROM read_parquet('${membership_url}') WHERE ${treeClauses.join(' OR ')} GROUP BY pid HAVING COUNT(DISTINCT facet_type) = ${treeClauses.length})`);
// #293: prefer the bitmask predicate over sample_facet_masks — a single
// columnar scan with no GROUP BY — which is set-identical to the membership
// form but avoids the 39M-row scan that stalls DuckDB-WASM on broad multi-tree
// selections. Fall back to the collapsed membership scan when the bit map
// isn't available or a selected node has no bit (treeMaskClauses → []), so
// results are always correct even before the mask artifacts are published.
if (activeTree.length > 0) {
const maskClauses = treeMaskClauses(activeTree);
if (maskClauses.length === activeTree.length) {
parts.push(`pid IN (SELECT pid FROM read_parquet('${masks_url}') WHERE ${maskClauses.join(' AND ')})`);
} else {
// Fallback: collapse the tree-dim selections into ONE membership scan
// (OR within, HAVING COUNT(DISTINCT facet_type)=N across).
parts.push(`pid IN (SELECT pid FROM read_parquet('${membership_url}') WHERE ${treeClauses.join(' OR ')} GROUP BY pid HAVING COUNT(DISTINCT facet_type) = ${treeClauses.length})`);
}
}
if (facetsConds.length > 0) {
parts.push(`pid IN (SELECT DISTINCT pid FROM read_parquet('${facets_url}') WHERE ${facetsConds.join(' AND ')})`);
Expand Down Expand Up @@ -1634,6 +1678,56 @@ db = {
}
```

```{ojs}
//| echo: false
//| output: false

// #293: load the concept_uri→bit map once at startup so facetFilterSQL can turn
// a tree-node selection into a bitmask and filter sample_facet_masks with a cheap
// columnar predicate (instead of the 39M-row membership GROUP BY). Stashed on
// window for synchronous access from facetFilterSQL (same pattern as
// window.conceptLabelForUri). Best-effort: on ANY failure window.__nodeBits stays
// null and facetFilterSQL falls back to the membership scan — so this is safe to
// ship before sample_facet_masks / facet_node_bits are published.
// Cell value: true once window.__nodeBits holds a verified concept_uri→bit map,
// false on every path where the mask filter must not be used.
nodeBitsReady = {
// Start from "not ready": window.__nodeBits stays null until every check below
// has passed, so a failed or partial load never leaves a map published.
window.__nodeBits = null;
try {
const rows = await db.query(
`SELECT facet_type, concept_uri, bit_index, build_id FROM read_parquet('${node_bits_url}')`);
// One Map per tree dim: concept_uri → bit index (as a JS number).
const map = { material: new Map(), context: new Map(), object_type: new Map() };
const nbBuilds = new Set();
for (const r of rows) {
// Rows whose facet_type is not one of the three dims add no bit, but their
// build_id is still collected, so they still count toward the single-build check.
if (map[r.facet_type]) map[r.facet_type].set(r.concept_uri, Number(r.bit_index));
nbBuilds.add(r.build_id);
}
// Every dim needs at least one bit; an empty dim means the file is unusable.
const haveBits = ['material', 'context', 'object_type'].every(k => map[k].size > 0);
// node_bits must carry exactly ONE build_id (a mixed file is corrupt; don't
// let a last-row-wins value coincidentally match masks — Codex r2).
if (!haveBits || nbBuilds.size !== 1) return false;
const nbBuild = [...nbBuilds][0];
// Codex P1.1 + P1.2: PREFLIGHT the masks file AND require its build_id to
// match node_bits. This proves masks is present/readable (else the mask
// query path would fail with no retry) and that both artifacts are from the
// SAME generation (else a stale-cached masks file would map bits wrong).
// Only then advertise readiness; otherwise facetFilterSQL uses membership.
const mrows = await db.query(
`SELECT DISTINCT build_id FROM read_parquet('${masks_url}')`);
const maskBuilds = mrows.map(r => r.build_id);
// Strict comparison: the masks file must hold exactly one build_id and it must
// equal the node_bits one.
// NOTE(review): assumes both files yield build_id as the same JS type — confirm.
if (maskBuilds.length !== 1 || maskBuilds[0] !== nbBuild) {
console.warn('masks/node_bits build_id mismatch or missing; using membership fallback',
{ nbBuild, maskBuilds });
return false;
}
// All checks passed: publish the map for synchronous readers.
window.__nodeBits = map;
return true;
} catch (err) {
// Best-effort: any query or parsing failure degrades to the fallback instead
// of surfacing an error from this cell.
console.warn('node_bits/masks preflight failed; facetFilterSQL will use the membership fallback:', err);
window.__nodeBits = null;
return false;
}
}
```


```{ojs}
//| echo: false
Expand Down
85 changes: 82 additions & 3 deletions scripts/build_frontend_derived.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,11 @@
ARTIFACTS = ["sample_facets_v2", "samples_map_lite", "h3_summaries",
"facet_summaries", "facet_cross_filter", "wide_h3",
"sample_facet_membership", "facet_tree_summaries",
"facet_tree_cross_filter"]
"facet_tree_cross_filter", "facet_node_bits", "sample_facet_masks"]
# #293: max tree nodes per dim that fit in a signed BIGINT mask (bits 0..62).
# Live max is 22 (context); guard so a future vocab explosion fails loudly
# instead of silently overflowing a mask bit.
MASK_MAX_BITS = 63

# Shared SQL expression for sample_facets_v2.description (#277 part 2).
# Appends space-joined concept labels (IC labels across all 4 concept dims)
Expand Down Expand Up @@ -481,6 +485,72 @@ def build_facet_tree_cross_filter(con, out):
) TO '{out}' (FORMAT PARQUET, COMPRESSION ZSTD)""")


def membership_build_id(con):
# Compute the build_id shared by the facet_node_bits and sample_facet_masks artifacts.
#
# #293 (Codex P1, r2): a fingerprint of the FULL membership generation — not
# just the node set. Both node_bits (positional bit assignment) and masks are
# pure functions of `membership`, so hashing membership content captures every
# change that would alter either artifact (new/dropped pids, re-mapped concepts,
# AND node-set changes). Embedding this id in both lets the explorer refuse the
# mask path unless the two are from the SAME generation (guards a stale-cached
# masks file). Order-independent XOR of per-row hashes (membership grain is
# unique per (pid,facet_type,concept_uri) — validated — so no XOR cancellation).
#
# Args:
#   con: object exposing .sql(query).fetchone(); presumably the DuckDB
#        connection that already has a `membership` relation — verify at caller.
# Returns:
#   The XOR of the per-row hashes cast to VARCHAR; the COALESCE makes an empty
#   `membership` yield '0' rather than NULL.
return con.sql("""
SELECT CAST(COALESCE(bit_xor(hash(pid || chr(31) || facet_type || chr(31) || concept_uri)), 0)
AS VARCHAR)
FROM membership""").fetchone()[0]


def build_facet_node_bits(con, out, build_id):
# Write the facet_node_bits Parquet artifact: one row per distinct
# (facet_type, concept_uri) in `membership` with its bit_index and the build_id.
#
# #293: authoritative concept_uri -> bit_index assignment per tree dim. The
# explorer loads this to turn a node selection into a bitmask and filter
# sample_facet_masks with a cheap columnar bitwise predicate (replacing the
# 39M-row membership GROUP BY that hits the DuckDB-WASM data-scale wall on
# broad multi-tree selections). bit_index is 0-based, DETERMINISTIC (row
# number over distinct concept_uri per facet_type, ordered by URI). The mask
# builder below repeats the SAME window expression, so the two assignments
# agree only as long as both copies stay identical. We HARD-fail
# if any dim exceeds MASK_MAX_BITS (a signed-BIGINT mask can't hold it).
#
# Args:
#   con: connection exposing .sql() / .execute() with a `membership` relation.
#   out: output path, interpolated verbatim into COPY ... TO '<out>'.
#   build_id: value written to every row's build_id column; interpolated
#             verbatim as a SQL string literal (no escaping is applied here).
# Raises:
#   SystemExit: when any facet_type has more than MASK_MAX_BITS distinct nodes.
#
# Guard query: per facet_type, node count = highest bit_index + 1; only the
# offending dims are returned.
over = con.sql("""
SELECT facet_type, MAX(bit_index)+1 AS n FROM (
SELECT facet_type, concept_uri,
(ROW_NUMBER() OVER (PARTITION BY facet_type ORDER BY concept_uri) - 1) AS bit_index
FROM (SELECT DISTINCT facet_type, concept_uri FROM membership)
) GROUP BY facet_type HAVING MAX(bit_index)+1 > ?""", params=[MASK_MAX_BITS]).fetchall()
if over:
raise SystemExit(f"FATAL: tree dim(s) exceed {MASK_MAX_BITS} nodes — bitmask overflow: {over}")
# Output columns: facet_type, concept_uri, bit_index (INTEGER), build_id;
# written as ZSTD-compressed Parquet ordered by (facet_type, concept_uri).
con.execute(f"""COPY (
SELECT facet_type, concept_uri,
(ROW_NUMBER() OVER (PARTITION BY facet_type ORDER BY concept_uri) - 1)::INTEGER AS bit_index,
'{build_id}' AS build_id
FROM (SELECT DISTINCT facet_type, concept_uri FROM membership)
ORDER BY facet_type, concept_uri
) TO '{out}' (FORMAT PARQUET, COMPRESSION ZSTD)""")


def build_sample_facet_masks(con, out, build_id):
# Write the sample_facet_masks Parquet artifact: one row per pid in `membership`
# with a BIGINT bitmask per tree dim plus the build_id.
#
# #293: one row per located pid that has ANY tree membership; a BIGINT mask
# per tree dim where bit (1<<bit_index) is set iff the pid is a member of that
# node (membership already encodes the ancestor closure, so a parent node's
# bit is set for the whole subtree). The explorer filters with
# (material_mask & <selected>) <> 0 AND (context_mask & <selected>) <> 0 ...
# which is set-identical to the membership pid-subquery but a single columnar
# scan (no 39M-row scan, no GROUP BY pid). bit assignment == build_facet_node_bits.
#
# Args:
#   con: connection exposing .execute() with a `membership` relation.
#   out: output path, interpolated verbatim into COPY ... TO '<out>'.
#   build_id: value written to every row's build_id column; interpolated
#             verbatim as a SQL string literal (no escaping is applied here).
#
# `nb` recomputes the per-facet_type bit value with the same
# ROW_NUMBER() ... ORDER BY concept_uri expression as build_facet_node_bits.
# Each mask column ORs only the bits of its own facet_type; a pid with no node
# in a dim gets 0 for that dim (COALESCE), never NULL.
# Output columns: pid, material_mask, context_mask, object_type_mask, build_id;
# ZSTD-compressed Parquet ordered by pid.
con.execute(f"""COPY (
WITH nb AS (
SELECT facet_type, concept_uri,
(1::BIGINT << (ROW_NUMBER() OVER (PARTITION BY facet_type ORDER BY concept_uri) - 1)) AS bitval
FROM (SELECT DISTINCT facet_type, concept_uri FROM membership)
)
SELECT m.pid,
COALESCE(bit_or(CASE WHEN m.facet_type='material' THEN nb.bitval END), 0)::BIGINT AS material_mask,
COALESCE(bit_or(CASE WHEN m.facet_type='context' THEN nb.bitval END), 0)::BIGINT AS context_mask,
COALESCE(bit_or(CASE WHEN m.facet_type='object_type' THEN nb.bitval END), 0)::BIGINT AS object_type_mask,
'{build_id}' AS build_id
FROM membership m JOIN nb ON nb.facet_type=m.facet_type AND nb.concept_uri=m.concept_uri
GROUP BY m.pid
ORDER BY m.pid
) TO '{out}' (FORMAT PARQUET, COMPRESSION ZSTD)""")


def file_meta(con, path):
n = con.sql(f"SELECT COUNT(*) FROM read_parquet('{path}')").fetchone()[0]
schema = [(r[0], r[1]) for r in con.sql(f"DESCRIBE SELECT * FROM read_parquet('{path}')").fetchall()]
Expand Down Expand Up @@ -544,11 +614,13 @@ def emit(name, fn):
emit("wide_h3", lambda o: build_wide_h3(con, args.wide, o))

# Hierarchy artifacts (#281/#282) — need vocab_labels for the SKOS tree.
if want("sample_facet_membership") or want("facet_tree_summaries") or want("facet_tree_cross_filter"):
HIER_ARTIFACTS = {"sample_facet_membership", "facet_tree_summaries",
"facet_tree_cross_filter", "facet_node_bits", "sample_facet_masks"}
if any(want(a) for a in HIER_ARTIFACTS):
if not args.vocab_labels:
# Fail loud if the user EXPLICITLY asked for a hierarchy artifact
# (Codex) — silently skipping an explicit --only target is wrong.
explicit = only & {"sample_facet_membership", "facet_tree_summaries", "facet_tree_cross_filter"}
explicit = only & HIER_ARTIFACTS
if explicit:
sys.exit(f"FATAL: --only {sorted(explicit)} requires --vocab-labels <vocab_labels.parquet>")
log("SKIP hierarchy artifacts: pass --vocab-labels <vocab_labels.parquet>", t0)
Expand All @@ -558,6 +630,13 @@ def emit(name, fn):
emit("facet_tree_summaries", lambda o: build_facet_tree_summaries(con, o))
# #290/#293 cross-filter cube — needs membership (above) + samp_geo (source).
emit("facet_tree_cross_filter", lambda o: build_facet_tree_cross_filter(con, o))
# #293 bitmask filter artifacts — needs membership (above). node_bits
# and masks share one build_id, a fingerprint of the full membership
# content (not just the node set), so the explorer only uses the mask
# path when the two are from the same generation (Codex P1).
if want("facet_node_bits") or want("sample_facet_masks"):
_bid = membership_build_id(con)
emit("facet_node_bits", lambda o: build_facet_node_bits(con, o, _bid))
emit("sample_facet_masks", lambda o: build_sample_facet_masks(con, o, _bid))

if not args.no_manifest:
log("hashing inputs/outputs for manifest…", t0)
Expand Down
Loading
Loading