From df354b33d08d00772d181c228e8b64fe937cc4cc Mon Sep 17 00:00:00 2001 From: Harsh16gupta Date: Sat, 4 Jul 2026 16:50:12 +0530 Subject: [PATCH 1/2] feat: auto-name clusters using TF-IDF taxonomy with inline rename UI --- package.json | 2 +- src/commands/testEmbed.ts | 52 ++- src/pipeline/UmapProjector.ts | 51 ++- src/pipeline/clustering/benchmark.ts | 20 +- src/pipeline/clustering/postProcess.ts | 499 ++++++++++++++++++++++-- src/pipeline/nativeEmbeddingPipeline.ts | 71 ++++ src/pipeline/pipelineConfig.ts | 23 ++ src/pipeline/runPipeline.ts | 115 ++++-- src/types/cluster.ts | 2 + src/webview/components/ClusterCard.tsx | 70 +++- src/webview/context/AppStateContext.tsx | 16 + src/webview/pages/DashboardPage.tsx | 6 +- src/webview/panel.css | 43 ++ src/worker/embedWorker.ts | 59 ++- 14 files changed, 903 insertions(+), 126 deletions(-) create mode 100644 src/pipeline/nativeEmbeddingPipeline.ts create mode 100644 src/pipeline/pipelineConfig.ts diff --git a/package.json b/package.json index 72c705f..1805a32 100644 --- a/package.json +++ b/package.json @@ -1,5 +1,5 @@ { - "name": "joplin-note-categorization-plugin", + "name": "joplin-plugin-note-categorization", "version": "1.0.0", "scripts": { "dist": "webpack --env joplin-plugin-config=buildMain && webpack --env joplin-plugin-config=buildExtraScripts && npm run copyAssets && webpack --env joplin-plugin-config=createArchive", diff --git a/src/commands/testEmbed.ts b/src/commands/testEmbed.ts index 28b1fba..7e9265f 100644 --- a/src/commands/testEmbed.ts +++ b/src/commands/testEmbed.ts @@ -1,6 +1,5 @@ import { fetchAllNotes } from '../pipeline/noteReader'; import { benchmark } from '../pipeline/clustering/benchmark'; -import { CategorizationConfig } from '../types/cluster'; import { averageVectors, blendVectors, computeTitleWeight, cosineSimilarity } from '../pipeline/vectorAggregator'; import { NoteVector, WorkerMessage } from '../types/embed'; import { isGenericTitle } from '../utils/titleFilter'; @@ -8,6 +7,7 @@ import { log, logErr } from '../utils/logger'; import { getEncoding } from 'js-tiktoken'; import { VectorCache } from '../pipeline/vectorCache'; import { enrichResultsWithTags } from '../pipeline/clustering/postProcess'; +import { DEFAULT_CONFIG, isValidEmbeddingVector } from '../pipeline/pipelineConfig'; // We use cl100k_base to approximate token counts for chunking. // The embedding model (all-MiniLM-L6-v2) uses a WordPiece tokenizer with a @@ -117,16 +117,22 @@ export const runTestEmbed = async (installDir: string) => { const cachedItem = await cache.getItem(note.id); if (cachedItem && cachedItem.metadata.hash === currentNoteHash) { - log(`[${currentNoteIndex + 1}/${notes.length}] cache hit for "${note.title.slice(0, 30)}"`); - noteVectors.push({ - noteId: note.id, - title: note.title, - vector: cachedItem.vector, - titleWeight: cachedItem.metadata.titleWeight ?? 0, - }); - cachedCount++; - currentNoteIndex++; - continue; + if (isValidEmbeddingVector(cachedItem.vector)) { + log(`[${currentNoteIndex + 1}/${notes.length}] cache hit for "${note.title.slice(0, 30)}"`); + noteVectors.push({ + noteId: note.id, + title: note.title, + vector: cachedItem.vector, + titleWeight: cachedItem.metadata.titleWeight ?? 0, + }); + cachedCount++; + currentNoteIndex++; + continue; + } else { + log( + `[${currentNoteIndex + 1}/${notes.length}] cache invalid (contains null/NaN) for "${note.title.slice(0, 30)}"`, + ); + } } break; @@ -152,23 +158,9 @@ export const runTestEmbed = async (installDir: string) => { // ── Clustering Benchmark ───────────────────────────── // Edit this config to compare different algorithms and dimensions. // Results are printed as a comparison table in the console. - const clusterConfig: CategorizationConfig = { - seed: 42, - metric: 'cosine', - intermediateDim: 10, - intermediateNeighbors: 15, - strategies: [ - { name: 'kmeans-5', algorithm: 'kmeans', K: 5 }, - { name: 'kmedoids-5', algorithm: 'kmedoids', K: 5 }, - { name: 'hdbscan-3', algorithm: 'hdbscan', minClusterSize: 3 }, - { name: 'hdbscan-3-ms2', algorithm: 'hdbscan', minClusterSize: 3, minSamples: 2 }, - { name: 'hdbscan-5-ms2', algorithm: 'hdbscan', minClusterSize: 5, minSamples: 2 }, - ], - }; - if (noteVectors.length >= 3) { const vectors = noteVectors.map((nv) => nv.vector); - const results = benchmark(vectors, clusterConfig); + const results = benchmark(vectors, DEFAULT_CONFIG); const notesMap = new Map(notes.map((n) => [n.id, n])); const allPipelineDocuments = noteVectors.map((nv) => { @@ -191,7 +183,13 @@ export const runTestEmbed = async (installDir: string) => { clusterNotes.get(c)!.push(noteVectors[i].title); } for (const [clusterId, titles] of clusterNotes) { - const label = clusterId < 0 ? 'Noise/Outliers' : `Cluster ${clusterId}`; + const generatedName = res.clusterNames?.[clusterId]; + const label = + clusterId < 0 + ? 'Noise/Outliers' + : generatedName + ? `${generatedName} (Cluster ${clusterId})` + : `Cluster ${clusterId}`; const clusterTags = res.tags?.[clusterId] ? ` [Tags: ${res.tags[clusterId].join(', ')}]` : ''; log(` ${label} (${titles.length} notes)${clusterTags}:`); for (const title of titles) { diff --git a/src/pipeline/UmapProjector.ts b/src/pipeline/UmapProjector.ts index 72a4e18..5081db5 100644 --- a/src/pipeline/UmapProjector.ts +++ b/src/pipeline/UmapProjector.ts @@ -20,17 +20,40 @@ export class UmapProjector { } /** - * Projects high-dimensional vectors to a lower-dimensional space using UMAP. - * @param vectors N vectors of dimension D (N x D) - * @returns N vectors of dimension nComponents + * Projects vectors to a lower-dimensional space using UMAP. + * + * In distance-matrix mode, `vectors` must be index singletons `[[0], [1], ...]` + * because umap-js requires a vectors array to call distanceFn(a, b). + * We encode each point's index as its sole coordinate so the custom distanceFn + * can look up precomputed distances via `distanceMatrix[a[0]][b[0]]`. */ - public project(vectors: number[][]): number[][] { + public project(vectors: number[][], distanceMatrix?: number[][]): number[][] { if (vectors.length === 0) return []; - const dim = vectors[0].length; - for (let i = 0; i < vectors.length; i++) { - if (vectors[i].length !== dim) { - throw new Error(`Vector at index ${i} has dimension ${vectors[i].length}, expected ${dim}`); + if (distanceMatrix) { + const n = vectors.length; + if (distanceMatrix.length !== n) { + throw new Error(`Distance matrix size (${distanceMatrix.length}) does not match vectors count (${n})`); + } + for (let i = 0; i < n; i++) { + if (vectors[i].length !== 1) { + throw new Error( + `Vector at index ${i} has dimension ${vectors[i].length}, expected 1 (index singleton)`, + ); + } + const idx = vectors[i][0]; + if (idx < 0 || idx >= n || !Number.isInteger(idx)) { + throw new Error( + `Vector index at position ${i} is invalid: ${idx}. Must be an integer between 0 and ${n - 1}.`, + ); + } + } + } else { + const dim = vectors[0].length; + for (let i = 0; i < vectors.length; i++) { + if (vectors[i].length !== dim) { + throw new Error(`Vector at index ${i} has dimension ${vectors[i].length}, expected ${dim}`); + } } } @@ -46,7 +69,14 @@ export class UmapProjector { // nNeighbors must be less than the number of data points const nNeighbors = Math.max(2, Math.min(this.nNeighbors, vectors.length - 1)); - const distanceFn = this.metric === 'euclidean' ? euclideanDistance : cosineDistance; + + // When using a precomputed distance matrix, vectors are index singletons [i]. + // The distanceFn extracts indices to look up the precomputed distance. + const distanceFn = distanceMatrix + ? (a: number[], b: number[]) => distanceMatrix[a[0]][b[0]] + : this.metric === 'euclidean' + ? euclideanDistance + : cosineDistance; const umap = new UMAP({ nComponents: this.nComponents, @@ -57,7 +87,8 @@ export class UmapProjector { }); log( - `UMAP: projecting ${vectors.length} vectors (${dim}D → ${this.nComponents}D), ` + + `UMAP: projecting ${vectors.length} vectors ` + + `${distanceMatrix ? '(using precomputed distance matrix)' : `(${vectors[0].length}D)`} → ${this.nComponents}D, ` + `neighbors=${nNeighbors}, seed=${this.seed}`, ); diff --git a/src/pipeline/clustering/benchmark.ts b/src/pipeline/clustering/benchmark.ts index 89aeabf..5d87b0e 100644 --- a/src/pipeline/clustering/benchmark.ts +++ b/src/pipeline/clustering/benchmark.ts @@ -82,7 +82,11 @@ function logBenchmarkTable(results: BenchmarkResult[]): void { * @param config Categorization config with strategies to benchmark * @returns Benchmark results sorted by silhouette score (descending) */ -export function benchmark(vectors: number[][], config: CategorizationConfig): BenchmarkResult[] { +export function benchmark( + vectors: number[][], + config: CategorizationConfig, + distanceMatrix?: number[][], +): BenchmarkResult[] { if (vectors.length === 0) { log('No vectors to cluster.'); return []; @@ -92,7 +96,19 @@ export function benchmark(vectors: number[][], config: CategorizationConfig): Be // Optionally reduce dimensionality before clustering let clusteringVectors = vectors; - if (config.intermediateDim !== null) { + if (distanceMatrix) { + // Clustering algos need coordinate vectors, not just pairwise distances. + // UMAP projects the distance matrix into coordinate space (default 10D). + const dim = config.intermediateDim ?? 10; + log(`Native mode: projecting distance matrix to ${dim}D coordinates for clustering...`); + const projector = new UmapProjector({ + nComponents: dim, + nNeighbors: config.intermediateNeighbors, + metric: config.metric, + seed: config.seed, + }); + clusteringVectors = projector.project(vectors, distanceMatrix); + } else if (config.intermediateDim !== null) { log(`Reducing ${vectors[0].length}D → ${config.intermediateDim}D for clustering...`); const projector = new UmapProjector({ nComponents: config.intermediateDim, diff --git a/src/pipeline/clustering/postProcess.ts b/src/pipeline/clustering/postProcess.ts index 7c88453..e33060d 100644 --- a/src/pipeline/clustering/postProcess.ts +++ b/src/pipeline/clustering/postProcess.ts @@ -145,6 +145,87 @@ export const STOP_WORDS = new Set([ 'yourself', 'yourselves', + // More prepositions, adverbs, and common noise verbs (to clean up phrases) + 'without', + 'within', + 'throughout', + 'around', + 'going', + 'goes', + 'went', + 'getting', + 'got', + 'having', + 'making', + 'taking', + 'actually', + 'really', + 'basically', + 'simply', + 'mainly', + 'mostly', + 'highly', + 'fully', + 'totally', + 'completely', + 'extremely', + 'very', + 'quite', + 'pretty', + 'somewhat', + 'rather', + 'indeed', + 'always', + 'never', + 'sometimes', + 'often', + 'usually', + 'probably', + 'possibly', + 'maybe', + 'crazy', + 'easy', + 'hard', + 'difficult', + 'simple', + 'good', + 'bad', + 'best', + 'worst', + 'better', + 'worse', + 'new', + 'old', + 'first', + 'last', + 'next', + 'prev', + 'previous', + 'current', + 'different', + 'same', + 'other', + 'another', + 'each', + 'every', + 'many', + 'much', + 'few', + 'several', + 'some', + 'any', + 'no', + 'work', + 'thing', + 'things', + 'stuff', + 'name', + 'value', + 'data', + 'user', + 'item', + 'items', + // Markdown/HTML structure words or general noise words (all length >= 3) 'http', 'https', @@ -246,6 +327,9 @@ export const STOP_WORDS = new Set([ /** Words that look like plurals but should not be singularized. */ const SINGULAR_EXCEPTIONS = new Set(['series', 'species', 'means', 'news', 'analysis', 'basis', 'crisis']); +/** Unigrams with character length at or below this threshold receive a 0.5x scoring penalty. */ +const SHORT_UNIGRAM_THRESHOLD = 4; + /** * Strips code blocks, inline code, HTML tags, markdown links/images, and URLs * from text to avoid polluting tag extraction. @@ -361,30 +445,48 @@ export class TfidfExtractor { } } + /** + * Splits the text by sentence/line boundaries and generates ngrams within segments. + * This prevents forming cross-boundary ngrams (like joining separate lines or sentences). + */ + private getSegmentNgrams(text: string): string[] { + if (!text) return []; + // Split by sentence punctuation, newlines, markdown headers, and list bullets + const segments = text.split(/[.,?!;:\n\r\-*#()[\]]+/); + const allNgrams: string[] = []; + for (const seg of segments) { + const tokens = tokenize(seg); + const ngrams = getNgrams(tokens); + for (const ng of ngrams) { + // Filter out any ngrams with consecutive duplicate words (e.g. "day day") + if (!hasConsecutiveDuplicates(ng)) { + allNgrams.push(ng); + } + } + } + return allNgrams; + } + /** * Returns the unique set of words/ngrams in a document (title + body), used for IDF counting. * No title weighting — each document contributes at most 1 to each ngram's document frequency. */ private getUniqueDocumentWords(doc: DocumentText): Set { - const titleWords = tokenize(doc.title || ''); - const bodyWords = tokenize(doc.body || ''); - const titleNgrams = getNgrams(titleWords); - const bodyNgrams = getNgrams(bodyWords); + const titleNgrams = this.getSegmentNgrams(doc.title || ''); + const bodyNgrams = this.getSegmentNgrams(doc.body || ''); return new Set([...titleNgrams, ...bodyNgrams]); } /** - * Returns ngrams for TF scoring with title words weighted 3x higher. + * Returns ngrams for TF scoring with title words weighted 5x higher. * Uses push loops instead of spread to avoid excess intermediate array allocations. */ private getWeightedWords(doc: DocumentText): string[] { - const titleWords = tokenize(doc.title || ''); - const bodyWords = tokenize(doc.body || ''); - const titleNgrams = getNgrams(titleWords); - const bodyNgrams = getNgrams(bodyWords); + const titleNgrams = this.getSegmentNgrams(doc.title || ''); + const bodyNgrams = this.getSegmentNgrams(doc.body || ''); const result: string[] = []; - // Title ngrams appear 3 times to boost their term frequency - for (let i = 0; i < 3; i++) { + // Title ngrams appear 5 times to boost their term frequency + for (let i = 0; i < 5; i++) { for (const ng of titleNgrams) { result.push(ng); } @@ -396,61 +498,341 @@ export class TfidfExtractor { } /** - * Computes TF-IDF scores for ngrams in the cluster documents and returns the top K. + * Computes sorted TF-IDF scores for ngrams in the cluster documents. + * Incorporates Cluster Frequency (CF) weighting, Length Boosting, and Title Match Boosting. */ - public extractClusterTags(clusterDocuments: DocumentText[], topK = 5): string[] { + public extractClusterNgramsWithScores(clusterDocuments: DocumentText[]): { ngram: string; score: number }[] { if (clusterDocuments.length === 0) return []; - const tfs: { [word: string]: number } = {}; - let totalWords = 0; + const tfs: { [ngram: string]: number } = {}; + let totalNgrams = 0; for (const doc of clusterDocuments) { const weighted = this.getWeightedWords(doc); - for (const w of weighted) { - tfs[w] = (tfs[w] || 0) + 1; - totalWords++; + for (const ng of weighted) { + tfs[ng] = (tfs[ng] || 0) + 1; + totalNgrams++; } } - if (totalWords === 0) return []; + if (totalNgrams === 0) return []; + + // Count how many documents in the cluster contain each ngram + const docCounts: { [ngram: string]: number } = {}; + for (const doc of clusterDocuments) { + const titleNgrams = this.getSegmentNgrams(doc.title || ''); + const bodyNgrams = this.getSegmentNgrams(doc.body || ''); + const docNgrams = new Set([...titleNgrams, ...bodyNgrams]); + for (const ng of docNgrams) { + docCounts[ng] = (docCounts[ng] || 0) + 1; + } + } - const scores: { word: string; score: number }[] = []; + const scores: { ngram: string; score: number }[] = []; - for (const word of Object.keys(tfs)) { - const tf = tfs[word] / totalWords; - const idf = this.idfs[word] || 0; // default to 0 if word is ignored/generic + for (const ngram of Object.keys(tfs)) { + const idf = this.idfs[ngram] || 0; // default to 0 if word is ignored/generic if (idf > 0) { - scores.push({ word, score: tf * idf }); + const tf = tfs[ngram] / totalNgrams; + const cf = (docCounts[ngram] || 0) / clusterDocuments.length; + + // Length boost: 1.0x for unigram, 1.5x for bigram, 2.0x for trigram + const wordCount = ngram.split(' ').length; + let lengthBoost = 1.0 + (wordCount - 1) * 0.5; + + // Penalize very short unigrams (length <= 4) to favor longer descriptive phrases + if (wordCount === 1 && ngram.length <= SHORT_UNIGRAM_THRESHOLD) { + lengthBoost *= 0.5; + } + + // Title match boost: 1.5x if it appears in any note title in this cluster + let appearsInTitle = false; + for (const doc of clusterDocuments) { + const titleNgrams = new Set(this.getSegmentNgrams(doc.title || '')); + if (titleNgrams.has(ngram)) { + appearsInTitle = true; + break; + } + } + const titleBoost = appearsInTitle ? 1.5 : 1.0; + + const finalScore = tf * idf * cf * lengthBoost * titleBoost; + scores.push({ ngram, score: finalScore }); } } scores.sort((a, b) => b.score - a.score); + return scores; + } - const selectedTags: string[] = []; - const usedWords = new Set(); + /** + * Computes TF-IDF scores for ngrams in the cluster documents and returns the top K. + */ + public extractClusterTags(clusterDocuments: DocumentText[], topK = 5): string[] { + const scores = this.extractClusterNgramsWithScores(clusterDocuments); + return selectDedupedTags(scores, topK); + } +} - for (const candidate of scores) { - if (selectedTags.length >= topK) break; +/** + * Checks if a phrase contains consecutive identical words. + */ +export function hasConsecutiveDuplicates(phrase: string): boolean { + const words = phrase.toLowerCase().split(' '); + for (let i = 0; i < words.length - 1; i++) { + if (words[i] === words[i + 1]) return true; + } + return false; +} - const constituentWords = candidate.word.split(' '); - const allUsed = constituentWords.every((w) => usedWords.has(w)); - if (!allUsed) { - selectedTags.push(candidate.word); - for (const w of constituentWords) { - usedWords.add(w); +/** + * Filters out unigrams (single-word candidates) that are part of a stronger + * multi-word candidate (bigram/trigram) with a score >= 50% of the unigram's score. + */ +export function filterDemotedUnigrams(scores: { ngram: string; score: number }[]): { ngram: string; score: number }[] { + return scores.filter((candidate) => { + const wordCount = candidate.ngram.split(' ').length; + if (wordCount === 1) { + const hasStrongerPhrase = scores.some((other) => { + const otherWordCount = other.ngram.split(' ').length; + if (otherWordCount > 1) { + const constituentWords = new Set(other.ngram.toLowerCase().split(' ')); + if (constituentWords.has(candidate.ngram.toLowerCase()) && other.score >= candidate.score * 0.5) { + return true; + } } + return false; + }); + if (hasStrongerPhrase) { + return false; } } + return true; + }); +} - return selectedTags; +/** + * Selects up to `topK` tags from pre-computed ngram scores using deduplication rules: + * - Unigrams must be unique (no shared words with already-selected tags) + * - Bigrams/trigrams can share at most 1 word with already-selected tags + */ +export function selectDedupedTags(scores: { ngram: string; score: number }[], topK: number): string[] { + const filteredScores = filterDemotedUnigrams(scores); + const selectedTags: string[] = []; + const usedWords = new Set(); + + for (const candidate of filteredScores) { + if (selectedTags.length >= topK) break; + + const constituentWords = candidate.ngram.split(' '); + const limit = constituentWords.length === 1 ? 0 : 1; + let sharedCount = 0; + for (const w of constituentWords) { + if (usedWords.has(w)) { + sharedCount++; + } + } + + if (sharedCount <= limit) { + selectedTags.push(candidate.ngram); + for (const w of constituentWords) { + usedWords.add(w); + } + } + } + + return selectedTags; +} + +const ACRONYMS = new Set(['sip', 'api', 'ui', 'url', 'html', 'css', 'js', 'db', 'sql', 'onnx']); + +/** + * Capitalizes a phrase to Title Case, preserving common acronyms in uppercase. + */ +export function toTitleCase(phrase: string): string { + return phrase + .split(' ') + .map((word) => { + const lower = word.toLowerCase(); + if (ACRONYMS.has(lower)) { + return word.toUpperCase(); + } + return word.charAt(0).toUpperCase() + word.slice(1); + }) + .join(' '); +} + +/** + * Checks if two phrases share any words (case-insensitive). + */ +export function shareWords(phraseA: string, phraseB: string): boolean { + const wordsA = new Set(phraseA.toLowerCase().split(' ')); + const wordsB = phraseB.toLowerCase().split(' '); + return wordsB.some((w) => wordsA.has(w)); +} + +const TAXONOMY_MAPPING: { keywords: string[]; category: string }[] = [ + { + keywords: ['travel', 'flight', 'trip', 'train', 'vacation', 'backpacking', 'itinerary', 'packing', 'flights'], + category: 'Travel', + }, + { + keywords: [ + 'fund', + 'stock', + 'invest', + 'portfolio', + 'finance', + 'saving', + 'tax', + 'sip', + 'lump', + 'stocks', + 'funds', + 'investment', + 'investments', + ], + category: 'Investment', + }, + { + keywords: ['prep', 'smoothie', 'protein', 'macro', 'macros', 'diet', 'nutrition', 'meal'], + category: 'Meal Prep', + }, + { + keywords: [ + 'recipe', + 'recipes', + 'starter', + 'sourdough', + 'flour', + 'baking', + 'bread', + 'banana', + 'pasta', + 'skillet', + 'cook', + 'cooking', + 'kitchen', + ], + category: 'Recipes', + }, + { + keywords: [ + 'workout', + 'overload', + 'stretch', + 'stretching', + 'routine', + 'pain', + 'fitness', + 'exercise', + 'gym', + 'cardio', + 'back', + 'sitting', + ], + category: 'Workout', + }, + { + keywords: [ + 'code', + 'program', + 'javascript', + 'typescript', + 'node', + 'git', + 'docker', + 'graphql', + 'rest', + 'api', + 'jest', + 'test', + 'error', + 'request', + 'programming', + 'software', + 'developer', + ], + category: 'Programming', + }, + { + keywords: [ + 'psychology', + 'money', + 'meaning', + 'philosophy', + 'ravikant', + 'almanack', + 'book', + 'quotes', + 'thoughts', + 'reading', + 'naval', + ], + category: 'Books & Philosophy', + }, +]; + +/** + * Checks the top 3 ngrams of a cluster against a static taxonomy to match common topics. + */ +export function getTaxonomyCategory(scores: { ngram: string; score: number }[]): string | null { + const candidates = scores.slice(0, 3).map((s) => s.ngram.toLowerCase()); + + for (const cand of candidates) { + const words = cand.split(' '); + for (const mapping of TAXONOMY_MAPPING) { + for (const keyword of mapping.keywords) { + if (words.includes(keyword) || cand === keyword) { + return mapping.category; + } + } + } } + return null; } /** - * Enriches benchmark results with extracted TF-IDF tags for each cluster. + * Generates a descriptive name for a cluster using the scoring list and clusterId. + */ +export function generateClusterName(scores: { ngram: string; score: number }[], clusterId: number): string { + const filteredScores = filterDemotedUnigrams(scores); + + if (filteredScores.length === 0 || filteredScores[0].score <= 0) { + return clusterId % 2 === 0 ? 'General' : 'Miscellaneous'; + } + + // Try matching against high-level taxonomy first + const taxonomyCategory = getTaxonomyCategory(filteredScores); + if (taxonomyCategory) { + return taxonomyCategory; + } + + const top1 = filteredScores[0]; + let top2: { ngram: string; score: number } | undefined; + + // Find the next highest-scoring phrase that doesn't share any words with the first phrase + for (let i = 1; i < filteredScores.length; i++) { + if (filteredScores[i].score <= 0) break; + if (!shareWords(top1.ngram, filteredScores[i].ngram)) { + top2 = filteredScores[i]; + break; + } + } + + // Join them if the second has at least 60% of the score of the first + if (top2 && top2.score >= top1.score * 0.6) { + return `${toTitleCase(top1.ngram)} & ${toTitleCase(top2.ngram)}`; + } + + return toTitleCase(top1.ngram); +} + +/** + * Enriches benchmark results with extracted TF-IDF tags and cluster names for each cluster. * * Builds the TF-IDF corpus from all pipeline documents once, then iterates - * over each strategy result to extract the top tags per cluster. + * over each strategy result to extract the top tags and generated names per cluster. * * @param results Benchmark results from the clustering pipeline * @param documents All note documents used in the pipeline (same order as noteVectors) @@ -461,6 +843,7 @@ export function enrichResultsWithTags(results: BenchmarkResult[], documents: Doc for (const result of results) { const tags: { [clusterId: number]: string[] } = {}; + const clusterNames: { [clusterId: number]: string } = {}; const clusterIndices: { [clusterId: number]: number[] } = {}; result.assignments.forEach((clusterId, noteIdx) => { @@ -472,14 +855,54 @@ export function enrichResultsWithTags(results: BenchmarkResult[], documents: Doc } }); + // Cache ngram scores to avoid recomputation during collision resolution + const cachedScores: { [clusterId: number]: { ngram: string; score: number }[] } = {}; + for (const clusterIdStr of Object.keys(clusterIndices)) { const clusterId = Number(clusterIdStr); const indices = clusterIndices[clusterId]; const clusterDocuments = indices.map((idx) => documents[idx]); - tags[clusterId] = tfidfExtractor.extractClusterTags(clusterDocuments, topK); + const ngramScores = tfidfExtractor.extractClusterNgramsWithScores(clusterDocuments); + cachedScores[clusterId] = ngramScores; + + tags[clusterId] = selectDedupedTags(ngramScores, topK); + clusterNames[clusterId] = generateClusterName(ngramScores, clusterId); + } + + // Count occurrences of each mapped name to identify collisions (e.g. multiple "Recipes" sections) + const nameCounts: { [name: string]: number } = {}; + for (const idStr of Object.keys(clusterNames)) { + const name = clusterNames[Number(idStr)]; + nameCounts[name] = (nameCounts[name] || 0) + 1; + } + + // Resolve duplicates by appending the cluster's top-scoring candidate keyword in parentheses + const usedNames = new Set( + Object.values(clusterNames).filter((name) => nameCounts[name] === 1), + ); + + for (const idStr of Object.keys(clusterNames)) { + const id = Number(idStr); + const name = clusterNames[id]; + if (nameCounts[name] > 1) { + const filteredScores = filterDemotedUnigrams(cachedScores[id]); + if (filteredScores.length > 0 && filteredScores[0].score > 0) { + const subTopic = toTitleCase(filteredScores[0].ngram); + let resolved = `${name} (${subTopic})`; + // Guard against re-collision: append numeric suffix if still duplicate + if (usedNames.has(resolved)) { + let suffix = 2; + while (usedNames.has(`${resolved} ${suffix}`)) suffix++; + resolved = `${resolved} ${suffix}`; + } + clusterNames[id] = resolved; + usedNames.add(resolved); + } + } } result.tags = tags; + result.clusterNames = clusterNames; } } diff --git a/src/pipeline/nativeEmbeddingPipeline.ts b/src/pipeline/nativeEmbeddingPipeline.ts new file mode 100644 index 0000000..761d9e4 --- /dev/null +++ b/src/pipeline/nativeEmbeddingPipeline.ts @@ -0,0 +1,71 @@ +import joplin from 'api'; +import { log } from '../utils/logger'; + +export interface NativeEmbeddingChunk { + noteId: string; + chunkIndex: number; + chunkText: string; + vector: number[]; +} + +/** + * Checks if Joplin's native AI indexing is active and ready. + */ +export const isNativeAiReady = async (): Promise => { + try { + const status = await (joplin as any).ai.getIndexStatus(); + const ready = !!(status && status.ready); + log(`Native AI check - state: ${status?.state}, ready: ${ready}, modelId: ${status?.modelId}`); + return ready; + } catch (err: any) { + log('Native AI check failed:', err.message); + return false; + } +}; + +/** + * Pages through Joplin's native index to fetch raw embedding vectors for the requested notes. + */ +export const fetchNativeEmbeddings = async (noteIds: string[]): Promise => { + if (noteIds.length === 0) return []; + + log(`Fetching native embeddings for ${noteIds.length} notes...`); + const chunks: NativeEmbeddingChunk[] = []; + const BATCH_SIZE = 500; + let modelId: string | null = null; + + for (let i = 0; i < noteIds.length; i += BATCH_SIZE) { + const batchIds = noteIds.slice(i, i + BATCH_SIZE); + let cursor: string | undefined; + const seenCursors = new Set(); + + do { + const page = await (joplin as any).ai.getEmbeddings({ + noteIds: batchIds, + cursor, + limit: 1000, + }); + + if (!page || !Array.isArray(page.chunks)) { + throw new Error('Invalid response from Joplin native getEmbeddings API'); + } + + if (modelId && page.modelId !== modelId) { + throw new Error('Embedding model changed mid-fetch. Please restart.'); + } + modelId = page.modelId; + chunks.push(...page.chunks); + cursor = page.nextCursor; + + if (cursor) { + if (seenCursors.has(cursor)) { + throw new Error('Detected duplicate cursor in pagination, aborting to prevent infinite loop.'); + } + seenCursors.add(cursor); + } + } while (cursor); + } + + log(`Successfully fetched ${chunks.length} embedding chunks`); + return chunks; +}; diff --git a/src/pipeline/pipelineConfig.ts b/src/pipeline/pipelineConfig.ts new file mode 100644 index 0000000..cd67e56 --- /dev/null +++ b/src/pipeline/pipelineConfig.ts @@ -0,0 +1,23 @@ +import { CategorizationConfig } from '../types/cluster'; + +/** Dimensionality of embedding vectors (all-MiniLM-L6-v2 / multilingual-e5-small). */ +export const EMBEDDING_DIM = 384; + +export function isValidEmbeddingVector(vector: number[] | undefined | null): boolean { + if (!vector) return false; + if (vector.length !== EMBEDDING_DIM) return false; + return vector.every((v) => v !== null && !Number.isNaN(v)); +} + +export const DEFAULT_CONFIG: CategorizationConfig = { + seed: 42, + metric: 'cosine', + intermediateDim: 8, + intermediateNeighbors: 5, + strategies: [ + { name: 'kmeans-6', algorithm: 'kmeans', K: 6 }, + { name: 'kmedoids-6', algorithm: 'kmedoids', K: 6 }, + { name: 'hdbscan-tuned', algorithm: 'hdbscan', minClusterSize: 4, minSamples: 1 }, + { name: 'hdbscan-conservative', algorithm: 'hdbscan', minClusterSize: 3, minSamples: 2 }, + ], +}; diff --git a/src/pipeline/runPipeline.ts b/src/pipeline/runPipeline.ts index dd964f8..62bc63b 100644 --- a/src/pipeline/runPipeline.ts +++ b/src/pipeline/runPipeline.ts @@ -1,6 +1,5 @@ import { fetchAllNotes } from './noteReader'; import { benchmark } from './clustering/benchmark'; -import { CategorizationConfig } from '../types/cluster'; import { averageVectors, blendVectors, computeTitleWeight, cosineSimilarity } from './vectorAggregator'; import { NoteVector, WorkerMessage } from '../types/embed'; import { PanelNote } from '../types/panel'; @@ -9,25 +8,13 @@ import { log, logErr } from '../utils/logger'; import { getEncoding } from 'js-tiktoken'; import { VectorCache } from './vectorCache'; import { enrichResultsWithTags } from './clustering/postProcess'; +import { isNativeAiReady, fetchNativeEmbeddings } from './nativeEmbeddingPipeline'; +import { DEFAULT_CONFIG, isValidEmbeddingVector } from './pipelineConfig'; // See testEmbed.ts for rationale on cl100k_base and the 200-token limit. const enc = getEncoding('cl100k_base'); const MAX_TOKENS = 200; -const DEFAULT_CONFIG: CategorizationConfig = { - seed: 42, - metric: 'cosine', - intermediateDim: 10, - intermediateNeighbors: 15, - strategies: [ - { name: 'kmeans-5', algorithm: 'kmeans', K: 5 }, - { name: 'kmedoids-5', algorithm: 'kmedoids', K: 5 }, - { name: 'hdbscan-3', algorithm: 'hdbscan', minClusterSize: 3 }, - { name: 'hdbscan-3-ms2', algorithm: 'hdbscan', minClusterSize: 3, minSamples: 2 }, - { name: 'hdbscan-5-ms2', algorithm: 'hdbscan', minClusterSize: 5, minSamples: 2 }, - ], -}; - export interface PipelineCallbacks { onStatus: (text: string) => void; onProgress: (current: number, total: number, cached: number, skipped: number) => void; @@ -52,6 +39,76 @@ export const runPipeline = async (installDir: string, callbacks: PipelineCallbac return; } + if (notes.length < 3) { + callbacks.onError('Too few notes for clustering (need at least 3).'); + return; + } + + if (await isNativeAiReady()) { + log('Native AI Search active: using native embeddings pipeline'); + callbacks.onStatus('Fetching native embeddings...'); + + try { + const noteIds = notes.map((n) => n.id); + const chunks = await fetchNativeEmbeddings(noteIds); + + // Group chunks by noteId + const noteChunksMap = new Map(); + for (const chunk of chunks) { + const list = noteChunksMap.get(chunk.noteId) || []; + list.push(chunk.vector); + noteChunksMap.set(chunk.noteId, list); + } + + const validNotes: typeof notes = []; + const vectors: number[][] = []; + + for (const note of notes) { + const chunkVectors = noteChunksMap.get(note.id); + if (chunkVectors && chunkVectors.length > 0) { + const avgVector = averageVectors(chunkVectors); + + if (isValidEmbeddingVector(avgVector)) { + vectors.push(avgVector); + validNotes.push(note); + } else { + logErr( + `Native embedding for note "${note.title}" contains NaN/null or wrong dimension. Skipping.`, + ); + } + } + } + + log(`Grouped embeddings: found ${vectors.length} notes with valid embeddings.`); + + if (validNotes.length < 3) { + log('Too few indexed notes found in native DB. Falling back to local ONNX Web Worker.'); + } else { + callbacks.onStatus('Clustering...'); + const results = benchmark(vectors, DEFAULT_CONFIG); + + const allPipelineDocuments = validNotes.map((n) => ({ + title: n.title, + body: n.body, + })); + + enrichResultsWithTags(results, allPipelineDocuments); + + const panelNotes: PanelNote[] = validNotes.map((n) => ({ + noteId: n.id, + title: n.title, + })); + + callbacks.onComplete(results, panelNotes); + return; + } + } catch (err: any) { + logErr('Failed to run native embeddings pipeline:', err.message); + } + } + + log('Native AI Search unavailable: falling back to local ONNX Web Worker'); + const cache = await VectorCache.create(); // Remove notes from cache that are no longer in Joplin @@ -142,17 +199,23 @@ export const runPipeline = async (installDir: string, callbacks: PipelineCallbac const cachedItem = await cache.getItem(note.id); if (cachedItem && cachedItem.metadata.hash === currentNoteHash) { - log(`[${currentNoteIndex + 1}/${notes.length}] cache hit for "${note.title.slice(0, 30)}"`); - noteVectors.push({ - noteId: note.id, - title: note.title, - vector: cachedItem.vector, - titleWeight: cachedItem.metadata.titleWeight ?? 0, - }); - cachedCount++; - currentNoteIndex++; - reportProgress(); - continue; + if (isValidEmbeddingVector(cachedItem.vector)) { + log(`[${currentNoteIndex + 1}/${notes.length}] cache hit for "${note.title.slice(0, 30)}"`); + noteVectors.push({ + noteId: note.id, + title: note.title, + vector: cachedItem.vector, + titleWeight: cachedItem.metadata.titleWeight ?? 0, + }); + cachedCount++; + currentNoteIndex++; + reportProgress(); + continue; + } else { + log( + `[${currentNoteIndex + 1}/${notes.length}] cache invalid (contains null/NaN) for "${note.title.slice(0, 30)}"`, + ); + } } break; diff --git a/src/types/cluster.ts b/src/types/cluster.ts index 6365583..a2b9271 100644 --- a/src/types/cluster.ts +++ b/src/types/cluster.ts @@ -43,4 +43,6 @@ export interface BenchmarkResult { timeMs: number; /** Extracted tags for each cluster, keyed by cluster ID. Outliers (-1) are excluded. */ tags?: { [clusterId: number]: string[] }; + /** Generated name for each cluster, keyed by cluster ID. Outliers (-1) are excluded. */ + clusterNames?: { [clusterId: number]: string }; } diff --git a/src/webview/components/ClusterCard.tsx b/src/webview/components/ClusterCard.tsx index 334c0cf..03b06c7 100644 --- a/src/webview/components/ClusterCard.tsx +++ b/src/webview/components/ClusterCard.tsx @@ -7,15 +7,49 @@ interface ClusterCardProps { notes: PanelNote[]; isNoise?: boolean; tags?: string[]; + onRename?: (newName: string) => void; } -export const ClusterCard: React.FC = ({ title, noteIndices, notes, isNoise, tags }) => { +export const ClusterCard: React.FC = ({ title, noteIndices, notes, isNoise, tags, onRename }) => { const [isExpanded, setIsExpanded] = React.useState(false); + const [isEditing, setIsEditing] = React.useState(false); + const [editValue, setEditValue] = React.useState(title); + + React.useEffect(() => { + setEditValue(title); + }, [title]); const handleHeaderClick = () => { setIsExpanded((prev) => !prev); }; + const handleEditClick = (e: React.MouseEvent) => { + e.stopPropagation(); + setIsEditing(true); + }; + + const handleSave = (e?: React.FormEvent | React.FocusEvent) => { + if (e) { + e.stopPropagation(); + if ('preventDefault' in e) e.preventDefault(); + } + const trimmed = editValue.trim(); + if (trimmed && trimmed !== title && onRename) { + onRename(trimmed); + } + setIsEditing(false); + }; + + const handleKeyDown = (e: React.KeyboardEvent) => { + e.stopPropagation(); + if (e.key === 'Enter') { + handleSave(); + } else if (e.key === 'Escape') { + setEditValue(title); + setIsEditing(false); + } + }; + const handleNoteClick = (noteId: string) => { webviewApi.postMessage({ type: 'openNote', noteId }); }; @@ -27,7 +61,39 @@ export const ClusterCard: React.FC = ({ title, noteIndices, no
- {title} + {isEditing ? ( + setEditValue(e.target.value)} + onBlur={handleSave} + onKeyDown={handleKeyDown} + onClick={(e) => e.stopPropagation()} + autoFocus + /> + ) : ( +
+ {title} + {!isNoise && onRename && ( + + )} +
+ )} {tags && tags.length > 0 && (
{tags.map((tag, idx) => ( diff --git a/src/webview/context/AppStateContext.tsx b/src/webview/context/AppStateContext.tsx index 220b326..696bbdf 100644 --- a/src/webview/context/AppStateContext.tsx +++ b/src/webview/context/AppStateContext.tsx @@ -17,6 +17,7 @@ interface AppStateContextType { runPipeline: () => void; changeStrategy: (index: number) => void; setView: (view: ViewType) => void; + updateClusterName: (clusterId: number, newName: string) => void; } const AppStateContext = React.createContext(undefined); @@ -125,6 +126,20 @@ export const AppStateProvider: React.FC<{ children: React.ReactNode }> = ({ chil setActiveView(view); }; + const updateClusterName = (clusterId: number, newName: string) => { + setStrategies((prev) => { + const next = [...prev]; + if (next[selectedStrategyIndex]) { + const strat = { ...next[selectedStrategyIndex] }; + const newClusterNames = { ...strat.clusterNames }; + newClusterNames[clusterId] = newName; + strat.clusterNames = newClusterNames; + next[selectedStrategyIndex] = strat; + } + return next; + }); + }; + return ( = ({ chil runPipeline, changeStrategy, setView, + updateClusterName, }} > {children} diff --git a/src/webview/pages/DashboardPage.tsx b/src/webview/pages/DashboardPage.tsx index 5dc3e35..0c008d0 100644 --- a/src/webview/pages/DashboardPage.tsx +++ b/src/webview/pages/DashboardPage.tsx @@ -5,7 +5,8 @@ import { StrategySection } from '../components/StrategySection'; import { ClusterCard } from '../components/ClusterCard'; export const DashboardPage: React.FC = () => { - const { isRunning, runPipeline, strategies, selectedStrategyIndex, changeStrategy, notes } = useAppState(); + const { isRunning, runPipeline, strategies, selectedStrategyIndex, changeStrategy, notes, updateClusterName } = + useAppState(); const selectedStrategy = strategies[selectedStrategyIndex]; @@ -43,10 +44,11 @@ export const DashboardPage: React.FC = () => { {sortedClusterIds.map((id, idx) => ( updateClusterName(id, newName)} /> ))} {noise.length > 0 && ( diff --git a/src/webview/panel.css b/src/webview/panel.css index 18b1020..0b7862c 100644 --- a/src/webview/panel.css +++ b/src/webview/panel.css @@ -443,3 +443,46 @@ body { .config-card-item { margin: 4px 0; } + +/* --- Cluster Title Editing --- */ + +.cluster-title-container { + display: inline-flex; + align-items: center; + gap: 6px; + max-width: 100%; +} + +.cluster-edit-btn { + background: transparent; + border: none; + padding: 2px 4px; + border-radius: 4px; + cursor: pointer; + color: var(--joplin-color); + opacity: 0.4; + display: inline-flex; + align-items: center; + justify-content: center; + transition: opacity 0.15s, background-color 0.15s; +} + +.cluster-edit-btn:hover { + opacity: 0.9; + background-color: var(--joplin-divider-color); +} + +.cluster-title-input { + font-size: 0.9em; + font-weight: 600; + font-family: inherit; + padding: 2px 6px; + border: 1px solid var(--accent); + border-radius: 4px; + background: var(--joplin-background-color); + color: var(--joplin-color); + outline: none; + min-width: 120px; + max-width: 200px; + width: 100%; +} diff --git a/src/worker/embedWorker.ts b/src/worker/embedWorker.ts index 9b5752c..ce0f585 100644 --- a/src/worker/embedWorker.ts +++ b/src/worker/embedWorker.ts @@ -14,12 +14,25 @@ const POOLING = 'mean' as const; env.backends.onnx.wasm!.wasmPaths = '../onnx-dist/'; let embedder: any = null; +let selectedDevice: any = 'wasm'; +let selectedDtype: any = 'q8'; + +const loadWasmFallback = async () => { + selectedDevice = 'wasm'; + selectedDtype = 'q8'; + embedder = await pipeline('feature-extraction', MODEL_ID, { + dtype: selectedDtype, + device: selectedDevice, + }); + const result = await embedder('warmup text', { pooling: POOLING, normalize: true }); + if (result && result.data && result.data.some((v: number) => isNaN(v))) { + throw new Error('WASM fallback warmup returned NaN values'); + } +}; const loadModel = async () => { const t0 = performance.now(); - let selectedDevice: any = 'wasm'; - let selectedDtype: any = 'q8'; let workerGpuExists = false; let adapterFound = false; @@ -43,17 +56,13 @@ const loadModel = async () => { dtype: selectedDtype, device: selectedDevice, }); - await embedder('warmup text', { pooling: POOLING, normalize: true }); + const warmupResult = await embedder('warmup text', { pooling: POOLING, normalize: true }); + if (warmupResult && warmupResult.data && warmupResult.data.some((v: number) => isNaN(v))) { + throw new Error('Warmup returned NaN values (WebGPU fp16 numeric instability)'); + } } catch (e) { if (selectedDevice === 'webgpu') { - // WebGPU pipeline or warmup failed, retry with WASM/q8 - selectedDevice = 'wasm'; - selectedDtype = 'q8'; - embedder = await pipeline('feature-extraction', MODEL_ID, { - dtype: selectedDtype, - device: selectedDevice, - }); - await embedder('warmup text', { pooling: POOLING, normalize: true }); + await loadWasmFallback(); } else { throw e; } @@ -70,16 +79,30 @@ const loadModel = async () => { }; }; -const embed = async (text: string) => { +const embed = async (text: string): Promise<{ inferenceTime: number; dimensions: number; embedding: number[] }> => { if (!embedder) throw new Error('Model not loaded'); - const t0 = performance.now(); - const output = await embedder(text, { pooling: POOLING, normalize: true }); - const inferenceTime = performance.now() - t0; - const dimensions = output.data.length; - const embedding = Array.from(output.data as Float32Array); + try { + const t0 = performance.now(); + const output = await embedder(text, { pooling: POOLING, normalize: true }); + const inferenceTime = performance.now() - t0; + const dimensions = output.data.length; + const embedding = Array.from(output.data as Float32Array); + + if (embedding.some((v) => isNaN(v))) { + throw new Error('Inference returned NaN values'); + } - return { inferenceTime, dimensions, embedding }; + return { inferenceTime, dimensions, embedding }; + } catch (e: any) { + if (selectedDevice === 'webgpu') { + console.warn('WebGPU inference failed or returned NaN. Falling back to WASM/q8 dynamically...', e); + await loadWasmFallback(); + return await embed(text); + } else { + throw e; + } + } }; self.addEventListener('message', async (event) => { From c6b34a5c49e085c77a37edd6fc653e31f61737cc Mon Sep 17 00:00:00 2001 From: Harsh16gupta Date: Sat, 4 Jul 2026 17:01:08 +0530 Subject: [PATCH 2/2] style: format postProcess.ts with prettier --- src/pipeline/clustering/postProcess.ts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/pipeline/clustering/postProcess.ts b/src/pipeline/clustering/postProcess.ts index e33060d..15f2845 100644 --- a/src/pipeline/clustering/postProcess.ts +++ b/src/pipeline/clustering/postProcess.ts @@ -878,9 +878,7 @@ export function enrichResultsWithTags(results: BenchmarkResult[], documents: Doc } // Resolve duplicates by appending the cluster's top-scoring candidate keyword in parentheses - const usedNames = new Set( - Object.values(clusterNames).filter((name) => nameCounts[name] === 1), - ); + const usedNames = new Set(Object.values(clusterNames).filter((name) => nameCounts[name] === 1)); for (const idStr of Object.keys(clusterNames)) { const id = Number(idStr);