joplin · Harsh16gupta · Jul 4, 2026 · Jul 4, 2026
diff --git a/package.json b/package.json
@@ -1,5 +1,5 @@
 {
-  "name": "joplin-note-categorization-plugin",
+  "name": "joplin-plugin-note-categorization",
   "version": "1.0.0",
   "scripts": {
     "dist": "webpack --env joplin-plugin-config=buildMain && webpack --env joplin-plugin-config=buildExtraScripts && npm run copyAssets && webpack --env joplin-plugin-config=createArchive",

diff --git a/src/commands/testEmbed.ts b/src/commands/testEmbed.ts
@@ -1,13 +1,13 @@
 import { fetchAllNotes } from '../pipeline/noteReader';
 import { benchmark } from '../pipeline/clustering/benchmark';
-import { CategorizationConfig } from '../types/cluster';
 import { averageVectors, blendVectors, computeTitleWeight, cosineSimilarity } from '../pipeline/vectorAggregator';
 import { NoteVector, WorkerMessage } from '../types/embed';
 import { isGenericTitle } from '../utils/titleFilter';
 import { log, logErr } from '../utils/logger';
 import { getEncoding } from 'js-tiktoken';
 import { VectorCache } from '../pipeline/vectorCache';
 import { enrichResultsWithTags } from '../pipeline/clustering/postProcess';
+import { DEFAULT_CONFIG, isValidEmbeddingVector } from '../pipeline/pipelineConfig';
 
 // We use cl100k_base to approximate token counts for chunking.
 // The embedding model (all-MiniLM-L6-v2) uses a WordPiece tokenizer with a
@@ -117,16 +117,22 @@ export const runTestEmbed = async (installDir: string) => {
 			const cachedItem = await cache.getItem(note.id);
 
 			if (cachedItem && cachedItem.metadata.hash === currentNoteHash) {
-				log(`[${currentNoteIndex + 1}/${notes.length}] cache hit for "${note.title.slice(0, 30)}"`);
-				noteVectors.push({
-					noteId: note.id,
-					title: note.title,
-					vector: cachedItem.vector,
-					titleWeight: cachedItem.metadata.titleWeight ?? 0,
-				});
-				cachedCount++;
-				currentNoteIndex++;
-				continue;
+				if (isValidEmbeddingVector(cachedItem.vector)) {
+					log(`[${currentNoteIndex + 1}/${notes.length}] cache hit for "${note.title.slice(0, 30)}"`);
+					noteVectors.push({
+						noteId: note.id,
+						title: note.title,
+						vector: cachedItem.vector,
+						titleWeight: cachedItem.metadata.titleWeight ?? 0,
+					});
+					cachedCount++;
+					currentNoteIndex++;
+					continue;
+				} else {
+					log(
+						`[${currentNoteIndex + 1}/${notes.length}] cache invalid (contains null/NaN) for "${note.title.slice(0, 30)}"`,
+					);
+				}
 			}
 
 			break;
@@ -152,23 +158,9 @@ export const runTestEmbed = async (installDir: string) => {
 			// ── Clustering Benchmark ─────────────────────────────
 			// Edit this config to compare different algorithms and dimensions.
 			// Results are printed as a comparison table in the console.
-			const clusterConfig: CategorizationConfig = {
-				seed: 42,
-				metric: 'cosine',
-				intermediateDim: 10,
-				intermediateNeighbors: 15,
-				strategies: [
-					{ name: 'kmeans-5', algorithm: 'kmeans', K: 5 },
-					{ name: 'kmedoids-5', algorithm: 'kmedoids', K: 5 },
-					{ name: 'hdbscan-3', algorithm: 'hdbscan', minClusterSize: 3 },
-					{ name: 'hdbscan-3-ms2', algorithm: 'hdbscan', minClusterSize: 3, minSamples: 2 },
-					{ name: 'hdbscan-5-ms2', algorithm: 'hdbscan', minClusterSize: 5, minSamples: 2 },
-				],
-			};
-
 			if (noteVectors.length >= 3) {
 				const vectors = noteVectors.map((nv) => nv.vector);
-				const results = benchmark(vectors, clusterConfig);
+				const results = benchmark(vectors, DEFAULT_CONFIG);
 
 				const notesMap = new Map(notes.map((n) => [n.id, n]));
 				const allPipelineDocuments = noteVectors.map((nv) => {
@@ -191,7 +183,13 @@ export const runTestEmbed = async (installDir: string) => {
 						clusterNotes.get(c)!.push(noteVectors[i].title);
 					}
 					for (const [clusterId, titles] of clusterNotes) {
-						const label = clusterId < 0 ? 'Noise/Outliers' : `Cluster ${clusterId}`;
+						const generatedName = res.clusterNames?.[clusterId];
+						const label =
+							clusterId < 0
+								? 'Noise/Outliers'
+								: generatedName
+									? `${generatedName} (Cluster ${clusterId})`
+									: `Cluster ${clusterId}`;
 						const clusterTags = res.tags?.[clusterId] ? ` [Tags: ${res.tags[clusterId].join(', ')}]` : '';
 						log(`  ${label} (${titles.length} notes)${clusterTags}:`);
 						for (const title of titles) {

diff --git a/src/pipeline/UmapProjector.ts b/src/pipeline/UmapProjector.ts
@@ -20,17 +20,40 @@ export class UmapProjector {
 	}
 
 	/**
-	 * Projects high-dimensional vectors to a lower-dimensional space using UMAP.
-	 * @param vectors N vectors of dimension D (N x D)
-	 * @returns N vectors of dimension nComponents
+	 * Projects vectors to a lower-dimensional space using UMAP.
+	 *
+	 * In distance-matrix mode, `vectors` must be index singletons `[[0], [1], ...]`
+	 * because umap-js requires a vectors array to call distanceFn(a, b).
+	 * We encode each point's index as its sole coordinate so the custom distanceFn
+	 * can look up precomputed distances via `distanceMatrix[a[0]][b[0]]`.
 	 */
-	public project(vectors: number[][]): number[][] {
+	public project(vectors: number[][], distanceMatrix?: number[][]): number[][] {
 		if (vectors.length === 0) return [];
 
-		const dim = vectors[0].length;
-		for (let i = 0; i < vectors.length; i++) {
-			if (vectors[i].length !== dim) {
-				throw new Error(`Vector at index ${i} has dimension ${vectors[i].length}, expected ${dim}`);
+		if (distanceMatrix) {
+			const n = vectors.length;
+			if (distanceMatrix.length !== n) {
+				throw new Error(`Distance matrix size (${distanceMatrix.length}) does not match vectors count (${n})`);
+			}
+			for (let i = 0; i < n; i++) {
+				if (vectors[i].length !== 1) {
+					throw new Error(
+						`Vector at index ${i} has dimension ${vectors[i].length}, expected 1 (index singleton)`,
+					);
+				}
+				const idx = vectors[i][0];
+				if (idx < 0 || idx >= n || !Number.isInteger(idx)) {
+					throw new Error(
+						`Vector index at position ${i} is invalid: ${idx}. Must be an integer between 0 and ${n - 1}.`,
+					);
+				}
+			}
+		} else {
+			const dim = vectors[0].length;
+			for (let i = 0; i < vectors.length; i++) {
+				if (vectors[i].length !== dim) {
+					throw new Error(`Vector at index ${i} has dimension ${vectors[i].length}, expected ${dim}`);
+				}
 			}
 		}
 
@@ -46,7 +69,14 @@ export class UmapProjector {
 
 		// nNeighbors must be less than the number of data points
 		const nNeighbors = Math.max(2, Math.min(this.nNeighbors, vectors.length - 1));
-		const distanceFn = this.metric === 'euclidean' ? euclideanDistance : cosineDistance;
+
+		// When using a precomputed distance matrix, vectors are index singletons [i].
+		// The distanceFn extracts indices to look up the precomputed distance.
+		const distanceFn = distanceMatrix
+			? (a: number[], b: number[]) => distanceMatrix[a[0]][b[0]]
+			: this.metric === 'euclidean'
+				? euclideanDistance
+				: cosineDistance;
 
 		const umap = new UMAP({
 			nComponents: this.nComponents,
@@ -57,7 +87,8 @@ export class UmapProjector {
 		});
 
 		log(
-			`UMAP: projecting ${vectors.length} vectors (${dim}D → ${this.nComponents}D), ` +
+			`UMAP: projecting ${vectors.length} vectors ` +
+				`${distanceMatrix ? '(using precomputed distance matrix)' : `(${vectors[0].length}D)`} → ${this.nComponents}D, ` +
 				`neighbors=${nNeighbors}, seed=${this.seed}`,
 		);
 

diff --git a/src/pipeline/clustering/benchmark.ts b/src/pipeline/clustering/benchmark.ts
@@ -82,7 +82,11 @@ function logBenchmarkTable(results: BenchmarkResult[]): void {
  * @param config   Categorization config with strategies to benchmark
  * @returns        Benchmark results sorted by silhouette score (descending)
  */
-export function benchmark(vectors: number[][], config: CategorizationConfig): BenchmarkResult[] {
+export function benchmark(
+	vectors: number[][],
+	config: CategorizationConfig,
+	distanceMatrix?: number[][],
+): BenchmarkResult[] {
 	if (vectors.length === 0) {
 		log('No vectors to cluster.');
 		return [];
@@ -92,7 +96,19 @@ export function benchmark(vectors: number[][], config: CategorizationConfig): Be
 
 	// Optionally reduce dimensionality before clustering
 	let clusteringVectors = vectors;
-	if (config.intermediateDim !== null) {
+	if (distanceMatrix) {
+		// Clustering algorithms need coordinate vectors, not just pairwise distances.
+		// UMAP embeds the distance matrix into coordinates (10D when intermediateDim is null).
+		const dim = config.intermediateDim ?? 10;
+		log(`Native mode: projecting distance matrix to ${dim}D coordinates for clustering...`);
+		const projector = new UmapProjector({
+			nComponents: dim,
+			nNeighbors: config.intermediateNeighbors,
+			metric: config.metric,
+			seed: config.seed,
+		});
+		clusteringVectors = projector.project(vectors, distanceMatrix);
+	} else if (config.intermediateDim !== null) {
 		log(`Reducing ${vectors[0].length}D → ${config.intermediateDim}D for clustering...`);
 		const projector = new UmapProjector({
 			nComponents: config.intermediateDim,