Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"name": "joplin-note-categorization-plugin",
"name": "joplin-plugin-note-categorization",
"version": "1.0.0",
"scripts": {
"dist": "webpack --env joplin-plugin-config=buildMain && webpack --env joplin-plugin-config=buildExtraScripts && npm run copyAssets && webpack --env joplin-plugin-config=createArchive",
Expand Down
52 changes: 25 additions & 27 deletions src/commands/testEmbed.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import { fetchAllNotes } from '../pipeline/noteReader';
import { benchmark } from '../pipeline/clustering/benchmark';
import { CategorizationConfig } from '../types/cluster';
import { averageVectors, blendVectors, computeTitleWeight, cosineSimilarity } from '../pipeline/vectorAggregator';
import { NoteVector, WorkerMessage } from '../types/embed';
import { isGenericTitle } from '../utils/titleFilter';
import { log, logErr } from '../utils/logger';
import { getEncoding } from 'js-tiktoken';
import { VectorCache } from '../pipeline/vectorCache';
import { enrichResultsWithTags } from '../pipeline/clustering/postProcess';
import { DEFAULT_CONFIG, isValidEmbeddingVector } from '../pipeline/pipelineConfig';

// We use cl100k_base to approximate token counts for chunking.
// The embedding model (all-MiniLM-L6-v2) uses a WordPiece tokenizer with a
Expand Down Expand Up @@ -117,16 +117,22 @@ export const runTestEmbed = async (installDir: string) => {
const cachedItem = await cache.getItem(note.id);

if (cachedItem && cachedItem.metadata.hash === currentNoteHash) {
log(`[${currentNoteIndex + 1}/${notes.length}] cache hit for "${note.title.slice(0, 30)}"`);
noteVectors.push({
noteId: note.id,
title: note.title,
vector: cachedItem.vector,
titleWeight: cachedItem.metadata.titleWeight ?? 0,
});
cachedCount++;
currentNoteIndex++;
continue;
if (isValidEmbeddingVector(cachedItem.vector)) {
log(`[${currentNoteIndex + 1}/${notes.length}] cache hit for "${note.title.slice(0, 30)}"`);
noteVectors.push({
noteId: note.id,
title: note.title,
vector: cachedItem.vector,
titleWeight: cachedItem.metadata.titleWeight ?? 0,
});
cachedCount++;
currentNoteIndex++;
continue;
} else {
log(
`[${currentNoteIndex + 1}/${notes.length}] cache invalid (contains null/NaN) for "${note.title.slice(0, 30)}"`,
);
}
}

break;
Expand All @@ -152,23 +158,9 @@ export const runTestEmbed = async (installDir: string) => {
// ── Clustering Benchmark ─────────────────────────────
// Edit this config to compare different algorithms and dimensions.
// Results are printed as a comparison table in the console.
const clusterConfig: CategorizationConfig = {
seed: 42,
metric: 'cosine',
intermediateDim: 10,
intermediateNeighbors: 15,
strategies: [
{ name: 'kmeans-5', algorithm: 'kmeans', K: 5 },
{ name: 'kmedoids-5', algorithm: 'kmedoids', K: 5 },
{ name: 'hdbscan-3', algorithm: 'hdbscan', minClusterSize: 3 },
{ name: 'hdbscan-3-ms2', algorithm: 'hdbscan', minClusterSize: 3, minSamples: 2 },
{ name: 'hdbscan-5-ms2', algorithm: 'hdbscan', minClusterSize: 5, minSamples: 2 },
],
};

if (noteVectors.length >= 3) {
const vectors = noteVectors.map((nv) => nv.vector);
const results = benchmark(vectors, clusterConfig);
const results = benchmark(vectors, DEFAULT_CONFIG);

const notesMap = new Map(notes.map((n) => [n.id, n]));
const allPipelineDocuments = noteVectors.map((nv) => {
Expand All @@ -191,7 +183,13 @@ export const runTestEmbed = async (installDir: string) => {
clusterNotes.get(c)!.push(noteVectors[i].title);
}
for (const [clusterId, titles] of clusterNotes) {
const label = clusterId < 0 ? 'Noise/Outliers' : `Cluster ${clusterId}`;
const generatedName = res.clusterNames?.[clusterId];
const label =
clusterId < 0
? 'Noise/Outliers'
: generatedName
? `${generatedName} (Cluster ${clusterId})`
: `Cluster ${clusterId}`;
const clusterTags = res.tags?.[clusterId] ? ` [Tags: ${res.tags[clusterId].join(', ')}]` : '';
log(` ${label} (${titles.length} notes)${clusterTags}:`);
for (const title of titles) {
Expand Down
51 changes: 41 additions & 10 deletions src/pipeline/UmapProjector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,40 @@ export class UmapProjector {
}

/**
* Projects high-dimensional vectors to a lower-dimensional space using UMAP.
* @param vectors N vectors of dimension D (N x D)
* @returns N vectors of dimension nComponents
* Projects vectors to a lower-dimensional space using UMAP.
*
* In distance-matrix mode, `vectors` must be index singletons `[[0], [1], ...]`
* because umap-js requires a vectors array to call distanceFn(a, b).
* We encode each point's index as its sole coordinate so the custom distanceFn
* can look up precomputed distances via `distanceMatrix[a[0]][b[0]]`.
*/
public project(vectors: number[][]): number[][] {
public project(vectors: number[][], distanceMatrix?: number[][]): number[][] {
if (vectors.length === 0) return [];

const dim = vectors[0].length;
for (let i = 0; i < vectors.length; i++) {
if (vectors[i].length !== dim) {
throw new Error(`Vector at index ${i} has dimension ${vectors[i].length}, expected ${dim}`);
if (distanceMatrix) {
const n = vectors.length;
if (distanceMatrix.length !== n) {
throw new Error(`Distance matrix size (${distanceMatrix.length}) does not match vectors count (${n})`);
}
for (let i = 0; i < n; i++) {
if (vectors[i].length !== 1) {
throw new Error(
`Vector at index ${i} has dimension ${vectors[i].length}, expected 1 (index singleton)`,
);
}
const idx = vectors[i][0];
if (idx < 0 || idx >= n || !Number.isInteger(idx)) {
throw new Error(
`Vector index at position ${i} is invalid: ${idx}. Must be an integer between 0 and ${n - 1}.`,
);
}
}
} else {
const dim = vectors[0].length;
for (let i = 0; i < vectors.length; i++) {
if (vectors[i].length !== dim) {
throw new Error(`Vector at index ${i} has dimension ${vectors[i].length}, expected ${dim}`);
}
}
}

Expand All @@ -46,7 +69,14 @@ export class UmapProjector {

// nNeighbors must be less than the number of data points
const nNeighbors = Math.max(2, Math.min(this.nNeighbors, vectors.length - 1));
const distanceFn = this.metric === 'euclidean' ? euclideanDistance : cosineDistance;

// When using a precomputed distance matrix, vectors are index singletons [i].
// The distanceFn extracts indices to look up the precomputed distance.
const distanceFn = distanceMatrix
? (a: number[], b: number[]) => distanceMatrix[a[0]][b[0]]
: this.metric === 'euclidean'
? euclideanDistance
: cosineDistance;

const umap = new UMAP({
nComponents: this.nComponents,
Expand All @@ -57,7 +87,8 @@ export class UmapProjector {
});

log(
`UMAP: projecting ${vectors.length} vectors (${dim}D → ${this.nComponents}D), ` +
`UMAP: projecting ${vectors.length} vectors ` +
`${distanceMatrix ? '(using precomputed distance matrix)' : `(${vectors[0].length}D)`}${this.nComponents}D, ` +
`neighbors=${nNeighbors}, seed=${this.seed}`,
);

Expand Down
20 changes: 18 additions & 2 deletions src/pipeline/clustering/benchmark.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,11 @@ function logBenchmarkTable(results: BenchmarkResult[]): void {
* @param config Categorization config with strategies to benchmark
* @returns Benchmark results sorted by silhouette score (descending)
*/
export function benchmark(vectors: number[][], config: CategorizationConfig): BenchmarkResult[] {
export function benchmark(
vectors: number[][],
config: CategorizationConfig,
distanceMatrix?: number[][],
): BenchmarkResult[] {
if (vectors.length === 0) {
log('No vectors to cluster.');
return [];
Expand All @@ -92,7 +96,19 @@ export function benchmark(vectors: number[][], config: CategorizationConfig): Be

// Optionally reduce dimensionality before clustering
let clusteringVectors = vectors;
if (config.intermediateDim !== null) {
if (distanceMatrix) {
// Clustering algos need coordinate vectors, not just pairwise distances.
// UMAP projects the distance matrix into coordinate space (default 10D).
const dim = config.intermediateDim ?? 10;
log(`Native mode: projecting distance matrix to ${dim}D coordinates for clustering...`);
const projector = new UmapProjector({
nComponents: dim,
nNeighbors: config.intermediateNeighbors,
metric: config.metric,
seed: config.seed,
});
clusteringVectors = projector.project(vectors, distanceMatrix);
} else if (config.intermediateDim !== null) {
log(`Reducing ${vectors[0].length}D → ${config.intermediateDim}D for clustering...`);
const projector = new UmapProjector({
nComponents: config.intermediateDim,
Expand Down
Loading
Loading