Spaces:
Running
Running
ping98k
committed on
Commit
·
12c4198
1
Parent(s):
46bbd3d
Refactor K-Means clustering implementation; modularize embedding and clustering logic, enhance heatmap and scatter plot functions, and improve cluster naming process.
Browse files- cluster_naming.js +34 -0
- clustering.js +68 -0
- embedding.js +32 -0
- main.js +33 -164
- plotting.js +35 -0
cluster_naming.js
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { AutoTokenizer, AutoModelForCausalLM } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.6.0';
|
2 |
+
import { prompt_cluster } from "./prompt_cluster.js";
|
3 |
+
|
4 |
+
// Tokenizer for "onnx-community/Qwen3-0.6B-ONNX". Top-level await: this module finishes evaluating only once the load has resolved.
const tokenizer = await AutoTokenizer.from_pretrained("onnx-community/Qwen3-0.6B-ONNX");
|
5 |
+
// Causal language model for the same checkpoint, requested with device "webgpu" and dtype "q4f16"; also loaded with top-level await.
const model = await AutoModelForCausalLM.from_pretrained("onnx-community/Qwen3-0.6B-ONNX", { device: "webgpu", dtype: "q4f16" });
|
6 |
+
|
7 |
+
/**
 * Ask the chat model for a name describing one cluster of text lines.
 *
 * The lines are joined with newlines and sent as the user turn, with
 * `prompt_cluster` as the system prompt. The whole decoded sequence (special
 * tokens kept) is then cut down: everything up to and including the last
 * `</think>` tag is dropped, and the remainder is truncated at its first
 * `<|im_end|>` marker.
 *
 * @param {string[]} lines - Text lines that belong to a single cluster.
 * @returns {Promise<string>} The trimmed name, or an empty string when
 *   `lines` is empty (no generation is attempted in that case).
 */
export async function nameCluster(lines) {
  // With no lines there is nothing to summarise; skip the generation instead
  // of letting the model invent a name for an empty prompt.
  if (lines.length === 0) {
    return "";
  }

  const joined = lines.join("\n");
  const messages = [
    { role: "system", content: prompt_cluster },
    { role: "user", content: `Input:\n${joined}\nOutput:` },
  ];
  const inputs = tokenizer.apply_chat_template(messages, {
    add_generation_prompt: true,
    return_dict: true,
    enable_thinking: false,
  });
  const outputTokens = await model.generate({
    ...inputs,
    max_new_tokens: 1024,
    do_sample: true,
    temperature: 0.6,
  });

  // Special tokens are kept on purpose: the tags below are used as anchors
  // to locate the answer inside the decoded text.
  let rawName = tokenizer.decode(outputTokens[0], { skip_special_tokens: false }).trim();
  const THINK_TAG = "</think>";
  const END_TAG = "<|im_end|>";
  if (rawName.includes(THINK_TAG)) {
    rawName = rawName.substring(rawName.lastIndexOf(THINK_TAG) + THINK_TAG.length).trim();
  }
  if (rawName.includes(END_TAG)) {
    rawName = rawName.substring(0, rawName.indexOf(END_TAG)).trim();
  }
  return rawName;
}
|
clustering.js
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { UMAP } from "https://cdn.jsdelivr.net/npm/umap-js@1.4.0/+esm";
|
2 |
+
|
3 |
+
/**
 * Partition vectors into `k` clusters with Lloyd's k-means iteration, using
 * squared Euclidean distance.
 *
 * Initial centroids are copies of distinct randomly chosen input vectors
 * (repeats are only used when `k` exceeds the number of vectors). A cluster
 * that ends an iteration empty is re-seeded with the input vector farthest
 * from every non-empty centroid, and the iteration continues so that the new
 * centroid can actually attract points.
 *
 * @param {ArrayLike<number>[]} embeddings - Input vectors, all of the same length.
 * @param {number} k - Number of clusters; must be a positive integer.
 * @param {number} [maxIter=100] - Upper bound on assignment/update rounds.
 * @returns {{ labels: number[], centroids: ArrayLike<number>[] }} `labels[i]`
 *   is the cluster index of `embeddings[i]`; `centroids[c]` is the centre of
 *   cluster `c`. Both are empty when `embeddings` is empty.
 * @throws {RangeError} If `k` is not a positive integer.
 */
export function kmeans(embeddings, k, maxIter = 100) {
  if (!Number.isInteger(k) || k < 1) {
    throw new RangeError(`k must be a positive integer, got ${k}`);
  }
  const n = embeddings.length;
  if (n === 0) {
    return { labels: [], centroids: [] };
  }
  const dim = embeddings[0].length;

  const squaredDistance = (a, b) => {
    let total = 0;
    for (let d = 0; d < dim; ++d) {
      total += (a[d] - b[d]) ** 2;
    }
    return total;
  };

  // Partial Fisher-Yates shuffle: the first `distinct` entries of `order` are
  // distinct indices, so two clusters never start on the same input vector.
  const order = Array.from({ length: n }, (_, i) => i);
  const distinct = Math.min(k, n);
  for (let i = 0; i < distinct; ++i) {
    const j = i + Math.floor(Math.random() * (n - i));
    [order[i], order[j]] = [order[j], order[i]];
  }
  let centroids = Array.from({ length: k }, (_, c) => {
    const source = c < distinct ? order[c] : Math.floor(Math.random() * n);
    return embeddings[source].slice();
  });
  const labels = new Array(n).fill(0);

  // Finds the input vector whose nearest live centroid is farthest away.
  const farthestPoint = (live) => {
    let index = 0;
    let distance = -1;
    for (let i = 0; i < n; ++i) {
      let nearest = Infinity;
      for (let c = 0; c < k; ++c) {
        if (!live[c]) continue;
        const dist = squaredDistance(embeddings[i], centroids[c]);
        if (dist < nearest) nearest = dist;
      }
      if (nearest > distance) {
        distance = nearest;
        index = i;
      }
    }
    return { index, distance };
  };

  for (let iter = 0; iter < maxIter; ++iter) {
    let changed = false;

    // Assignment step: attach every vector to its nearest centroid
    // (ties go to the lowest cluster index).
    for (let i = 0; i < n; ++i) {
      let best = 0;
      let bestDist = Infinity;
      for (let c = 0; c < k; ++c) {
        const dist = squaredDistance(embeddings[i], centroids[c]);
        if (dist < bestDist) {
          bestDist = dist;
          best = c;
        }
      }
      if (labels[i] !== best) {
        labels[i] = best;
        changed = true;
      }
    }

    // Update step: each non-empty cluster's centroid becomes the mean of its members.
    const sums = Array.from({ length: k }, () => new Array(dim).fill(0));
    const counts = new Array(k).fill(0);
    for (let i = 0; i < n; ++i) {
      const c = labels[i];
      counts[c]++;
      for (let d = 0; d < dim; ++d) {
        sums[c][d] += embeddings[i][d];
      }
    }
    const live = counts.map((count) => count > 0);
    for (let c = 0; c < k; ++c) {
      if (!live[c]) continue;
      for (let d = 0; d < dim; ++d) {
        sums[c][d] /= counts[c];
      }
    }
    centroids = sums;

    // Re-seed empty clusters only after all means are final, so distances are
    // measured against real centroids rather than partial sums or zero vectors.
    for (let c = 0; c < k; ++c) {
      if (live[c]) continue;
      const { index, distance } = farthestPoint(live);
      centroids[c] = embeddings[index].slice();
      live[c] = true;
      // A re-seeded centroid that differs from every live centroid can pull
      // points over, so another assignment round is required.
      if (distance > 0) changed = true;
    }

    if (!changed) break;
  }
  return { labels, centroids };
}
|
64 |
+
|
65 |
+
/**
 * Project vectors to two dimensions with UMAP (`minDist` fixed at 0.1).
 *
 * @param {number[][]} embeddings - Input vectors, one per point.
 * @param {number} [nNeighbors=15] - Requested neighbourhood size. It is
 *   clamped to the range [1, embeddings.length - 1], because a point cannot
 *   have more neighbours than there are other points.
 * @returns {*} Whatever `UMAP#fit` returns for the given vectors.
 */
export function runUMAP(embeddings, nNeighbors = 15) {
  const neighbours = Math.max(1, Math.min(nNeighbors, embeddings.length - 1));
  const umap = new UMAP({ nComponents: 2, nNeighbors: neighbours, minDist: 0.1 });
  return umap.fit(embeddings);
}
|
embedding.js
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.6.0';
|
2 |
+
|
3 |
+
// Feature-extraction pipeline used by the embedding helpers in this module; created once with top-level await, requesting device "webgpu" and dtype "q4f16".
const embed = await pipeline(
|
4 |
+
"feature-extraction",
|
5 |
+
"onnx-community/Qwen3-Embedding-0.6B-ONNX",
|
6 |
+
{ device: "webgpu", dtype: "q4f16" },
|
7 |
+
);
|
8 |
+
|
9 |
+
/**
 * Compute one embedding per text group by averaging the embeddings of the
 * group's lines.
 *
 * Each group is split into lines; blank lines and lines starting with `##`
 * are dropped. The remaining lines are embedded as
 * `Instruct: <task>\nQuery:<line>` prompts and averaged component-wise.
 *
 * Groups that contain no usable line are skipped, so the result may be
 * shorter than `groups`.
 *
 * @param {string[]} groups - Blocks of newline-separated text.
 * @param {string} task - Instruction text placed in front of every query.
 * @returns {Promise<Float32Array[]>} One averaged vector per non-empty group,
 *   in input order.
 */
export async function getGroupEmbeddings(groups, task) {
  const groupEmbeddings = [];
  for (const group of groups) {
    // Remove blank lines and lines starting with ##
    const lines = group
      .split(/\n/)
      .map((line) => line.trim())
      .filter((line) => line && !line.startsWith('##'));

    // Nothing to embed (e.g. a trailing blank group or a header-only group):
    // an empty batch has no first vector to read the dimension from.
    if (lines.length === 0) {
      continue;
    }

    const prompts = lines.map((s) => `Instruct: ${task}\nQuery:${s}`);
    const out = await embed(prompts, { pooling: "mean", normalize: true });
    const embeddings = typeof out.tolist === 'function' ? out.tolist() : out.data;

    const dim = embeddings[0].length;
    const avg = new Float32Array(dim);
    for (const vector of embeddings) {
      for (let i = 0; i < dim; i++) {
        avg[i] += vector[i];
      }
    }
    for (let i = 0; i < dim; i++) {
      avg[i] /= embeddings.length;
    }
    groupEmbeddings.push(avg);
  }
  return groupEmbeddings;
}
|
27 |
+
|
28 |
+
/**
 * Embed each line individually.
 *
 * Every line is wrapped as `Instruct: <task>\nQuery:<line>` and passed to the
 * embedding pipeline with mean pooling and normalisation enabled.
 *
 * @param {string[]} lines - Lines of text to embed.
 * @param {string} task - Instruction text placed in front of every query.
 * @returns {Promise<*>} `out.tolist()` when the pipeline output provides it,
 *   otherwise `out.data`; an empty array when `lines` is empty.
 */
export async function getLineEmbeddings(lines, task) {
  // Avoid sending an empty batch to the model; there is nothing to embed.
  if (lines.length === 0) {
    return [];
  }
  const prompts = lines.map((s) => `Instruct: ${task}\nQuery:${s}`);
  const out = await embed(prompts, { pooling: "mean", normalize: true });
  return typeof out.tolist === 'function' ? out.tolist() : out.data;
}
|
main.js
CHANGED
@@ -1,44 +1,27 @@
|
|
1 |
-
import {
|
2 |
-
import {
|
3 |
-
import {
|
4 |
-
|
5 |
-
|
6 |
-
"onnx-community/Qwen3-Embedding-0.6B-ONNX",
|
7 |
-
{ device: "webgpu", dtype: "q4f16" },
|
8 |
-
);
|
9 |
-
const tokenizer = await AutoTokenizer.from_pretrained("onnx-community/Qwen3-0.6B-ONNX");
|
10 |
-
const model = await AutoModelForCausalLM.from_pretrained("onnx-community/Qwen3-0.6B-ONNX", { device: "webgpu", dtype: "q4f16" });
|
11 |
|
12 |
const task = "Given a textual input sentence, retrieve relevant categories that best describe it.";
|
13 |
|
|
|
|
|
|
|
|
|
14 |
document.getElementById("run").onclick = async () => {
|
15 |
const text = document.getElementById("input").value;
|
16 |
-
|
17 |
const groups = text.split(/\n{3,}/);
|
18 |
-
|
19 |
// Extract cluster names from lines starting with ##
|
20 |
const clusterNames = text.split(/\n/)
|
21 |
.map(x => x.trim())
|
22 |
.filter(x => x && x.startsWith('##'))
|
23 |
.map(x => x.replace(/^##\s*/, ''));
|
24 |
-
|
25 |
-
|
26 |
-
const groupEmbeddings = [];
|
27 |
-
for (const g of groups) {
|
28 |
-
// Remove lines starting with ##
|
29 |
-
const lines = g.split(/\n/)
|
30 |
-
.map(x => x.trim())
|
31 |
-
.filter(x => x && !x.startsWith('##'));
|
32 |
-
const prompts = lines.map(s => `Instruct: ${task}\nQuery:${s}`);
|
33 |
-
const out = await embed(prompts, { pooling: "mean", normalize: true });
|
34 |
-
const embeddings = typeof out.tolist === 'function' ? out.tolist() : out.data;
|
35 |
-
const dim = embeddings[0].length;
|
36 |
-
const avg = new Float32Array(dim);
|
37 |
-
for (const e of embeddings) { for (let i = 0; i < dim; i++) avg[i] += e[i]; }
|
38 |
-
for (let i = 0; i < dim; i++) avg[i] /= embeddings.length;
|
39 |
-
groupEmbeddings.push(avg);
|
40 |
-
}
|
41 |
const n = groupEmbeddings.length;
|
|
|
42 |
const sim = [];
|
43 |
for (let i = 0; i < n; i++) {
|
44 |
const row = [];
|
@@ -53,104 +36,39 @@ document.getElementById("run").onclick = async () => {
|
|
53 |
}
|
54 |
sim.push(row);
|
55 |
}
|
56 |
-
//
|
57 |
let xLabels = clusterNames && clusterNames.length === n ? clusterNames : Array.from({ length: n }, (_, i) => `Group ${i + 1}`);
|
58 |
-
|
59 |
-
Plotly.newPlot("plot-heatmap", data, {
|
60 |
-
xaxis: { title: "Group", scaleanchor: "y", scaleratio: 1 },
|
61 |
-
yaxis: { title: "Group", scaleanchor: "x", scaleratio: 1 },
|
62 |
-
width: 500,
|
63 |
-
height: 500,
|
64 |
-
margin: { t: 40, l: 200, r: 10, b: 200 },
|
65 |
-
title: "Group Similarity Heatmap"
|
66 |
-
});
|
67 |
};
|
68 |
|
|
|
|
|
69 |
document.getElementById("kmeans-btn").onclick = async () => {
|
70 |
const progressBar = document.getElementById("progress-bar");
|
71 |
const progressBarInner = document.getElementById("progress-bar-inner");
|
72 |
progressBar.style.display = "block";
|
73 |
-
progressBarInner.style.width = "0%";
|
74 |
|
75 |
const text = document.getElementById("input").value;
|
|
|
76 |
const lines = text.split(/\n/).map(x => x.trim()).filter(x => x && !x.startsWith("##"));
|
77 |
-
const
|
78 |
-
const out = await embed(prompts, { pooling: "mean", normalize: true });
|
79 |
-
const embeddings = typeof out.tolist === "function" ? out.tolist() : out.data;
|
80 |
-
|
81 |
const n = embeddings.length;
|
82 |
if (n < 2) return;
|
83 |
-
|
84 |
const requestedK = parseInt(document.getElementById("kmeans-k").value) || 3;
|
85 |
const k = Math.max(2, Math.min(requestedK, n));
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
const reseed = () => {
|
92 |
-
let bestIdx = 0, bestDist = -1;
|
93 |
-
for (let i = 0; i < n; ++i) {
|
94 |
-
let minDist = Infinity;
|
95 |
-
for (let c = 0; c < k; ++c) {
|
96 |
-
let dist = 0;
|
97 |
-
for (let d = 0; d < dim; ++d)
|
98 |
-
dist += (embeddings[i][d] - centroids[c][d]) ** 2;
|
99 |
-
if (dist < minDist) minDist = dist;
|
100 |
-
}
|
101 |
-
if (minDist > bestDist) {
|
102 |
-
bestDist = minDist;
|
103 |
-
bestIdx = i;
|
104 |
-
}
|
105 |
-
}
|
106 |
-
return embeddings[bestIdx].slice();
|
107 |
-
};
|
108 |
-
|
109 |
-
for (let iter = 0; iter < 100; ++iter) {
|
110 |
-
let changed = false;
|
111 |
-
for (let i = 0; i < n; ++i) {
|
112 |
-
let best = 0, bestDist = Infinity;
|
113 |
-
for (let c = 0; c < k; ++c) {
|
114 |
-
let dist = 0;
|
115 |
-
for (let d = 0; d < dim; ++d)
|
116 |
-
dist += (embeddings[i][d] - centroids[c][d]) ** 2;
|
117 |
-
if (dist < bestDist) {
|
118 |
-
bestDist = dist;
|
119 |
-
best = c;
|
120 |
-
}
|
121 |
-
}
|
122 |
-
if (labels[i] !== best) {
|
123 |
-
labels[i] = best;
|
124 |
-
changed = true;
|
125 |
-
}
|
126 |
-
}
|
127 |
-
|
128 |
-
centroids = Array.from({ length: k }, () => new Array(dim).fill(0));
|
129 |
-
const counts = new Array(k).fill(0);
|
130 |
-
for (let i = 0; i < n; ++i) {
|
131 |
-
counts[labels[i]]++;
|
132 |
-
for (let d = 0; d < dim; ++d)
|
133 |
-
centroids[labels[i]][d] += embeddings[i][d];
|
134 |
-
}
|
135 |
-
for (let c = 0; c < k; ++c) {
|
136 |
-
if (counts[c] === 0) {
|
137 |
-
centroids[c] = reseed();
|
138 |
-
} else {
|
139 |
-
for (let d = 0; d < dim; ++d)
|
140 |
-
centroids[c][d] /= counts[c];
|
141 |
-
}
|
142 |
-
}
|
143 |
-
if (!changed) break;
|
144 |
-
}
|
145 |
-
|
146 |
const nNeighbors = Math.max(1, Math.min(lines.length - 1, 15));
|
147 |
const umap = new UMAP({ nComponents: 2, nNeighbors, minDist: 0.1 });
|
148 |
const proj = umap.fit(embeddings);
|
149 |
-
|
150 |
const clustered = Array.from({ length: k }, () => []);
|
151 |
for (let i = 0; i < n; ++i)
|
152 |
clustered[labels[i]].push(lines[i]);
|
153 |
-
|
154 |
const colors = ["red", "blue", "green", "orange", "purple", "cyan", "magenta", "yellow", "brown", "black", "lime", "navy", "teal", "olive", "maroon", "pink", "gray", "gold", "aqua", "indigo"];
|
155 |
const placeholderNames = Array.from({ length: k }, (_, c) => `Cluster ${c + 1}`);
|
156 |
const traces = Array.from({ length: k }, (_, c) => ({
|
@@ -164,70 +82,21 @@ document.getElementById("kmeans-btn").onclick = async () => {
|
|
164 |
traces[labels[i]].y.push(proj[i][1]);
|
165 |
traces[labels[i]].text.push(lines[i]);
|
166 |
}
|
167 |
-
|
168 |
-
|
169 |
-
yaxis: { title: "UMAP-2", scaleanchor: "x", scaleratio: 1 },
|
170 |
-
width: 1000,
|
171 |
-
height: 500,
|
172 |
-
margin: { t: 40, l: 40, r: 10, b: 40 },
|
173 |
-
title: `K-Means Clustering (k=${k})`,
|
174 |
-
legend: { x: 1.05, y: 0.5, orientation: "v", xanchor: "left", yanchor: "middle" }
|
175 |
-
});
|
176 |
-
|
177 |
const clusterNames = [];
|
178 |
for (let c = 0; c < k; ++c) {
|
179 |
progressBarInner.style.width = `${Math.round(((c + 1) / k) * 100)}%`;
|
180 |
-
|
181 |
-
|
182 |
-
const messages = [
|
183 |
-
{ role: "system", content: prompt_cluster },
|
184 |
-
{ role: "user", content: `Input:\n${joined}\nOutput:` }
|
185 |
-
];
|
186 |
-
|
187 |
-
const inputs = tokenizer.apply_chat_template(messages, {
|
188 |
-
add_generation_prompt: true,
|
189 |
-
return_dict: true,
|
190 |
-
enable_thinking: false,
|
191 |
-
});
|
192 |
-
|
193 |
-
const outputTokens = await model.generate({
|
194 |
-
...inputs,
|
195 |
-
max_new_tokens: 1024,
|
196 |
-
do_sample: true,
|
197 |
-
temperature: 0.6
|
198 |
-
});
|
199 |
-
|
200 |
-
let rawName = tokenizer.decode(outputTokens[0], { skip_special_tokens: false }).trim();
|
201 |
-
|
202 |
-
const THINK_TAG = "</think>";
|
203 |
-
const END_TAG = "<|im_end|>";
|
204 |
-
|
205 |
-
if (rawName.includes(THINK_TAG)) {
|
206 |
-
rawName = rawName.substring(rawName.lastIndexOf(THINK_TAG) + THINK_TAG.length).trim();
|
207 |
-
}
|
208 |
-
if (rawName.includes(END_TAG)) {
|
209 |
-
rawName = rawName.substring(0, rawName.indexOf(END_TAG)).trim();
|
210 |
-
}
|
211 |
-
|
212 |
-
clusterNames.push(rawName || `Cluster ${c + 1}`);
|
213 |
traces[c].name = clusterNames[c];
|
214 |
-
|
215 |
-
|
216 |
-
xaxis: { title: "UMAP-1", scaleanchor: "y", scaleratio: 1 },
|
217 |
-
yaxis: { title: "UMAP-2", scaleanchor: "x", scaleratio: 1 },
|
218 |
-
width: 1000,
|
219 |
-
height: 500,
|
220 |
-
margin: { t: 40, l: 40, r: 10, b: 40 },
|
221 |
-
title: `K-Means Clustering (k=${k})`,
|
222 |
-
legend: { x: 1.05, y: 0.5, orientation: "v", xanchor: "left", yanchor: "middle" }
|
223 |
-
});
|
224 |
-
|
225 |
document.getElementById("input").value = clustered.map((g, i) =>
|
226 |
`## ${clusterNames[i]}\n${g.join("\n")}`
|
227 |
).join("\n\n\n");
|
228 |
-
|
229 |
document.getElementById("run").onclick();
|
230 |
}
|
231 |
-
|
232 |
-
progressBarInner.style.width = "100%"; // Set to 100% after all clusters are named
|
233 |
};
|
|
|
1 |
+
import { getGroupEmbeddings, getLineEmbeddings } from './embedding.js';
|
2 |
+
import { kmeans } from './clustering.js';
|
3 |
+
import { plotHeatmap, plotScatter, updateScatter } from './plotting.js';
|
4 |
+
import { nameCluster } from './cluster_naming.js';
|
5 |
+
import { prompt_cluster } from './prompt_cluster.js';
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
const task = "Given a textual input sentence, retrieve relevant categories that best describe it.";
|
8 |
|
9 |
+
// Heatmap event
|
10 |
+
// Handles group similarity heatmap
|
11 |
+
// Uses group-level embeddings
|
12 |
+
|
13 |
document.getElementById("run").onclick = async () => {
|
14 |
const text = document.getElementById("input").value;
|
|
|
15 |
const groups = text.split(/\n{3,}/);
|
|
|
16 |
// Extract cluster names from lines starting with ##
|
17 |
const clusterNames = text.split(/\n/)
|
18 |
.map(x => x.trim())
|
19 |
.filter(x => x && x.startsWith('##'))
|
20 |
.map(x => x.replace(/^##\s*/, ''));
|
21 |
+
// Get group embeddings (removes ## lines internally)
|
22 |
+
const groupEmbeddings = await getGroupEmbeddings(groups, task);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
const n = groupEmbeddings.length;
|
24 |
+
// Cosine similarity matrix
|
25 |
const sim = [];
|
26 |
for (let i = 0; i < n; i++) {
|
27 |
const row = [];
|
|
|
36 |
}
|
37 |
sim.push(row);
|
38 |
}
|
39 |
+
// Use cluster names as axis labels if available
|
40 |
let xLabels = clusterNames && clusterNames.length === n ? clusterNames : Array.from({ length: n }, (_, i) => `Group ${i + 1}`);
|
41 |
+
plotHeatmap(sim, xLabels, xLabels);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
};
|
43 |
|
44 |
+
// K-Means + UMAP + Cluster Naming event
|
45 |
+
|
46 |
document.getElementById("kmeans-btn").onclick = async () => {
|
47 |
const progressBar = document.getElementById("progress-bar");
|
48 |
const progressBarInner = document.getElementById("progress-bar-inner");
|
49 |
progressBar.style.display = "block";
|
50 |
+
progressBarInner.style.width = "0%";
|
51 |
|
52 |
const text = document.getElementById("input").value;
|
53 |
+
// Remove ## lines for embedding
|
54 |
const lines = text.split(/\n/).map(x => x.trim()).filter(x => x && !x.startsWith("##"));
|
55 |
+
const embeddings = await getLineEmbeddings(lines, task);
|
|
|
|
|
|
|
56 |
const n = embeddings.length;
|
57 |
if (n < 2) return;
|
|
|
58 |
const requestedK = parseInt(document.getElementById("kmeans-k").value) || 3;
|
59 |
const k = Math.max(2, Math.min(requestedK, n));
|
60 |
+
// K-Means clustering
|
61 |
+
const { labels } = kmeans(embeddings, k);
|
62 |
+
// UMAP projection
|
63 |
+
const { UMAP } = await import('https://cdn.jsdelivr.net/npm/umap-js@1.4.0/+esm');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
const nNeighbors = Math.max(1, Math.min(lines.length - 1, 15));
|
65 |
const umap = new UMAP({ nComponents: 2, nNeighbors, minDist: 0.1 });
|
66 |
const proj = umap.fit(embeddings);
|
67 |
+
// Group lines by cluster
|
68 |
const clustered = Array.from({ length: k }, () => []);
|
69 |
for (let i = 0; i < n; ++i)
|
70 |
clustered[labels[i]].push(lines[i]);
|
71 |
+
// Prepare scatter plot traces
|
72 |
const colors = ["red", "blue", "green", "orange", "purple", "cyan", "magenta", "yellow", "brown", "black", "lime", "navy", "teal", "olive", "maroon", "pink", "gray", "gold", "aqua", "indigo"];
|
73 |
const placeholderNames = Array.from({ length: k }, (_, c) => `Cluster ${c + 1}`);
|
74 |
const traces = Array.from({ length: k }, (_, c) => ({
|
|
|
82 |
traces[labels[i]].y.push(proj[i][1]);
|
83 |
traces[labels[i]].text.push(lines[i]);
|
84 |
}
|
85 |
+
plotScatter(traces, k);
|
86 |
+
// Cluster naming
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
const clusterNames = [];
|
88 |
for (let c = 0; c < k; ++c) {
|
89 |
progressBarInner.style.width = `${Math.round(((c + 1) / k) * 100)}%`;
|
90 |
+
const name = await nameCluster(clustered[c]);
|
91 |
+
clusterNames.push(name || `Cluster ${c + 1}`);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
traces[c].name = clusterNames[c];
|
93 |
+
updateScatter(traces, k);
|
94 |
+
// Update textarea with cluster names as markdown headers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
document.getElementById("input").value = clustered.map((g, i) =>
|
96 |
`## ${clusterNames[i]}\n${g.join("\n")}`
|
97 |
).join("\n\n\n");
|
98 |
+
// Update heatmap with new cluster names
|
99 |
document.getElementById("run").onclick();
|
100 |
}
|
101 |
+
progressBarInner.style.width = "100%";
|
|
|
102 |
};
|
plotting.js
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/**
 * Draw the group-similarity heatmap into the element with id `plot-heatmap`.
 *
 * @param {number[][]} sim - Similarity matrix used as the heatmap's `z` values.
 * @param {string[]} xLabels - Labels along the x axis.
 * @param {string[]} yLabels - Labels along the y axis.
 * @param {{ zmin?: number, zmax?: number }} [options] - Colour-scale range;
 *   defaults to 0.7 .. 1.
 * @returns {void}
 */
export function plotHeatmap(sim, xLabels, yLabels, { zmin = 0.7, zmax = 1 } = {}) {
  const data = [
    {
      z: sim,
      type: "heatmap",
      colorscale: "Viridis",
      zmin,
      zmax,
      x: xLabels,
      y: yLabels,
    },
  ];
  const layout = {
    // Anchor the axes to each other so the cells are drawn with a 1:1 ratio.
    xaxis: { title: "Group", scaleanchor: "y", scaleratio: 1 },
    yaxis: { title: "Group", scaleanchor: "x", scaleratio: 1 },
    width: 500,
    height: 500,
    margin: { t: 40, l: 200, r: 10, b: 200 },
    title: "Group Similarity Heatmap",
  };
  Plotly.newPlot("plot-heatmap", data, layout);
}
|
12 |
+
|
13 |
+
/**
 * Build the layout shared by the initial scatter plot and its later updates.
 * A fresh object is returned on every call so the two Plotly calls never
 * share a layout instance.
 *
 * @param {number} k - Number of clusters, shown in the plot title.
 * @returns {object} Plotly layout object.
 */
function scatterLayout(k) {
  return {
    xaxis: { title: "UMAP-1", scaleanchor: "y", scaleratio: 1 },
    yaxis: { title: "UMAP-2", scaleanchor: "x", scaleratio: 1 },
    width: 1000,
    height: 500,
    margin: { t: 40, l: 40, r: 10, b: 40 },
    title: `K-Means Clustering (k=${k})`,
    legend: { x: 1.05, y: 0.5, orientation: "v", xanchor: "left", yanchor: "middle" },
  };
}

/**
 * Create the cluster scatter plot in the element with id `plot-scatter`.
 *
 * @param {object[]} traces - Plotly traces, passed through unchanged.
 * @param {number} k - Number of clusters, shown in the plot title.
 * @returns {void}
 */
export function plotScatter(traces, k) {
  Plotly.newPlot("plot-scatter", traces, scatterLayout(k));
}

/**
 * Redraw the existing `plot-scatter` plot with updated traces via
 * `Plotly.react`, using the same layout as `plotScatter`.
 *
 * @param {object[]} traces - Plotly traces, passed through unchanged.
 * @param {number} k - Number of clusters, shown in the plot title.
 * @returns {void}
 */
export function updateScatter(traces, k) {
  Plotly.react("plot-scatter", traces, scatterLayout(k));
}
|