// Source: blogpost-fineweb-v1 / src / clusters.js
// Last commit: "add missing agg" (bb5da6f) by hynky (HF staff); file size 7.93 kB.
import { getColor } from "./colors.mjs"
import { parse } from "papaparse"
import _ from "lodash"
import Plotly from "plotly.js-basic-dist-min"
// Folder that holds the clustering CSV files (`data.csv`, `info.csv`) fetched at runtime.
const DATA_FOLDER = "assets/data/clustering";
// Marker size of the scatter points in the initial view; it is multiplied by
// the zoom level when the user zooms and restored on zoom reset.
const BASE_SIZE = 5.5;
// Default x-axis layout for the plot. Tick labels, grid and zero line are
// hidden, and the axis title holds a linked caption for the figure.
// `range` is the initially visible window as [x0, x1]; DEFAULT_YAXIS.range is
// the matching [y0, y1].
const DEFAULT_XAXIS = {
showticklabels: false,
showgrid: false,
zeroline: false,
title: {
// HTML anchor rendered as the axis title; links to the text-clustering repository.
text: "<a href='https://github.com/huggingface/text-clustering' target='_blank' style='color: inherit;'>The 🍷 FineWeb dataset, clustered and annotated with educational score labels</a>",
font: {
size: 16,
style: "italic",
},
},
range: [5, 15.6461]
}
// Default y-axis layout for the plot: tick labels, grid and zero line are
// hidden. `range` is the initially visible window as [y0, y1].
const DEFAULT_YAXIS = {
showticklabels: false,
showgrid: false,
zeroline: false,
range: [0, 8.5],
}
/**
 * Builds the hover markup shown for a single data point.
 *
 * @param {{text: string, label: number, eduScore: number}} row - Point being hovered.
 * @param {Object<string, string>} labelIDToName - Lookup from cluster label to cluster name.
 * @returns {string} Three `<br>`-separated fields (text, label name, edu score);
 *   the label name is "Unknown" when the lookup has no entry for `row.label`.
 */
const getLabelHoverFormat = (row, labelIDToName) => {
  const { text, label, eduScore } = row;
  const labelName = labelIDToName[label] ?? "Unknown";
  const fields = [
    `<b>Text</b>: ${text}`,
    `<b>Label</b>: ${labelName}`,
    `<b>Edu label</b>: ${eduScore}`,
  ];
  return fields.join("<br>");
};
// Maximum number of annotations to display at once; used as the default `k`
// of getRelevantAnnotations.
const K = 15;
/**
 * Ranks labels by how often they occur in `labels`.
 *
 * @param {Array<number|string>} labels - One label per data point (repeats expected).
 * @returns {Object<string, number>} Map from label to its rank, where rank 0 is
 *   the most frequent label. Keys are the labels converted to strings.
 */
function createLabelOrderMapping(labels) {
  // Tally the occurrences of every label.
  const counts = {};
  labels.forEach((label) => {
    counts[label] = (counts[label] || 0) + 1;
  });

  // Most frequent first.
  const byFrequencyDesc = Object.entries(counts).sort(
    ([, countA], [, countB]) => countB - countA
  );

  // The position in the sorted list becomes the label's rank.
  return Object.fromEntries(
    byFrequencyDesc.map(([label], rank) => [label, rank])
  );
}
/**
 * Loads the cluster summary CSV and converts each row into an annotation record.
 *
 * Rows whose `cluster_id` is -1 are skipped (presumably the "no cluster"
 * bucket; confirm against whatever produces the CSV).
 *
 * @param {string} file - URL of the CSV, passed to `readCSV`.
 * @returns {Promise<Array<{x: number, y: number, label: number, text: string}>>}
 *   One record per kept row: position from `cluster_position_x/y`, `label` from
 *   `cluster_id`, `text` from `cluster_summaries`.
 */
const parseAnnotations = async (file) => {
  const rows = await readCSV(file);
  const annotations = [];
  for (const row of rows) {
    // Parse the id once, always as decimal, and compare strictly.
    const clusterId = parseInt(row.cluster_id, 10);
    if (clusterId === -1) {
      continue;
    }
    annotations.push({
      x: parseFloat(row.cluster_position_x),
      y: parseFloat(row.cluster_position_y),
      label: clusterId,
      text: row.cluster_summaries,
    });
  }
  return annotations;
};
/**
 * Adds display styling to every annotation.
 *
 * @param {Array<Object>} annotations - Annotation records; `label` picks the background color.
 * @returns {Array<Object>} New objects holding the styling defaults followed by
 *   the annotation's own properties, so an annotation property with the same
 *   name overrides the default.
 */
const addStylingToAnnotations = (annotations) =>
  annotations.map((annotation) => {
    const baseStyle = {
      showarrow: false,
      font: { size: 14, color: "black", weight: "bold" },
      bgcolor: getColor(annotation.label, 0.6),
      // Padding around the text.
      borderpad: 2,
    };
    return { ...baseStyle, ...annotation };
  });
/**
 * Picks the annotations to show for a viewport.
 *
 * Keeps the annotations whose position lies inside the inclusive rectangle
 * [x0, x1] x [y0, y1], orders them by ascending `ord`, and returns the first
 * `k`. Annotations without a numeric `ord` are placed after all ranked ones
 * instead of making the comparator return NaN. The input array is not mutated.
 *
 * @param {Array<{x: number, y: number, ord?: number}>} annotations - Candidates.
 * @param {number} x0 - Left bound of the viewport.
 * @param {number} x1 - Right bound of the viewport.
 * @param {number} y0 - Lower bound of the viewport.
 * @param {number} y1 - Upper bound of the viewport.
 * @param {number} [k=K] - Maximum number of annotations returned.
 * @returns {Array<Object>} At most `k` annotations, lowest `ord` first.
 */
const getRelevantAnnotations = (annotations, x0, x1, y0, y1, k = K) => {
  // Missing or non-numeric ranks sort after every ranked annotation.
  const rankOf = ({ ord }) => {
    const rank = Number(ord);
    return Number.isNaN(rank) ? Number.POSITIVE_INFINITY : rank;
  };
  const compareByRank = (a, b) => {
    const rankA = rankOf(a);
    const rankB = rankOf(b);
    // Explicit equality check: Infinity - Infinity would be NaN.
    return rankA === rankB ? 0 : rankA - rankB;
  };
  return annotations
    .filter(({ x, y }) => x >= x0 && x <= x1 && y >= y0 && y <= y1)
    .sort(compareByRank)
    .slice(0, k);
};
/**
 * Computes the bounding box of a list of points.
 *
 * Runs in a single pass without spreading the array into `Math.min` /
 * `Math.max`, so it cannot exceed the engine's argument limit on large inputs.
 * NaN coordinates are ignored, so one unparsable row cannot turn the whole
 * bounding box into NaN.
 *
 * @param {Array<{x: number, y: number}>} traces - Points to measure (despite
 *   the name, the caller passes data rows, not Plotly trace objects).
 * @returns {{x0: number, x1: number, y0: number, y1: number}} Minimum and
 *   maximum x and y. For an empty list the minima are `Infinity` and the
 *   maxima `-Infinity`.
 */
const getMinMaxTracesArea = (traces) => {
  let x0 = Infinity;
  let x1 = -Infinity;
  let y0 = Infinity;
  let y1 = -Infinity;
  for (const { x, y } of traces) {
    // Every comparison is false for NaN, which is how bad values get skipped.
    if (x < x0) x0 = x;
    if (x > x1) x1 = x;
    if (y < y0) y0 = y;
    if (y > y1) y1 = y;
  }
  return { x0, x1, y0, y1 };
};
/**
 * Loads the point data (`data.csv` in DATA_FOLDER) and converts each CSV row
 * into a typed record.
 *
 * @returns {Promise<Array<{x: number, y: number, eduScore: number, label: number, text: string}>>}
 *   One record per row: `X`/`Y` and `edu_labels` parsed as floats,
 *   `cluster_labels` parsed as a decimal integer, `content_display` kept as-is.
 */
const readData = async () => {
  const rows = await readCSV(`${DATA_FOLDER}/data.csv`);
  return rows.map((row) => ({
    x: parseFloat(row.X),
    y: parseFloat(row.Y),
    eduScore: parseFloat(row.edu_labels),
    // Explicit radix: cluster labels are always read as decimal.
    label: parseInt(row.cluster_labels, 10),
    text: row.content_display,
  }));
};
// The cluster data is pretty big, so it takes time to download.
// In the meantime a placeholder image is shown in the plot container.
/**
 * Removes the placeholder image from the plot container.
 *
 * Does nothing when the container has no `<img>` (for example when the
 * placeholder was already removed), instead of throwing.
 *
 * @param {Element} parent - Container that may hold the placeholder `<img>`.
 * @returns {void}
 */
const destroyPlaceholderImage = (parent) => {
  parent.querySelector("img")?.remove();
};
/**
 * Renders the interactive cluster scatter plot inside the `#clusters-plot`
 * element and keeps annotations and marker size in sync with zoom and pan.
 *
 * Steps: download the points and the cluster summaries, rank clusters by
 * number of points, draw the figure with Plotly in place of the placeholder
 * image, then register the `plotly_relayout` and window `resize` handlers.
 *
 * @returns {Promise<void>} Resolves once the plot is drawn and the handlers
 *   are registered; rejects if a download or the initial draw fails.
 */
export async function plotClusters() {
  const parent = document.getElementById("clusters-plot");
  // Marker growth on zoom is divided by this factor, otherwise the points get too big.
  const ZOOM_DAMPING = 1.2;

  // While the data downloads, the container keeps showing a non-interactive
  // placeholder image. The two CSV files are independent, so fetch them in parallel.
  const [data, rawAnnotations] = await Promise.all([
    readData(),
    parseAnnotations(`${DATA_FOLDER}/info.csv`),
  ]);

  // Rank 0 is the cluster with the most points; it decides which annotations are shown first.
  const labelOrder = createLabelOrderMapping(data.map((row) => row.label));
  const annotations = addStylingToAnnotations(rawAnnotations).map((annot) => ({
    ...annot,
    ord: labelOrder[annot.label],
  }));

  const labelIDToName = annotations.reduce((acc, annotation) => {
    acc[annotation.label] = annotation.text;
    return acc;
  }, {});

  const traces = [
    {
      type: "scatter",
      mode: "markers",
      x: data.map((row) => row.x),
      y: data.map((row) => row.y),
      marker: {
        color: data.map((row) => getColor(row.label, 0.4)),
        size: BASE_SIZE,
      },
      hoverinfo: "text",
      hovertext: data.map((row) => getLabelHoverFormat(row, labelIDToName)),
      hoverlabel: {
        bgcolor: "white",
      },
    },
  ];

  // Full extent of the data; used as the "fully zoomed out" view and as the
  // reference for the zoom level.
  const { x0, x1, y0, y1 } = getMinMaxTracesArea(data);

  const layout = {
    height: 550,
    width: parent.clientWidth,
    xaxis: DEFAULT_XAXIS,
    yaxis: DEFAULT_YAXIS,
    annotations: getRelevantAnnotations(
      annotations,
      DEFAULT_XAXIS.range[0],
      DEFAULT_XAXIS.range[1],
      DEFAULT_YAXIS.range[0],
      DEFAULT_YAXIS.range[1]
    ),
    font: {
      family: "apple-system, Arial, sans-serif",
    },
    margin: {
      t: 0,
      b: 50,
      l: 0,
      r: 0,
    },
  };

  destroyPlaceholderImage(parent);
  await Plotly.newPlot(parent, traces, layout);

  // A relayout event only carries the bounds that changed (zooming a single
  // axis omits the other one). Missing bounds are taken from the range the
  // graph currently holds, falling back to `fallback` if that is unavailable.
  const resolveAxisRange = (eventdata, axisName, fallback) => {
    const [currentLow, currentHigh] = parent.layout?.[axisName]?.range ?? fallback;
    return [
      eventdata[`${axisName}.range[0]`] ?? currentLow,
      eventdata[`${axisName}.range[1]`] ?? currentHigh,
    ];
  };

  parent.on("plotly_relayout", (eventdata) => {
    const rangeKeys = [
      "xaxis.range[0]",
      "xaxis.range[1]",
      "yaxis.range[0]",
      "yaxis.range[1]",
    ];
    // Compare with undefined rather than truthiness: a bound of 0 is valid.
    const isZoomOrPan = rangeKeys.some((key) => eventdata[key] !== undefined);

    // First option: zoomed or panned to an explicit range.
    if (isZoomOrPan) {
      const [newx0, newx1] = resolveAxisRange(eventdata, "xaxis", [x0, x1]);
      const [newy0, newy1] = resolveAxisRange(eventdata, "yaxis", [y0, y1]);

      // The global ordering is reused; it is not recomputed for the visible region.
      const relevantAnnotations = getRelevantAnnotations(
        annotations,
        newx0,
        newx1,
        newy0,
        newy1
      );

      const zoomLevel =
        Math.min(
          (x1 - x0) / (newx1 - newx0),
          (y1 - y0) / (newy1 - newy0)
        ) / ZOOM_DAMPING;

      Plotly.update(
        parent,
        { "marker.size": BASE_SIZE * zoomLevel },
        { annotations: relevantAnnotations }
      );
    }
    // Zoom reset, either to autorange or to the base range.
    else if (eventdata["xaxis.autorange"] || eventdata["xaxis.range"]) {
      const relevantAnnotations = getRelevantAnnotations(
        annotations,
        x0,
        x1,
        y0,
        y1
      );
      // A reset always goes to the fully zoomed-out view of the data.
      const xaxis = _.merge({}, DEFAULT_XAXIS, { range: [x0, x1] });
      const yaxis = _.merge({}, DEFAULT_YAXIS, { range: [y0, y1] });
      Plotly.update(
        parent,
        { "marker.size": BASE_SIZE },
        { annotations: relevantAnnotations, xaxis, yaxis }
      );
    }
  });

  window.addEventListener("resize", () => {
    // If the window is narrower than 768px the plot is not shown, so skip the relayout.
    if (window.innerWidth < 768) {
      return;
    }
    Plotly.relayout(parent, {
      width: parent.offsetWidth,
    });
  });
}
/**
 * Downloads a CSV file and parses it into row objects keyed by the header line.
 *
 * @param {string} file - URL of the CSV file.
 * @returns {Promise<Array<Object<string, string>>>} The parsed rows (`data` of
 *   the parser result); empty lines are skipped.
 * @throws {Error} When the server answers with a non-2xx status, so an error
 *   page is never parsed as if it were CSV.
 */
const readCSV = async (file) => {
  const response = await fetch(file);
  if (!response.ok) {
    throw new Error(
      `Failed to fetch ${file}: ${response.status} ${response.statusText}`
    );
  }
  const text = await response.text();
  const { data } = parse(text, { header: true, skipEmptyLines: true });
  return data;
};