hunterhector committed
Commit ddc7526 · Parent(s): a1ddc25

fix sankey
main.py
CHANGED
@@ -52,7 +52,7 @@ front_matter = {
         },
         {
             "author": "Nikhil Ranjan",
-            "authorURL": "https://huggingface.co/
+            "authorURL": "https://huggingface.co/nikhilranjan",
             "affiliation": "MBZUAI",
             "affiliationURL": "",
         },
@@ -64,56 +64,56 @@ front_matter = {
         },
         {
             "author": "Zhen Wang",
-            "authorURL": "
+            "authorURL": "",
             "affiliation": "MBZUAI",
             "affiliationURL": "",
         },
         {
             "author": "An Li",
-            "authorURL": "https://huggingface.co/
-            "affiliation": "",
+            "authorURL": "https://huggingface.co/an1118",
+            "affiliation": "UCSD",
             "affiliationURL": "",
         },
         {
             "author": "Zhoujun Cheng",
-            "authorURL": "https://huggingface.co/
-            "affiliation": "",
+            "authorURL": "https://huggingface.co/zhoujun",
+            "affiliation": "UCSD",
             "affiliationURL": "",
         },
         {
             "author": "Suqi Sun",
-            "authorURL": "https://huggingface.co/
+            "authorURL": "https://huggingface.co/mylibrar",
             "affiliation": "Petuum, Inc.",
             "affiliationURL": "",
         },
         {
             "author": "Cun Mu",
-            "authorURL": "https://huggingface.co/
-            "affiliation": "",
+            "authorURL": "https://huggingface.co/CarisMu",
+            "affiliation": "MBZUAI",
             "affiliationURL": "",
         },
         {
             "author": "Victor Miller",
-            "authorURL": "https://huggingface.co/
-            "affiliation": "",
+            "authorURL": "https://huggingface.co/vamiller12",
+            "affiliation": "Petuum, Inc.",
             "affiliationURL": "",
         },
         {
             "author": "Yue Peng",
-            "authorURL": "https://huggingface.co/
-            "affiliation": "",
+            "authorURL": "https://huggingface.co/Dreamever",
+            "affiliation": "MBZUAI",
             "affiliationURL": "",
         },
         {
             "author": "Eric P. Xing",
-            "authorURL": "
-            "affiliation": "MBZUAI
+            "authorURL": "",
+            "affiliation": "MBZUAI",
             "affiliationURL": "https://www.mbzuai.ac.ae/ & https://www.cs.cmu.edu/",
         },
         {
             "author": "Zhengzhong Liu",
-            "authorURL": "https://huggingface.co/
-            "affiliation": "",
+            "authorURL": "https://huggingface.co/hunterhector",
+            "affiliation": "Petuum, Inc. / MBZUAI ",
             "affiliationURL": "",
         },
     ],
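Several of the old entries carried the bare `https://huggingface.co/` stub that this commit replaces with real usernames. A small check over the author list can catch leftover stubs before publishing; the snippet below is a hypothetical helper sketched for illustration, not part of the repo, and the shape of the list is assumed from the field layout visible in the diff.

```python
# Hypothetical sanity check for author entries like those in main.py.
# Assumption: entries follow the field layout shown in the diff above.
PROFILE_STUB = "https://huggingface.co/"

def find_placeholder_urls(authors):
    """Return the names whose authorURL is still the bare profile stub."""
    return [
        entry["author"]
        for entry in authors
        if entry.get("authorURL", "") == PROFILE_STUB
    ]

authors = [
    {"author": "Nikhil Ranjan", "authorURL": "https://huggingface.co/nikhilranjan"},
    {"author": "Zhen Wang", "authorURL": ""},        # intentionally empty: fine
    {"author": "An Li", "authorURL": PROFILE_STUB},  # leftover stub: flagged
]
print(find_placeholder_urls(authors))  # ['An Li']
```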
web.py
CHANGED
@@ -248,7 +248,7 @@ attrs.fraction_of_characters_in_duplicate_lines = sum(
 # Plot the distribution sankey.
 
 # The filtering percentages
-web_filtering_percentages = [
+web_remaining_percent = [
     100,
     96.98,
     43.84,
@@ -264,13 +264,13 @@ web_filtering_percentages = [
 web_filtering_steps = [
     "Common Crawl",
     "Text Extraction",
-    "Language
+    "Language ID",
     "URL Filtering",
     "Repetition Removal",
-    "Document
-    "Line
-    "Local Exact
-    "Global Fuzzy
+    "Document Filtering",
+    "Line Corrections",
+    "Local Exact Dedup",
+    "Global Fuzzy Dedup",
 ]
 
 step_colors = [
@@ -285,6 +285,8 @@ step_colors = [
     '#1f773c', # Lightest green added at the end
 ]
 
+grey_color = "#d3d3d3"
+
 def add_opacity(hex_color, opacity):
     # Remove '#' if present
     hex_color = hex_color.lstrip('#')
@@ -293,29 +295,80 @@ def add_opacity(hex_color, opacity):
     # Add the opacity value
     return f"rgba({rgb[0]}, {rgb[1]}, {rgb[2]}, {opacity})"
 
-#
-
+# Create a list for all the node labels, colors, and values
+node_labels = []
+node_colors = []
+
+# Create source and target for links
+source = []
+target = []
+link_colors = []
+link_values = []
+
+# For each step, we have two nodes: remaining and filtered
+for i, label in enumerate(web_filtering_steps):
+    node_labels.append(f"{label} ({web_remaining_percent[i]}%)")
+    node_colors.append(add_opacity(step_colors[i], 0.85))
+
+    if i > 0:
+        # Nothing filtered at step 0, set the nodes of the remaining percentages.
+        node_labels.append(f"{100 - web_remaining_percent[i]:.2f}%")
+        node_colors.append(grey_color)
+
+        # From the previous remaining part to the current remaining part.
+        if i == 1:
+            # Nothing got filtered before step 1.
+            prev_remain_idx = 0
+            curr_remain_idx = 1
+            curr_filtered_idx = 2
+        else:
+            prev_remain_idx = 2 * i - 3
+            prev_filtered_idx = 2 * i - 2
+            curr_remain_idx = 2 * i - 1
+            curr_filtered_idx = 2 * i
+
+        # Previous remaining -> current remaining
+        source.append(prev_remain_idx)
+        target.append(curr_remain_idx)
+        link_colors.append(add_opacity(step_colors[i-1], 0.7))
+        link_values.append(web_remaining_percent[i])
+
+        # Previous remaining -> current filtered
+        source.append(prev_remain_idx)
+        target.append(curr_filtered_idx)
+        link_colors.append(add_opacity(step_colors[i-1], 0.5))
+        link_values.append(web_remaining_percent[i-1] - web_remaining_percent[i])
+
+        if i > 1:
+            # We have data filtered out at step 1, previous filtered -> current filtered
+            source.append(prev_filtered_idx)
+            target.append(curr_filtered_idx)
+            link_colors.append(grey_color)
+            link_values.append(100 - web_remaining_percent[i - 1])
 
 filtering_sankey_fig = go.Figure(go.Sankey(
     node=dict(
-        label=
-        color=
+        label=node_labels,
+        color=node_colors,
         pad=15, # Adjust padding between nodes
         thickness=30,
     ),
     link=dict(
-        source=
-        target=
-        value=
-        color=
-
+        source=source, # Source from remaining
+        target=target, # Target to filtered
+        value=link_values, # Interleaved remaining and filtered values
+        color=link_colors
     )
 ))
 
 filtering_sankey_fig.update_layout(
-    title_text="Web Data Filtering
-
+    title_text="Web Data Filtering Percentage",
+    title_x=0.5, # Centers the title
+    title_font=dict(
+        family="Arial, sans-serif", # Font family
+        size=18, # Font size
+    ),
+    font_size=8,
     margin=dict(l=0, r=0, t=40, b=0)
 )
 
@@ -345,10 +398,10 @@ def web_data():
     P("The table below provides a comparison of the quality filters that have been applied to each dataset. Of note, TxT360 does not use any machine learning (ML) based filters. ML filters are a useful and efficient filtering processing that should be consider for any filtering project. However, we are leaving this to future work."),
     table_div_qf_filter_data,
     P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across snapshots. "),
-    Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
+    # Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
     # The sankey diagram of the filtering percentage
     plotly2fasthtml(filtering_sankey_fig),
-    P("
+    P("A significant portion of the documents is filtered after the whole process. This figure illustrates the percentage of documents filtered at each step. The grey bars represent the filtered documents. The statistics are largely consistent with prior work (e.g., RefinedWeb) across most steps, though we have incorporated some custom filtering steps."),
     id="section2",),
     Section(
         H2("Document Preparation"),
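The new loop interleaves two kinds of nodes: after the first step, the "remaining" node for step i sits at index 2*i - 1 and the grey "filtered" node at 2*i, and each filtered node absorbs both the newly dropped fraction and the mass already filtered at earlier steps. Below is a minimal standalone sketch of that bookkeeping, using the first three steps and percentages from the diff; it needs no Plotly and is only meant to show that the index arithmetic and the flow values balance.

```python
# Minimal sketch of the interleaved index scheme used by the new sankey code.
# Uses the first three steps/percentages from the diff; everything else is local.
steps = ["Common Crawl", "Text Extraction", "Language ID"]
remaining = [100, 96.98, 43.84]  # percent of documents kept after each step

node_labels, source, target, values = [], [], [], []

for i, label in enumerate(steps):
    node_labels.append(f"{label} ({remaining[i]}%)")
    if i > 0:
        # Grey node holding everything filtered out up to and including step i.
        node_labels.append(f"{100 - remaining[i]:.2f}%")
        prev_remain = 0 if i == 1 else 2 * i - 3
        curr_remain, curr_filtered = 2 * i - 1, 2 * i

        # Kept fraction flows from the previous remaining node.
        source.append(prev_remain); target.append(curr_remain)
        values.append(remaining[i])

        # Newly dropped fraction flows into the filtered node.
        source.append(prev_remain); target.append(curr_filtered)
        values.append(remaining[i - 1] - remaining[i])

        if i > 1:
            # Mass filtered at earlier steps carries forward.
            source.append(2 * i - 2); target.append(curr_filtered)
            values.append(100 - remaining[i - 1])

print(node_labels)
# ['Common Crawl (100%)', 'Text Extraction (96.98%)', '3.02%',
#  'Language ID (43.84%)', '56.16%']
print(list(zip(source, target, values)))
# [(0, 1, 96.98), (0, 2, 3.02), (1, 3, 43.84), (1, 4, 53.14), (2, 4, 3.02)]
# (float values shown rounded)
```

Note how the flows into each grey node sum to its label (for "56.16%": 53.14 + 3.02), which is what keeps the diagram's filtered branch consistent from step to step.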