Space: topic-analysis
Pull request #1 by mylibrar (opened)

Files changed:
- common.py +1 -0
- curated.py +112 -77
- data/topic_charts.json +0 -0
- main.py +10 -4
- results.py +105 -64
- web.py +2 -0
common.py
CHANGED

@@ -298,6 +298,7 @@ global_div = Div(
                 style="margin-bottom: 5px",
             ),
             Li("Normalization Form C Discussion", style="margin-bottom: 5px"),
+            Li(B("Estimated Reading Time: 10 minutes"), style="margin-bottom: 5px"),
         ),
         id="section41",
     ),
curated.py
CHANGED

@@ -24,6 +24,7 @@ overview = (
                 "Individual Filtering Discussion for Each Source",
                 style="margin-bottom: 5px",
             ),
+            Li(B("Estimated Reading Time: 25 minutes"),style="margin-bottom: 5px", ),
         ),
     ),
 )
@@ -33,7 +34,9 @@ curated_sources_intro = Div(
     P(
         "Curated sources comprise high-quality datasets that contain domain-specificity.",
         B(
-            " TxT360 was strongly influenced by The Pile",
+            " TxT360 was strongly influenced by The Pile",
+            D_cite(bibtex_key="thepile"),
+            " regarding both inclusion of the dataset and filtering techniques.",
         ),
         " These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ",
     ),
@@ -129,16 +132,16 @@ wikipedia_filter = pd.DataFrame(
             "0.00%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.31%",
         ],
         "Total Percentage Remaining": [
-            "",
+            "97.84%",
         ],
     }
 )
 
 table_html_wikipedia = wikipedia_filter.to_html(index=False, border=0)
-table_div_wikipedia = Div(NotStr(table_html_wikipedia)
+table_div_wikipedia = Div(NotStr(table_html_wikipedia))
 
 freelaw_filter = pd.DataFrame(
     {
@@ -167,7 +170,7 @@ freelaw_filter = pd.DataFrame(
 )
 
 table_html_freelaw = freelaw_filter.to_html(index=False, border=0)
-table_div_freelaw = Div(NotStr(table_html_freelaw)
+table_div_freelaw = Div(NotStr(table_html_freelaw))
 
 dmm_filter = pd.DataFrame(
     {
@@ -187,16 +190,16 @@ dmm_filter = pd.DataFrame(
             "0.00%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.00%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "100.00%",
         ],
     }
 )
 
 table_html_dmm = dmm_filter.to_html(index=False, border=0)
-table_div_dmm = Div(NotStr(table_html_dmm)
+table_div_dmm = Div(NotStr(table_html_dmm))
 
 
 uspto_filter = pd.DataFrame(
@@ -217,16 +220,16 @@ uspto_filter = pd.DataFrame(
             "0.01%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "22.94%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "75.15%",
         ],
     }
 )
 
 table_html_uspto = uspto_filter.to_html(index=False, border=0)
-table_div_uspto = Div(NotStr(table_html_uspto)
+table_div_uspto = Div(NotStr(table_html_uspto))
 
 pg19_filter = pd.DataFrame(
     {
@@ -246,16 +249,16 @@ pg19_filter = pd.DataFrame(
             "0.17%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.80%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "98.78%",
         ],
     }
 )
 
 table_html_pg19 = pg19_filter.to_html(index=False, border=0)
-table_div_pg19 = Div(NotStr(table_html_pg19)
+table_div_pg19 = Div(NotStr(table_html_pg19))
 
 
 hn_filter = pd.DataFrame(
@@ -267,7 +270,7 @@ hn_filter = pd.DataFrame(
             "2064931",
         ],
         "Percent Removed After Language Filter": [
-            "2.62
+            "2.62%",
         ],
         "Percent Removed After Min Word Count Filter": [
             "0.02%",
@@ -276,16 +279,16 @@ hn_filter = pd.DataFrame(
             "0.34%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "61.84%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "35.18%",
         ],
     }
 )
 
 table_html_hn = hn_filter.to_html(index=False, border=0)
-table_div_hn = Div(NotStr(table_html_hn)
+table_div_hn = Div(NotStr(table_html_hn))
 
 
 uirc_filter = pd.DataFrame(
@@ -306,16 +309,16 @@ uirc_filter = pd.DataFrame(
             "1.12%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.66%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "59.98%",
         ],
     }
 )
 
 table_html_uirc = uirc_filter.to_html(index=False, border=0)
-table_div_uirc = Div(NotStr(table_html_uirc)
+table_div_uirc = Div(NotStr(table_html_uirc))
 
 up_filter = pd.DataFrame(
     {
@@ -335,16 +338,16 @@ up_filter = pd.DataFrame(
             "0.00%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "1.00%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "99.00%",
         ],
     }
 )
 
 table_html_up = up_filter.to_html(index=False, border=0)
-table_div_up = Div(NotStr(table_html_up)
+table_div_up = Div(NotStr(table_html_up))
 
 se_filter = pd.DataFrame(
     {
@@ -364,16 +367,16 @@ se_filter = pd.DataFrame(
             "0.00%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.00%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "100.00%",
         ],
     }
 )
 
 table_html_se = se_filter.to_html(index=False, border=0)
-table_div_se = Div(NotStr(table_html_se)
+table_div_se = Div(NotStr(table_html_se))
 
 arx_filter = pd.DataFrame(
     {
@@ -393,16 +396,16 @@ arx_filter = pd.DataFrame(
             "0.07%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.00%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "92.06%",
         ],
     }
 )
 
 table_html_arx = arx_filter.to_html(index=False, border=0)
-table_div_arx = Div(NotStr(table_html_arx)
+table_div_arx = Div(NotStr(table_html_arx))
 
 s2o_filter = pd.DataFrame(
     {
@@ -422,16 +425,16 @@ s2o_filter = pd.DataFrame(
             "0.00%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.00%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "100.00%",
         ],
     }
 )
 
 table_html_s2o = s2o_filter.to_html(index=False, border=0)
-table_div_s2o = Div(NotStr(table_html_s2o)
+table_div_s2o = Div(NotStr(table_html_s2o))
 
 med_filter = pd.DataFrame(
     {
@@ -451,16 +454,16 @@ med_filter = pd.DataFrame(
             "0.02%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.00%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "91.03%",
         ],
     }
 )
 
 table_html_med = med_filter.to_html(index=False, border=0)
-table_div_med = Div(NotStr(table_html_med)
+table_div_med = Div(NotStr(table_html_med))
 
 phil_filter = pd.DataFrame(
     {
@@ -480,16 +483,16 @@ phil_filter = pd.DataFrame(
             "0.12%",
         ],
         "Percent Removed After Local Dedup": [
-            "",
+            "0.00%",
         ],
         "Total Percentage Remaining": [
-            "%",
+            "79.20%",
         ],
     }
 )
 
 table_html_phil = phil_filter.to_html(index=False, border=0)
-table_div_phil = Div(NotStr(table_html_phil)
+table_div_phil = Div(NotStr(table_html_phil))
 ## end individual tables showing filterin
 
 
@@ -681,24 +684,51 @@ filtering_process = Div(
             P(
                 B("Download and Extraction: "),
                 "All the data was downloaded in original latex format from ArXiv official S3 repo: ",
-                A("s3://
-                ". We
+                A("s3://arxiv/src", href="s3://arxiv/src"),
+                ". We aim to encode the downloaded data in UTF-8 format, and when necessary, utilize the chardet library to infer the appropriate encoding. After that, we use ",
+                A("Pandoc", href="https://pandoc.org/"),
+                " to extract information from the latex files into markdown format. The command we use is",
                 D_code(
-                    "pandoc -s
-                    language="
+                    "pandoc <raw_tex_path> -s -o <output_markdown_path> -f latex+raw_tex -t markdown_mmd [--lua-filter <lua_filter_path>]",
+                    language="bash",
                 ),
-                ".
+                ". Finally, all markdowns were combined to create jsonl files.",
             ),
             P(B("Unique Data Preparation Challenges: ")),
+            P(
+                "When converting LaTeX files into Markdown using Pandoc, it is crucial to account for different data formats to minimize information loss while also filtering out noisy content in LaTeX. Below, we outline our considerations and methods for handling various data types during this conversion process:"
+            ),
             Ul(
                 Li(
-                    "
+                    B("Tables: "),
+                    "The process for handling tables follows three main approaches. First, tables compatible with Pandoc’s built-in formats are directly converted into standard Markdown tables. Notably, LaTeX’s '\\multicolumn' and '\\multirow' commands can be successfully translated into valid Markdown tables. Second, tables unsupported by Pandoc’s native functionality, such as deluxetable or other complex LaTeX types, are preserved in their original LaTeX format to maintain the integrity of complex structures. Third, only a few remaining tables have been converted to HTML web tables.",
+                    style="margin-bottom: -3px",
+                ),
+                Li(
+                    B("Mathematical Expressions: "),
+                    "Inline mathematical expressions are rendered in Markdown. More complex equations remain unchanged, e.g., presented as '\\begin{aligned}' blocks, to ensure accuracy and readability.",
+                    style="margin-bottom: -3px",
+                ),
+                Li(
+                    B("Figures: "),
+                    "All figures are removed during the conversion process. Placeholder figures might not contribute to the paper’s data quality and, as such, have been omitted to streamline the output.",
+                    style="margin-bottom: -3px",
+                ),
+                Li(
+                    B("Section Headers: "),
+                    "Section headers are converted into markdown format, using leading '#' symbols to represent the heading levels.",
+                    style="margin-bottom: -3px",
+                ),
+                Li(
+                    B("References: "),
+                    "References are removed. Although they may be informative, references often introduce formatting inconsistencies or add little value compared to the core content of the paper.",
                     style="margin-bottom: -3px",
                 ),
             ),
             P(
                 B(" Filters Applied: "),
-                "multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset",
+                "multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset",
+                D_cite(bibtex_key="peS2o"),
             ),
             Ul(
                 Li(
@@ -773,19 +803,19 @@ filtering_process = Div(
             ),
         ),
         table_div_s2o,
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Details(
+        #     Summary("S2ORC Filtering Examples -- need to update"),
+        #     Div(
+        #         P("examples are missing"),
+        #         style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; ", # Styling for the DV2 part
+        #     ),
+        #     style="""
+        #         background-color: #FFFAEA; /* Light yellow background */
+        #         padding: 15px;
+        #         border-radius: 12px;
+        #         margin-bottom: 15px
+        #     """,
+        # ),
         ),
     ),
     Section(
@@ -825,19 +855,19 @@ filtering_process = Div(
                     style="margin-bottom: -3px",
                 ),
             ),
-            Details(
-
-
-
-
-
-
-
-
-
-
-
+            #Details(
+            #    Summary("S2ORC Abstract Filtering Examples "),
+            #    Div(
+            #        P("examples are missing"),
+            #        style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; ", # Styling for the DV2 part
+            #    ),
+            #    style="""
+            #        background-color: #FFFAEA; /* Light yellow background */
+            #        padding: 15px;
+            #        border-radius: 12px;
+            #        margin-bottom: 15px
+            #    """,
+            # ),
             )
         ),
         Section(
@@ -851,13 +881,16 @@ filtering_process = Div(
                     href="ttps://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/",
                 ),
                 ". PubMed Central (PMC) files are downloaded in an xml.tar format. The tar files are opened and converted to markdown format using pandoc",
-                D_code(
-
+                D_code(
+                    "pandoc <raw_xml_path> -s -o <output_markdown_path> -f jats -t markdown_mmd [--lua-filter <lua_filter_path>]",
+                    language="bash",
+                ),
+                ". The markdown files are combined to create jsonl files. PubMed Abstract (PMA) files were downloaded in xml. The BeautifulSoup library was used to extract the abstract, title, and PMID. All files were stored in jsonl format.",
             ),
             P(B("Unique Data Preparation Challenges: ")),
             Ul(
                 Li(
-                    "
+                    "We tried similar attempts on PMC as we did on ArXiv. The resulted markdown might have slight difference due to the different structure of the XML files.",
                     style="margin-bottom: -3px",
                 ),
             ),
@@ -1584,7 +1617,8 @@ def curated():
     table_html = data_preparation_steps.to_html(index=False, border=0)
     table_div = Div(NotStr(table_html), style="margin: 40px;")
 
-    text = P(
+    text = P(
+        """This initial stage serves as the foundation for the entire
         process. Here, we focus on acquiring and extracting the raw data, which can
         come from various sources such as crawling websites, using HTTP/FTP dumps,
         or working with archive dumps. For instance, to download and prepare a
@@ -1594,7 +1628,8 @@ def curated():
        preparation process: It is worth noting that some pipelines might require
        invoking additional functions or scripts to handle specific data sources or
        formats. These helper scripts can be located within specific directories
-       or modules dedicated to the dataset."""
+       or modules dedicated to the dataset."""
+    )
 
     return Div(
         Section(
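The ArXiv and PubMed Central hunks above describe the extraction recipe in prose: detect the file encoding (falling back to the chardet library), re-encode to UTF-8, then run Pandoc with the command shown in the D_code block to get MultiMarkdown. The pipeline script itself is not part of this pull request, so the following is only a minimal sketch of that step; the helper name and file paths are illustrative and not taken from the TxT360 codebase.

    # Minimal sketch of the "Download and Extraction" step described above.
    # Assumes pandoc is on PATH and the chardet package is installed.
    import subprocess
    import chardet

    def convert_tex_to_markdown(raw_tex_path, output_markdown_path, lua_filter_path=None):
        # Re-encode the source to UTF-8, inferring the encoding when needed.
        with open(raw_tex_path, "rb") as f:
            raw = f.read()
        encoding = chardet.detect(raw)["encoding"] or "utf-8"
        with open(raw_tex_path, "w", encoding="utf-8") as f:
            f.write(raw.decode(encoding, errors="replace"))

        # Same pandoc invocation as in the diff (LaTeX -> MultiMarkdown);
        # for PMC the source format would be jats instead of latex+raw_tex.
        cmd = ["pandoc", raw_tex_path, "-s", "-o", output_markdown_path,
               "-f", "latex+raw_tex", "-t", "markdown_mmd"]
        if lua_filter_path:
            cmd += ["--lua-filter", lua_filter_path]
        subprocess.run(cmd, check=True)

    convert_tex_to_markdown("paper.tex", "paper.md")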
data/topic_charts.json
ADDED

The diff for this file is too large to render; see the raw diff.
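The loader added in results.py (see the results.py diff below) reads data/topic_charts.json as a list of [title, spec] pairs, using spec["type"] ("barh" or "pie"), the matplotlib-style plotting arguments under spec["kwargs"], and an optional spec["comment"]. Since the file itself is too large to show, the entry below is a hypothetical reconstruction of that layout; every title, label, and number is invented for illustration.

    # Hypothetical example of the structure results.py expects from
    # data/topic_charts.json; the values are invented.
    import json

    topic_charts = [
        ["Average quality signal by topic", {
            "type": "barh",
            "kwargs": {
                "y": ["Arts", "Sports"],                      # bar labels
                "width": [0.42, 0.57],                        # bar lengths
                "color": [[0.1, 0.2, 0.8], [0.9, 0.3, 0.1]],  # RGB floats in [0, 1]
            },
            "comment": "Optional caption rendered under the chart.",
        }],
        ["Share of documents per topic", {
            "type": "pie",
            "kwargs": {
                "x": [60, 40],
                "labels": ["Arts", "Sports"],
                "colors": [[0.1, 0.2, 0.8], [0.9, 0.3, 0.1]],
            },
        }],
    ]

    with open("topic_charts.json", "w") as f:
        json.dump(topic_charts, f, indent=2)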
main.py
CHANGED

@@ -175,7 +175,7 @@ def main():
             Div(
                 A(
                     "TxT360",
-                    href="#
+                    href="#section11",
                 )
             ),
             Div(
@@ -352,6 +352,12 @@ def main():
                         href="#section53",
                     )
                 ),
+                Li(
+                    A(
+                        "Topic Analysis",
+                        href="#section55",
+                    )
+                )
             ),
         ),
         role="navigation",
@@ -359,8 +365,8 @@ def main():
         ),
     ),
     intro(),
-    curated.curated(),
     web.web_data(),
+    curated.curated(),
     common.common_steps(),
     results.results(),
 ),
@@ -757,7 +763,7 @@ dataset_sources = pd.DataFrame(
             "StackExchange",
         ],
         "Raw Data Size": [
-            "
+            "9.2 TB",
             "712 GB",
             "210 GB",
             "23 GB",
@@ -770,7 +776,7 @@
             "45 GB",
         ],
         "Token Count": [
-            "
+            "4.83T",
             "154.96B",
             "4.75B",
             "7.34B",
results.py
CHANGED

@@ -157,81 +157,69 @@ fig_loss.update_layout(
 lm_loss_graph = fig_loss
 
 
-
-
-
-
-
-
-
-
-    ["2019", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [16.818363305107052, 16.474269837858706, 14.944741674400241, 14.568394784374943, 14.690158822673334, 15.990949424635108]]],
-    ["2020", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [16.98821894111693, 15.936494557783181, 14.79960386342691, 14.435682562274105, 14.58651834886038, 15.869365567783806]]],
-    ["2021", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [17.125795647512877, 15.780419457145868, 14.631430892394002, 14.276477514399625, 14.337146941773641, 15.872474774329305]]],
-    ["2022", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [16.573462144306383, 15.283018703313582, 14.378277745163881, 14.0611924390084, 13.9886330091318, 15.769421394877273]]],
-    ["2023", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [15.4293630385597, 14.608379914730168, 14.118271697056592, 13.880215644749589, 13.767106666731275, 15.05749135510839]]]
-]
-
-# Extract ranges (buckets) and years
-ranges = DATA[0][1][0]
-years = [year_data[0] for year_data in DATA]
-all_values = [year_data[1][1] for year_data in DATA]
+data = {
+    "1-1": [17.410227605477868, 17.446573602753478,17.307221780905284,17.338525603992114,17.08551151136689,16.818363305107052,16.98821894111693, 17.125795647512877,16.573462144306383, 15.4293630385597],
+    "2-5": [16.11176217183986,16.14852530113782,16.297702171159543,15.960924352297502,16.187802102106698,16.474269837858706,15.936494557783181, 15.780419457145868,15.283018703313582, 14.608379914730168],
+    "6-10": [15.632757662414805,15.627408549576069,15.948641884223639,15.912187993988933,14.935072408852303,14.944741674400241,14.79960386342691,14.631430892394002, 14.378277745163881,14.118271697056592],
+    "11-100": [15.446116676532212,15.0055028132117,14.799690714225637,14.822102470001267,14.832038213200583,14.568394784374943,14.435682562274105,14.276477514399625, 14.0611924390084,13.880215644749589],
+    "101-1000": [16.716943171826703,15.565430373421485,14.935989931859659,14.778913482337416,14.508674264491997,14.690158822673334,14.58651834886038,14.337146941773641,13.9886330091318,13.767106666731275],
+    "1001-30000000": [18.156821563322765,17.314701050452452,16.09585768919658,15.428145290012955, 14.800605964649103, 15.990949424635108,15.869365567783806,15.872474774329305, 15.769421394877273, 15.05749135510839],
+}
 
-#
-
+# Years for the x-axis
+years = ["2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023"]
 
-#
-
-    values = all_values[i]
-    fig.add_trace(go.Scatter(x=ranges, y=values, mode='lines+markers', name=year))
+# Create a plotly figure
+fig11 = go.Figure()
 
-#
-
-
-
-
-
-
+# Add a trace for each bucket
+for bucket, values in data.items():
+    fig11.add_trace(go.Scatter(x=years, y=values, mode='lines', name=bucket))
+
+# Update layout for better presentation
+fig11.update_layout(
+    title='Perplexity Versus Buckets for Different Years',
+    xaxis_title='Year',
+    yaxis_title='Perplexity',
+    xaxis_tickangle=-45,
+    legend_title="Buckets",
 )
 
-Perplexity_Across_Different_Buckets_global_graph =
+Perplexity_Across_Different_Buckets_global_graph = fig11
 
 ##graph 2
 
 # Data
-years = ["2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023"]
-buckets = ["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"]
 data = {
-    "
-    "
-    "
-    "
-    "
-    "
-    "2020": [16.98821894111693, 15.936494557783181, 14.79960386342691, 14.435682562274105, 14.58651834886038, 15.869365567783806],
-    "2021": [17.125795647512877, 15.780419457145868, 14.631430892394002, 14.276477514399625, 14.337146941773641, 15.872474774329305],
-    "2022": [16.573462144306383, 15.283018703313582, 14.378277745163881, 14.0611924390084, 13.9886330091318, 15.769421394877273],
-    "2023": [15.4293630385597, 14.608379914730168, 14.118271697056592, 13.880215644749589, 13.767106666731275, 15.05749135510839]
+    "1-1": [17.410227605477868, 17.446573602753478,17.307221780905284,17.338525603992114,17.08551151136689,16.818363305107052,16.98821894111693, 17.125795647512877,16.573462144306383, 15.4293630385597],
+    "2-5": [16.11176217183986,16.14852530113782,16.297702171159543,15.960924352297502,16.187802102106698,16.474269837858706,15.936494557783181, 15.780419457145868,15.283018703313582, 14.608379914730168],
+    "6-10": [15.632757662414805,15.627408549576069,15.948641884223639,15.912187993988933,14.935072408852303,14.944741674400241,14.79960386342691,14.631430892394002, 14.378277745163881,14.118271697056592],
+    "11-100": [15.446116676532212,15.0055028132117,14.799690714225637,14.822102470001267,14.832038213200583,14.568394784374943,14.435682562274105,14.276477514399625, 14.0611924390084,13.880215644749589],
+    "101-1000": [16.716943171826703,15.565430373421485,14.935989931859659,14.778913482337416,14.508674264491997,14.690158822673334,14.58651834886038,14.337146941773641,13.9886330091318,13.767106666731275],
+    "1001-30000000": [18.156821563322765,17.314701050452452,16.09585768919658,15.428145290012955, 14.800605964649103, 15.990949424635108,15.869365567783806,15.872474774329305, 15.769421394877273, 15.05749135510839],
 }
-
 # Create a line plot for each bucket
-
+# Years
+years = ["2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023"]
 
-
-
-
+# Create the figure
+fig22 = go.Figure()
+
+# Add lines for each bucket
+for bucket, perplexities in data.items():
+    fig22.add_trace(go.Scatter(x=years, y=perplexities, mode='lines+markers', name=bucket))
 
 # Update layout
-
-    title="
+fig22.update_layout(
+    title="Perplexity Across Different Years (Global)",
     xaxis_title="Year",
     yaxis_title="Average Perplexity",
-    legend_title="
-    hovermode="x unified"
+    legend_title="Bucket (duplicate count range)"
 )
 
-# Show
-
+# Show the figure
+graph2222 = fig22
+
 
 #graph 3 tbd
 
@@ -277,8 +265,11 @@ data = {
"y": [15.4293630385597, 14.827776633211421, 14.600432832118155, 14.43330043760322, 14.30586483681026, 14.203397641081045, 14.140361413924607, 14.101673126860582, 14.05840021694595, 14.033693337279875, 13.990003714671388, 14.001106927608756, 13.982387676044238, 13.960424890216352, 13.941694305252629, 13.928958405693843, 13.922871327026984, 13.90749356497257, 13.883187320364065, 13.870538613453949, 13.853682922141118, 13.839326154723096, 13.841949693311191, 13.851284862386178, 13.853606323578846, 13.851840301257587, 13.86166226046842, 13.872513892742713, 13.867295530090015, 13.870385212514083, 13.868578068850889, 13.848685425568009, 13.838736750620761, 13.825182992628129, 13.795340100698754, 13.809629502148145, 13.767911089744738, 13.75599639262174, 13.75925151191706, 13.751975433642748, 13.731931931502134, 13.714316407794309, 13.730848265421725, 13.681606796845, 13.670846152397202, 13.685040324991581, 13.687292733648798, 13.645627677729081, 13.635021438002346, 13.670103747374988, 13.62246956240464, 13.658574692934657, 13.65219324836813, 13.663836335892329, 13.666948307267594, 13.642643510414398, 13.672961601819406, 13.663525877548398, 13.667436573958156, 13.687522639036205, 13.647544546546508, 13.670704172336292, 13.643000490240736, 13.667107110344569, 13.660218070172265, 13.675550822990397, 13.59092599672469, 13.619850375757148, 13.651551988902462, 13.684288098100867, 13.599046453546292, 13.580905963181452, 13.620956017533885, 13.648408527460056, 13.60249514150298, 13.568078301644128, 13.6145797181229, 13.647719674647586, 13.531493703586886, 13.609260600121146, 13.687447710013837, 13.602215210547463, 13.563220813507392, 13.57176728376717, 13.664222431811334, 13.586259696651297, 13.571127927324502, 13.692682818016978, 13.599437831977406, 13.646024625160337, 13.645231022004468, 13.582760709524955, 13.597408008025921, 13.640791164279545, 13.58016693386471, 13.664139165839629, 13.553901320176095, 13.616471379535914, 13.574418885444663, 13.62049913139043, 13.595296717779055, 13.618520397945241, 13.619083853308746, 13.549404239296155, 13.515187660214737, 13.728226318739061, 13.618609127026419, 13.638021829042664, 13.684199453984574, 13.707676555543845, 13.541617144639595, 13.703834138276244, 13.519323579997998, 13.6031555746482, 13.56414632339735, 13.711610914557605, 13.529701508251849, 13.631734977501925, 13.657120297408555, 13.58489249685537, 13.64403630157245, 13.608235747585796, 13.512110498832085, 13.846959297419959, 13.488917791046676, 13.696834245583917, 13.61507766454769, 13.565377802829648, 13.790197786574588, 13.564346823139237, 13.5342157764538, 13.65148446769962, 13.570646849759997, 13.538878864721305, 13.608980817839505, 13.664477237928127, 13.532411719398455, 13.737437273256564, 13.519816331488395, 13.764368990604515, 13.653588472445573, 13.5302540795639, 13.57561188810073, 13.647608383912683, 13.728643890026184, 13.710097008284352, 13.75589344505474, 13.682502650080487, 13.684339202206958, 13.86749342599449, 13.645786331176726, 13.660817039716257, 13.638011389586667, 13.732387353692099, 13.687686391049773, 13.94728981311995, 13.614547412507926, 13.460552695752389, 13.857074456574574, 13.660326212827487, 13.718393121459215, 13.76973846040823, 13.778762864123724, 13.656959991670522, 14.214462072816556, 13.622739420457773, 13.645227120727897, 13.687497326712435, 13.649272554141808, 13.503550000252996, 14.052494437028535, 13.580311058963396, 14.14017314736784, 13.64182000911326, 13.886936632170794, 13.942674178813384, 13.801754161915504, 13.931144818346747, 13.584651322907064, 
13.960391830857033, 14.178873989248789, 13.645923431288137, 13.97423140921055, 13.669569353257327, 13.915331700422486, 13.827145087821776, 13.736836827537942, 13.74766543270326, 13.770318536193258, 13.721033560006754, 13.710191655241298, 13.72744974500201, 13.845900599098753, 13.678446729208007, 13.902643304888189, 13.885717101405227, 13.800092557910519, 13.421930828385424, 14.22006344809192, 13.879986040380091, 14.134020605148754, 13.828712030750555, 13.713190050100788, 13.670401789346878, 13.948423234333882, 13.84481181587886, 13.818908981670926, 13.74582730003707, 13.94097096205736, 14.180988587482524, 13.792218775776533, 13.684855627228844, 13.881739174231223, 13.998746647008945, 14.576445279868192, 13.977974630190932, 13.537548783758675, 13.793279757360363, 13.712674433009878, 13.758696209000801, 13.912382937440464, 14.340144810498328, 14.03279240397904, 13.889996587251982, 13.862065986515601, 13.712302898403514, 13.797613183843772, 13.847295441275936, 14.422108283887903, 13.900169262027397, 14.247783256416513, 14.02056653491548, 13.99526374998638, 13.69358071050138, 13.828640744267325, 14.01321596237364, 13.824657271537166, 13.683602248925892, 13.933987455146445, 13.993921243916123, 13.720966315955001, 13.890770809341586, 13.426961283787957, 13.789886238259456, 13.904131224083592, 13.697354217113691, 13.710497864728065, 14.683006531539148, 14.315759264484964, 14.034699751802846, 13.890948305458895, 14.038907287423818, 13.298757873288494, 14.037824844116592, 14.162250808384407, 13.947058195824253, 13.806653590867487, 13.980386450979841, 13.667273172120268, 14.201737731565034, 14.447849496372461, 14.018697147254176, 14.002315381014313, 14.15231239942244, 13.73719042552595, 14.033594655653038, 14.03087108296539, 13.782578520604822, 13.660908086958573, 14.205910336717215, 13.930322566343703, 14.300591211697906, 13.808749910964323, 13.8076208806989, 14.169819824542373, 14.067537576689853, 14.306124453553517, 13.56827905088618, 13.592634469105525, 13.789396713937975, 13.853464456609156, 13.855768885291056, 13.710464058237172, 13.641314331273772, 14.295528532894258, 14.04977108393012, 14.153384726345907, 13.385524853965638, 13.484650040422745, 13.831061312642468, 14.15298734516191, 14.20289531849528, 14.116529526532855, 13.767652893233096, 13.715924776449405, 14.649027392365024, 13.603310028324861, 14.10228343978809, 13.704762155010586, 13.731715455443299],
},
}
+
+fig33 = go.Figure()
+
 for year, year_data in data.items():
-
+    fig33.add_trace(go.Scatter(
         x=year_data['x'],
         y=year_data['y'],
         mode='lines+markers',
@@ -287,7 +278,7 @@ for year, year_data in data.items():
     ))
 
 # Update layout
-
+fig33.update_layout(
     title="Perplexity vs. Number of Duplicate Documents Over Years",
     xaxis_title="Number of Duplicate Documents",
     yaxis_title="Average Perplexity",
@@ -295,7 +286,7 @@ fig.update_layout(
     hovermode="x unified"
 )
 
-graph3 =
+graph3 = fig33
 
 ##graph 4
 
@@ -839,6 +830,8 @@ intro_div = Div(
     Ul(
         Li("The Learning Curve of TxT360 with an Upsampling Recipe", style = "margin-bottom: 5px"),
         Li("Perplexity Analysis across time", style = "margin-bottom: 5px"),
+        Li("Topic Analysis on Data Cluster Groups", style = "margin-bottom: 5px"),
+        Li(B("Estimated Reading Time: 15 minutes"), style = "margin-bottom: 5px"),
     ),
 )
 
@@ -910,16 +903,15 @@ perp1_div = Div(
     Section(
         H3("Perplexity vs Years"),
         P("Taking the same data, we can convert it into a graph indicating the yearly trend. For most buckets, the average perplexity of dumps from more recent years seem to be lower than that of former years."),
-        Img(src="images/prep-across-diff-year-global-dup-buckets.png", height = "300", width = "600" ),
-        plotly2fasthtml(
-
+        #Img(src="images/prep-across-diff-year-global-dup-buckets.png", height = "300", width = "600" ),
+        plotly2fasthtml(graph2222),
+
     ),
     Section(
         H3("Perplexity vs Document Duplication"),
         P("We can also break each bucket into distinct document counts. The graph becomes a bit noisy at the end because of insufficient samples with larger duplication counts."),
-        Img(src="images/prep-across-diff-docs-dup-count-global.png", height = "300", width = "600" ),
+        #Img(src="images/prep-across-diff-docs-dup-count-global.png", height = "300", width = "600" ),
         plotly2fasthtml(graph3),
-        P("NEED TO UPDATE - THIS GRAPH SHOULD MATCH THE IMAGE ABOVE AND BUCKET SHOULD NOT BE a LINE OPTION"),
     ),
     Section(
         H3("Perplexity vs Dump Duplication"),
@@ -974,6 +966,51 @@ llama_div = Div(
     ),
 )
 
+with open(os.path.join(os.path.dirname(__file__), "data", "topic_charts.json"), 'r') as f:
+    topic_charts = json.load(f)
+topic_graphs = []
+
+for title, data in topic_charts:
+    if data["type"] == "barh":
+        topic_graphs.append(go.Figure(go.Bar(
+            x=data["kwargs"]["width"],
+            y=data["kwargs"]['y'],
+            orientation='h',
+            marker_color=[
+                "rgb(" + ", ".join(str(val * 255) for val in rgb) + ')'
+                for rgb in data["kwargs"]["color"]
+            ]
+        )))
+    elif data["type"] == "pie":
+        topic_graphs.append(go.Figure(go.Pie(
+            values=data["kwargs"]['x'],
+            labels=data["kwargs"]["labels"],
+            marker_colors=[
+                "rgb(" + ", ".join(str(val * 255) for val in rgb) + ')'
+                for rgb in data["kwargs"]["colors"]
+            ]
+        )))
+
+cluster_div = Div(
+    Section(
+        H2("Topic Analysis"),
+        P("We tried to classify data into topic groups and looked for correlations between topics and statistics of data. Data from different topic groups should manifest different characteristics of distribution, which can give us some insight into the composition of dataset."),
+        H3("Methodology"),
+        P("We took the ", A("common crawl", href="https://commoncrawl.org/"), " data and clustered them into 17 topic groups using ", A("BERTopic", href="https://maartengr.github.io/BERTopic/index.html"), ". We collected and aggregated a series of metrics which include quality signals and other useful metadata. For each topic group, we calculated average scores and generated the corresponding bar charts over different metrics for comparison and analysis."),
+        H3("Cluster Groups"),
+        P("We grouped data into the following 17 clusters"),
+        Ul(*(
+            Li(topic_name, style = "margin-bottom: 5px")
+            for topic_name in ("Arts", "Business & Economics & Finance", "Culture & Cultural geography", "Daily Life & Home & Lifestyle", "Education", "Entertainment & Travel & Hobby", "Environment", "Food & Drink & Cooking", "Health & Wellness & Medicine", "Law & Justice", "Natural Science & Formal Science & Technology", "Personal Development & Human Resources & Career", "Politics & Government", "Religion & Spirituality", "Shopping & Commodity", "Society & Social Issues & Human Rights", "Sports")
+        )),
+        H3("Results Analysis"),
+        *(
+            Section(H4(title), plotly2fasthtml(topic_graphs[i]), P(data.get("comment", '')))
+            for i, (title, data) in enumerate(topic_charts)
+        )
+    )
+)
+
 
 def results():
     return Div(
@@ -995,6 +1032,10 @@ def results():
         ),
         Section(
             llama_div,
+        ),
+        Section(
+            cluster_div,
+            id="section55"
         ),
         id="inner-text"
     )
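The loader added above turns each saved matplotlib-style chart spec into a Plotly figure before handing it to plotly2fasthtml, converting 0-1 RGB float triples into "rgb(r, g, b)" strings along the way. Here is a self-contained sketch of that conversion that can be run outside the Space; the example spec is the same hypothetical one shown after the data/topic_charts.json entry, and rounding the channel values is a choice made in this sketch rather than something done in the PR.

    # Standalone sketch of the barh/pie -> Plotly conversion done in results.py.
    import plotly.graph_objects as go

    def to_rgb_string(rgb):
        # matplotlib-style floats in [0, 1] -> Plotly "rgb(r, g, b)" string.
        return "rgb(" + ", ".join(str(round(val * 255)) for val in rgb) + ")"

    def spec_to_figure(spec):
        kwargs = spec["kwargs"]
        if spec["type"] == "barh":
            return go.Figure(go.Bar(
                x=kwargs["width"], y=kwargs["y"], orientation="h",
                marker_color=[to_rgb_string(rgb) for rgb in kwargs["color"]],
            ))
        if spec["type"] == "pie":
            return go.Figure(go.Pie(
                values=kwargs["x"], labels=kwargs["labels"],
                marker_colors=[to_rgb_string(rgb) for rgb in kwargs["colors"]],
            ))
        raise ValueError("unsupported chart type: " + spec["type"])

    fig = spec_to_figure({
        "type": "barh",
        "kwargs": {"y": ["Arts", "Sports"], "width": [0.42, 0.57],
                   "color": [[0.1, 0.2, 0.8], [0.9, 0.3, 0.1]]},
    })
    fig.show()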
web.py
CHANGED

@@ -376,6 +376,7 @@ def web_data():
     return Div(
         Section(
             Div(
+                H1("Web Data Processing"),
                 H2("Common Crawl Snapshot Processing"),
                 H3("What This Section Contains"),
                 P("This section provides a complete discussion on the filtering applied to the 99 Common Crawl snapshots that comprise the web data section of TxT360. The section is split into the following topic areas: "),
@@ -387,6 +388,7 @@ def web_data():
                 Li("Each section is complete with code and comparisons to Dolma,", D_cite(bibtex_key="soldaini2024dolma"),
                     "DataTrove,", D_cite(bibtex_key="penedo2024datatrove"),
                     "and/or RedPajama-V-2", D_cite(bibtex_key="redpajama-v2"), style = "margin-bottom: 5px"),
+                Li(B("Estimated Reading Time: 31 minutes"), style = "margin-bottom: 5px"),
             ),
             P("To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Below is a comprehensive list of datasets we reviewed the comparison of filters we have applied."),
         ),