victormiller committed on
Commit
b9f2fad
1 Parent(s): 2e40103

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +33 -631
curated.py CHANGED
@@ -57,638 +57,40 @@ fig = px.treemap(treemap_data, path=['Category', 'Source'], values='Count', hove
57
  # Display treemap if you want to update the size.update_layout(width=800, height=600)
58
  treemap_chart = fig
59
 
60
- data = {
61
- 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
62
- 'Wikipedia': [61614907, 61614907, 60468491, 60468491]
63
- }
64
-
65
- # Creating a dataframe
66
- df = pd.DataFrame(data)
67
-
68
- # Creating the stacked bar chart
69
- fig = go.Figure()
70
-
71
- # Add trace for each dataset
72
- for dataset in df.columns[1:]:
73
- fig.add_trace(go.Bar(
74
- name=dataset,
75
- x=df['Filter'],
76
- y=df[dataset]
77
- ))
78
-
79
- # Update the layout
80
- fig.update_layout(
81
- barmode='group',
82
- title='Wikipedia Bar Chart of Line Reductions by Filter for Each Dataset',
83
- xaxis_title='Filter',
84
- yaxis_title='Number of Lines',
85
- legend_title='Dataset',
86
- height=600,
87
- width=1000
88
- )
89
-
90
- # Show the plot
91
- wikipedia_bar = fig
92
-
93
- data = {
94
- 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
95
- 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
96
- 'Freelaw': [75971288, 73690766, 68171834, 68123174],
97
- 'DM Maths': [112559888, 112559888, 112559888, 112559888],
98
- 'USPTO': [6880276, 6878964, 6749922, 6749389],
99
- 'PG19': [28752, 28683, 28682, 28632],
100
- 'Hackernews': [2064931, 2010802, 2010488, 2003636],
101
- 'Ubuntu IRC': [37966, 23501, 23468, 23205],
102
- 'Europarl': [69814, 69814, 69814, 69814],
103
- 'StackExchange': [23246548, 23246548, 23246352, 23246352],
104
- 'Arxiv': [1911867, 1869441, 1763840, 1762661],
105
- 'S2ORC': [12963563, 12963563, 12963563, 12963563],
106
- 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
107
- 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
108
- 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
109
- 'Phil Papers': [49389, 39175, 39175, 39128]
110
- }
111
-
112
- # Creating a dataframe
113
- df = pd.DataFrame(data)
114
-
115
- # Creating the stacked bar chart
116
- fig = go.Figure()
117
-
118
- # Add trace for each dataset
119
- for dataset in df.columns[1:]:
120
- fig.add_trace(go.Bar(
121
- name=dataset,
122
- x=df['Filter'],
123
- y=df[dataset]
124
- ))
125
-
126
- # Update the layout
127
- fig.update_layout(
128
- barmode='group',
129
- title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
130
- xaxis_title='Filter',
131
- yaxis_title='Number of Lines',
132
- legend_title='Dataset',
133
- height=600,
134
- width=1000
135
- )
136
-
137
- # Show the plot
138
- freelaw_bar = fig
139
-
140
- data = {
141
- 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
142
- 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
143
- 'Freelaw': [75971288, 73690766, 68171834, 68123174],
144
- 'DM Maths': [112559888, 112559888, 112559888, 112559888],
145
- 'USPTO': [6880276, 6878964, 6749922, 6749389],
146
- 'PG19': [28752, 28683, 28682, 28632],
147
- 'Hackernews': [2064931, 2010802, 2010488, 2003636],
148
- 'Ubuntu IRC': [37966, 23501, 23468, 23205],
149
- 'Europarl': [69814, 69814, 69814, 69814],
150
- 'StackExchange': [23246548, 23246548, 23246352, 23246352],
151
- 'Arxiv': [1911867, 1869441, 1763840, 1762661],
152
- 'S2ORC': [12963563, 12963563, 12963563, 12963563],
153
- 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
154
- 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
155
- 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
156
- 'Phil Papers': [49389, 39175, 39175, 39128]
157
- }
158
-
159
- # Creating a dataframe
160
- df = pd.DataFrame(data)
161
-
162
- # Creating the stacked bar chart
163
- fig = go.Figure()
164
-
165
- # Add trace for each dataset
166
- for dataset in df.columns[1:]:
167
- fig.add_trace(go.Bar(
168
- name=dataset,
169
- x=df['Filter'],
170
- y=df[dataset]
171
- ))
172
-
173
- # Update the layout
174
- fig.update_layout(
175
- barmode='stack',
176
- title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
177
- xaxis_title='Filter',
178
- yaxis_title='Number of Lines',
179
- legend_title='Dataset',
180
- height=600,
181
- width=1000
182
- )
183
-
184
- # Show the plot
185
- diff2_stacked_bar = fig
186
- data = {
187
- 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
188
- 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
189
- 'Freelaw': [75971288, 73690766, 68171834, 68123174],
190
- 'DM Maths': [112559888, 112559888, 112559888, 112559888],
191
- 'USPTO': [6880276, 6878964, 6749922, 6749389],
192
- 'PG19': [28752, 28683, 28682, 28632],
193
- 'Hackernews': [2064931, 2010802, 2010488, 2003636],
194
- 'Ubuntu IRC': [37966, 23501, 23468, 23205],
195
- 'Europarl': [69814, 69814, 69814, 69814],
196
- 'StackExchange': [23246548, 23246548, 23246352, 23246352],
197
- 'Arxiv': [1911867, 1869441, 1763840, 1762661],
198
- 'S2ORC': [12963563, 12963563, 12963563, 12963563],
199
- 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
200
- 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
201
- 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
202
- 'Phil Papers': [49389, 39175, 39175, 39128]
203
- }
204
-
205
- # Creating a dataframe
206
- df = pd.DataFrame(data)
207
-
208
- # Creating the stacked bar chart
209
- fig = go.Figure()
210
-
211
- # Add trace for each dataset
212
- for dataset in df.columns[1:]:
213
- fig.add_trace(go.Bar(
214
- name=dataset,
215
- x=df['Filter'],
216
- y=df[dataset]
217
- ))
218
-
219
- # Update the layout
220
- fig.update_layout(
221
- barmode='stack',
222
- title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
223
- xaxis_title='Filter',
224
- yaxis_title='Number of Lines',
225
- legend_title='Dataset',
226
- height=600,
227
- width=1000
228
- )
229
-
230
- # Show the plot
231
- diff2_stacked_bar = fig
232
- data = {
233
- 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
234
- 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
235
- 'Freelaw': [75971288, 73690766, 68171834, 68123174],
236
- 'DM Maths': [112559888, 112559888, 112559888, 112559888],
237
- 'USPTO': [6880276, 6878964, 6749922, 6749389],
238
- 'PG19': [28752, 28683, 28682, 28632],
239
- 'Hackernews': [2064931, 2010802, 2010488, 2003636],
240
- 'Ubuntu IRC': [37966, 23501, 23468, 23205],
241
- 'Europarl': [69814, 69814, 69814, 69814],
242
- 'StackExchange': [23246548, 23246548, 23246352, 23246352],
243
- 'Arxiv': [1911867, 1869441, 1763840, 1762661],
244
- 'S2ORC': [12963563, 12963563, 12963563, 12963563],
245
- 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
246
- 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
247
- 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
248
- 'Phil Papers': [49389, 39175, 39175, 39128]
249
- }
250
-
251
- # Creating a dataframe
252
- df = pd.DataFrame(data)
253
-
254
- # Creating the stacked bar chart
255
- fig = go.Figure()
256
-
257
- # Add trace for each dataset
258
- for dataset in df.columns[1:]:
259
- fig.add_trace(go.Bar(
260
- name=dataset,
261
- x=df['Filter'],
262
- y=df[dataset]
263
- ))
264
-
265
- # Update the layout
266
- fig.update_layout(
267
- barmode='stack',
268
- title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
269
- xaxis_title='Filter',
270
- yaxis_title='Number of Lines',
271
- legend_title='Dataset',
272
- height=600,
273
- width=1000
274
- )
275
-
276
- # Show the plot
277
- diff2_stacked_bar = fig
278
- data = {
279
- 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
280
- 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
281
- 'Freelaw': [75971288, 73690766, 68171834, 68123174],
282
- 'DM Maths': [112559888, 112559888, 112559888, 112559888],
283
- 'USPTO': [6880276, 6878964, 6749922, 6749389],
284
- 'PG19': [28752, 28683, 28682, 28632],
285
- 'Hackernews': [2064931, 2010802, 2010488, 2003636],
286
- 'Ubuntu IRC': [37966, 23501, 23468, 23205],
287
- 'Europarl': [69814, 69814, 69814, 69814],
288
- 'StackExchange': [23246548, 23246548, 23246352, 23246352],
289
- 'Arxiv': [1911867, 1869441, 1763840, 1762661],
290
- 'S2ORC': [12963563, 12963563, 12963563, 12963563],
291
- 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
292
- 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
293
- 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
294
- 'Phil Papers': [49389, 39175, 39175, 39128]
295
- }
296
-
297
- # Creating a dataframe
298
- df = pd.DataFrame(data)
299
-
300
- # Creating the stacked bar chart
301
- fig = go.Figure()
302
-
303
- # Add trace for each dataset
304
- for dataset in df.columns[1:]:
305
- fig.add_trace(go.Bar(
306
- name=dataset,
307
- x=df['Filter'],
308
- y=df[dataset]
309
- ))
310
-
311
- # Update the layout
312
- fig.update_layout(
313
- barmode='stack',
314
- title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
315
- xaxis_title='Filter',
316
- yaxis_title='Number of Lines',
317
- legend_title='Dataset',
318
- height=600,
319
- width=1000
320
- )
321
-
322
- # Show the plot
323
- diff2_stacked_bar = fig
324
- data = {
325
- 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
326
- 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
327
- 'Freelaw': [75971288, 73690766, 68171834, 68123174],
328
- 'DM Maths': [112559888, 112559888, 112559888, 112559888],
329
- 'USPTO': [6880276, 6878964, 6749922, 6749389],
330
- 'PG19': [28752, 28683, 28682, 28632],
331
- 'Hackernews': [2064931, 2010802, 2010488, 2003636],
332
- 'Ubuntu IRC': [37966, 23501, 23468, 23205],
333
- 'Europarl': [69814, 69814, 69814, 69814],
334
- 'StackExchange': [23246548, 23246548, 23246352, 23246352],
335
- 'Arxiv': [1911867, 1869441, 1763840, 1762661],
336
- 'S2ORC': [12963563, 12963563, 12963563, 12963563],
337
- 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
338
- 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
339
- 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
340
- 'Phil Papers': [49389, 39175, 39175, 39128]
341
- }
342
-
343
- # Creating a dataframe
344
- df = pd.DataFrame(data)
345
-
346
- # Creating the stacked bar chart
347
- fig = go.Figure()
348
-
349
- # Add trace for each dataset
350
- for dataset in df.columns[1:]:
351
- fig.add_trace(go.Bar(
352
- name=dataset,
353
- x=df['Filter'],
354
- y=df[dataset]
355
- ))
356
-
357
- # Update the layout
358
- fig.update_layout(
359
- barmode='stack',
360
- title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
361
- xaxis_title='Filter',
362
- yaxis_title='Number of Lines',
363
- legend_title='Dataset',
364
- height=600,
365
- width=1000
366
- )
367
-
368
- # Show the plot
369
- diff2_stacked_bar = fig
370
- data = {
371
- 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
372
- 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
373
- 'Freelaw': [75971288, 73690766, 68171834, 68123174],
374
- 'DM Maths': [112559888, 112559888, 112559888, 112559888],
375
- 'USPTO': [6880276, 6878964, 6749922, 6749389],
376
- 'PG19': [28752, 28683, 28682, 28632],
377
- 'Hackernews': [2064931, 2010802, 2010488, 2003636],
378
- 'Ubuntu IRC': [37966, 23501, 23468, 23205],
379
- 'Europarl': [69814, 69814, 69814, 69814],
380
- 'StackExchange': [23246548, 23246548, 23246352, 23246352],
381
- 'Arxiv': [1911867, 1869441, 1763840, 1762661],
382
- 'S2ORC': [12963563, 12963563, 12963563, 12963563],
383
- 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
384
- 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
385
- 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
386
- 'Phil Papers': [49389, 39175, 39175, 39128]
387
- }
388
-
389
- # Creating a dataframe
390
- df = pd.DataFrame(data)
391
-
392
- # Creating the stacked bar chart
393
- fig = go.Figure()
394
-
395
- # Add trace for each dataset
396
- for dataset in df.columns[1:]:
397
- fig.add_trace(go.Bar(
398
- name=dataset,
399
- x=df['Filter'],
400
- y=df[dataset]
401
- ))
402
-
403
- # Update the layout
404
- fig.update_layout(
405
- barmode='stack',
406
- title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
407
- xaxis_title='Filter',
408
- yaxis_title='Number of Lines',
409
- legend_title='Dataset',
410
- height=600,
411
- width=1000
412
- )
413
-
414
- # Show the plot
415
- diff2_stacked_bar = fig
416
- data = {
417
- 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
418
- 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
419
- 'Freelaw': [75971288, 73690766, 68171834, 68123174],
420
- 'DM Maths': [112559888, 112559888, 112559888, 112559888],
421
- 'USPTO': [6880276, 6878964, 6749922, 6749389],
422
- 'PG19': [28752, 28683, 28682, 28632],
423
- 'Hackernews': [2064931, 2010802, 2010488, 2003636],
424
- 'Ubuntu IRC': [37966, 23501, 23468, 23205],
425
- 'Europarl': [69814, 69814, 69814, 69814],
426
- 'StackExchange': [23246548, 23246548, 23246352, 23246352],
427
- 'Arxiv': [1911867, 1869441, 1763840, 1762661],
428
- 'S2ORC': [12963563, 12963563, 12963563, 12963563],
429
- 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
430
- 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
431
- 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
432
- 'Phil Papers': [49389, 39175, 39175, 39128]
433
- }
434
-
435
- # Creating a dataframe
436
- df = pd.DataFrame(data)
437
-
438
- # Creating the stacked bar chart
439
- fig = go.Figure()
440
-
441
- # Add trace for each dataset
442
- for dataset in df.columns[1:]:
443
- fig.add_trace(go.Bar(
444
- name=dataset,
445
- x=df['Filter'],
446
- y=df[dataset]
447
- ))
448
-
449
- # Update the layout
450
- fig.update_layout(
451
- barmode='stack',
452
- title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
453
- xaxis_title='Filter',
454
- yaxis_title='Number of Lines',
455
- legend_title='Dataset',
456
- height=600,
457
- width=1000
458
- )
459
-
460
- # Show the plot
461
- diff2_stacked_bar = fig
462
- data = {
463
- 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
464
- 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
465
- 'Freelaw': [75971288, 73690766, 68171834, 68123174],
466
- 'DM Maths': [112559888, 112559888, 112559888, 112559888],
467
- 'USPTO': [6880276, 6878964, 6749922, 6749389],
468
- 'PG19': [28752, 28683, 28682, 28632],
469
- 'Hackernews': [2064931, 2010802, 2010488, 2003636],
470
- 'Ubuntu IRC': [37966, 23501, 23468, 23205],
471
- 'Europarl': [69814, 69814, 69814, 69814],
472
- 'StackExchange': [23246548, 23246548, 23246352, 23246352],
473
- 'Arxiv': [1911867, 1869441, 1763840, 1762661],
474
- 'S2ORC': [12963563, 12963563, 12963563, 12963563],
475
- 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
476
- 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
477
- 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
478
- 'Phil Papers': [49389, 39175, 39175, 39128]
479
- }
480
-
481
- # Creating a dataframe
482
- df = pd.DataFrame(data)
483
-
484
- # Creating the stacked bar chart
485
- fig = go.Figure()
486
-
487
- # Add trace for each dataset
488
- for dataset in df.columns[1:]:
489
- fig.add_trace(go.Bar(
490
- name=dataset,
491
- x=df['Filter'],
492
- y=df[dataset]
493
- ))
494
-
495
- # Update the layout
496
- fig.update_layout(
497
- barmode='stack',
498
- title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
499
- xaxis_title='Filter',
500
- yaxis_title='Number of Lines',
501
- legend_title='Dataset',
502
- height=600,
503
- width=1000
504
- )
505
-
506
- # Show the plot
507
- diff2_stacked_bar = fig
508
- data = {
509
- 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
510
- 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
511
- 'Freelaw': [75971288, 73690766, 68171834, 68123174],
512
- 'DM Maths': [112559888, 112559888, 112559888, 112559888],
513
- 'USPTO': [6880276, 6878964, 6749922, 6749389],
514
- 'PG19': [28752, 28683, 28682, 28632],
515
- 'Hackernews': [2064931, 2010802, 2010488, 2003636],
516
- 'Ubuntu IRC': [37966, 23501, 23468, 23205],
517
- 'Europarl': [69814, 69814, 69814, 69814],
518
- 'StackExchange': [23246548, 23246548, 23246352, 23246352],
519
- 'Arxiv': [1911867, 1869441, 1763840, 1762661],
520
- 'S2ORC': [12963563, 12963563, 12963563, 12963563],
521
- 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
522
- 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
523
- 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
524
- 'Phil Papers': [49389, 39175, 39175, 39128]
525
- }
526
-
527
- # Creating a dataframe
528
- df = pd.DataFrame(data)
529
-
530
- # Creating the stacked bar chart
531
- fig = go.Figure()
532
-
533
- # Add trace for each dataset
534
- for dataset in df.columns[1:]:
535
- fig.add_trace(go.Bar(
536
- name=dataset,
537
- x=df['Filter'],
538
- y=df[dataset]
539
- ))
540
-
541
- # Update the layout
542
- fig.update_layout(
543
- barmode='stack',
544
- title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
545
- xaxis_title='Filter',
546
- yaxis_title='Number of Lines',
547
- legend_title='Dataset',
548
- height=600,
549
- width=1000
550
- )
551
-
552
- # Show the plot
553
- diff2_stacked_bar = fig
554
- data = {
555
- 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
556
- 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
557
- 'Freelaw': [75971288, 73690766, 68171834, 68123174],
558
- 'DM Maths': [112559888, 112559888, 112559888, 112559888],
559
- 'USPTO': [6880276, 6878964, 6749922, 6749389],
560
- 'PG19': [28752, 28683, 28682, 28632],
561
- 'Hackernews': [2064931, 2010802, 2010488, 2003636],
562
- 'Ubuntu IRC': [37966, 23501, 23468, 23205],
563
- 'Europarl': [69814, 69814, 69814, 69814],
564
- 'StackExchange': [23246548, 23246548, 23246352, 23246352],
565
- 'Arxiv': [1911867, 1869441, 1763840, 1762661],
566
- 'S2ORC': [12963563, 12963563, 12963563, 12963563],
567
- 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
568
- 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
569
- 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
570
- 'Phil Papers': [49389, 39175, 39175, 39128]
571
- }
572
-
573
- # Creating a dataframe
574
- df = pd.DataFrame(data)
575
-
576
- # Creating the stacked bar chart
577
- fig = go.Figure()
578
-
579
- # Add trace for each dataset
580
- for dataset in df.columns[1:]:
581
- fig.add_trace(go.Bar(
582
- name=dataset,
583
- x=df['Filter'],
584
- y=df[dataset]
585
- ))
586
-
587
- # Update the layout
588
- fig.update_layout(
589
- barmode='stack',
590
- title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
591
- xaxis_title='Filter',
592
- yaxis_title='Number of Lines',
593
- legend_title='Dataset',
594
- height=600,
595
- width=1000
596
- )
597
-
598
- # Show the plot
599
- diff2_stacked_bar = fig
600
- data = {
601
- 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
602
- 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
603
- 'Freelaw': [75971288, 73690766, 68171834, 68123174],
604
- 'DM Maths': [112559888, 112559888, 112559888, 112559888],
605
- 'USPTO': [6880276, 6878964, 6749922, 6749389],
606
- 'PG19': [28752, 28683, 28682, 28632],
607
- 'Hackernews': [2064931, 2010802, 2010488, 2003636],
608
- 'Ubuntu IRC': [37966, 23501, 23468, 23205],
609
- 'Europarl': [69814, 69814, 69814, 69814],
610
- 'StackExchange': [23246548, 23246548, 23246352, 23246352],
611
- 'Arxiv': [1911867, 1869441, 1763840, 1762661],
612
- 'S2ORC': [12963563, 12963563, 12963563, 12963563],
613
- 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
614
- 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
615
- 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
616
- 'Phil Papers': [49389, 39175, 39175, 39128]
617
- }
618
-
619
- # Creating a dataframe
620
- df = pd.DataFrame(data)
621
-
622
- # Creating the stacked bar chart
623
- fig = go.Figure()
624
-
625
- # Add trace for each dataset
626
- for dataset in df.columns[1:]:
627
- fig.add_trace(go.Bar(
628
- name=dataset,
629
- x=df['Filter'],
630
- y=df[dataset]
631
- ))
632
-
633
- # Update the layout
634
- fig.update_layout(
635
- barmode='stack',
636
- title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
637
- xaxis_title='Filter',
638
- yaxis_title='Number of Lines',
639
- legend_title='Dataset',
640
- height=600,
641
- width=1000
642
- )
643
-
644
- # Show the plot
645
- diff2_stacked_bar = fig
646
- data = {
647
- 'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
648
- 'Wikipedia': [61614907, 61614907, 60468491, 60468491],
649
- 'Freelaw': [75971288, 73690766, 68171834, 68123174],
650
- 'DM Maths': [112559888, 112559888, 112559888, 112559888],
651
- 'USPTO': [6880276, 6878964, 6749922, 6749389],
652
- 'PG19': [28752, 28683, 28682, 28632],
653
- 'Hackernews': [2064931, 2010802, 2010488, 2003636],
654
- 'Ubuntu IRC': [37966, 23501, 23468, 23205],
655
- 'Europarl': [69814, 69814, 69814, 69814],
656
- 'StackExchange': [23246548, 23246548, 23246352, 23246352],
657
- 'Arxiv': [1911867, 1869441, 1763840, 1762661],
658
- 'S2ORC': [12963563, 12963563, 12963563, 12963563],
659
- 'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
660
- 'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
661
- 'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
662
- 'Phil Papers': [49389, 39175, 39175, 39128]
663
- }
664
-
665
- # Creating a dataframe
666
- df = pd.DataFrame(data)
667
-
668
- # Creating the stacked bar chart
669
- fig = go.Figure()
670
-
671
- # Add trace for each dataset
672
- for dataset in df.columns[1:]:
673
- fig.add_trace(go.Bar(
674
- name=dataset,
675
- x=df['Filter'],
676
- y=df[dataset]
677
- ))
678
-
679
- # Update the layout
680
- fig.update_layout(
681
- barmode='stack',
682
- title='Stacked Bar Chart of Line Reductions by Filter for Each Dataset',
683
- xaxis_title='Filter',
684
- yaxis_title='Number of Lines',
685
- legend_title='Dataset',
686
- height=600,
687
- width=1000
688
- )
689
 
690
- # Show the plot
691
- diff2_stacked_bar = fig
692
 
693
 
694
  filtering_process = Div(
 
57
  # Display treemap if you want to update the size.update_layout(width=800, height=600)
58
  treemap_chart = fig
59
 
60
+ wikipedia_filter = pd.DataFrame(
61
+ {
62
+ "Dataset": [
63
+ "Wikipedia",
64
+ ],
65
+ "Lines Downloaded": [
66
+ "",
67
+ ],
68
+ "Lines Remaining After Language Filter": [
69
+ "",
70
+ ],
71
+ "Percent Removed": [
72
+ "0.00%",
73
+ ],
74
+ "Lines Remaining After Min Word Count Filter": [
75
+ "",
76
+ ],
77
+ "Percent Removed": [
78
+ "1.86%",
79
+ ],
80
+ "Lines Remaining After Unigram Probability Filter": [
81
+ "",
82
+ ],
83
+ "Percent Removed": [
84
+ "0.00%",
85
+ ],
86
+ "Total Percentage Remaining": [
87
+ "98.14%",
88
+ ],
89
+ }
90
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
+ table_html_wikipedia = dataset_comparison.to_html(index=False, border=0)
93
+ table_div_wikipedia = Div(NotStr(table_html_wikipedia), style="margin: 40px;")
94
 
95
 
96
  filtering_process = Div(