Dreamsome committed on
Commit
9bb2c84
1 Parent(s): ca6dbf0

Add application file

Files changed (2)
  1. app.py +495 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,495 @@
+ import streamlit as st
+ import requests
+ import os
+ 
+ enable_xorbits = False
+ 
+ 
+ if enable_xorbits:
+     import xorbits.pandas as pd
+     import xorbits.numpy as np
+     import xorbits
+     xorbits.init(n_worker=1, n_cpu=2)
+ else:
+     import pandas as pd
+     import numpy as np
+ 
+ st.set_page_config(page_title="Analyzing Text Corpus on Hugging Face", page_icon=":bar_chart:", layout="wide")
+ st.sidebar.title('A Tool for Analyzing Text Corpus on Hugging Face')
+ st.sidebar.markdown(
+     '''
+ This tool retrieves parquet files from Hugging Face, identifies and quantifies
+ junk data, duplication, contamination, and biased content in a dataset using pandas DataFrames,
+ and accelerates time-consuming processes using Xorbits.
+     '''
+ )
+ 
+ st.sidebar.header("Please Paste The HF Dataset Name Here:")
+ 
+ #@st.cache_data
+ def load_dataset(j, name, fraction):
+ 
+     # Download the train/validation parquet shards once and cache them locally.
+     if not os.path.exists('train.gzip'):
+         with st.spinner('Downloading file from remote server'):
+             import pandas
+             train_urls = [f['url'] for f in j['parquet_files'] if f['config'] == name and f['split'] == 'train']
+             train_dataset = pandas.concat([pandas.read_parquet(url, engine='pyarrow') for url in train_urls], ignore_index=True)
+             train_dataset.to_parquet('train.gzip')
+ 
+     if not os.path.exists('test.gzip'):
+         with st.spinner('Downloading file from remote server'):
+             import pandas
+             test_urls = [f['url'] for f in j['parquet_files'] if f['config'] == name and f['split'] == 'validation']
+             test_dataset = pandas.concat([pandas.read_parquet(url, engine='pyarrow') for url in test_urls], ignore_index=True)
+             test_dataset.to_parquet('test.gzip')
+ 
+     train_dataset = pd.read_parquet('train.gzip', engine='pyarrow')
+ 
+     test_dataset = pd.read_parquet('test.gzip', engine='pyarrow')
+ 
+     # Keep only the requested fraction of each split.
+     dataset = {
+         "train": train_dataset[:int(len(train_dataset)*fraction)],
+         "test": test_dataset[:int(len(test_dataset)*fraction)],
+     }
+ 
+     return dataset
+ 
+ 
+ def get_hugging_face_dataset(name):
+     r = requests.get("https://datasets-server.huggingface.co/parquet?dataset=" + name)
+     return r.json()
+ 
+ 
+ dataset_name = st.sidebar.text_input('Dataset Name', 'blog_authorship_corpus')
+ 
+ with st.spinner('Loading meta'):
+     hf_datasets = get_hugging_face_dataset(dataset_name)
+     subsets = set([x['config'] for x in hf_datasets['parquet_files']])
+     subset_option = st.sidebar.selectbox("Choose a subset", subsets)
+     sample_rate_option = st.sidebar.slider('Select sample rate', value=0.1, min_value=0.1, max_value=1.0, step=0.1)
+ 
+ tab0, tab1, tab2, tab3, tab4, tab5 = st.tabs(
+     ["Introduction", "Junk Data🤖", "Contamination🧹", "Short Documents🌐", "Biased Content🛡️", "Duplication🔍"])
+ with tab0:
+ 
+     st.markdown(
+         '''
+ ### Why does this matter?
+ LLMs are trained on immense datasets to gain a broader understanding of language and improve
+ their performance.
+ However, the quality of the datasets can affect the performance and biases of the models.
+ 
+ Large datasets often have quality issues, so practitioners need to clean and preprocess
+ the data to remove biases, noise, and toxicity.
+ 
+ This tool illustrates how to analyze and quantify the quality
+ of any text corpus on [Hugging Face](https://huggingface.co/blog/hub-duckdb) using pandas.
+ 
+ ### Data Preparation
+ #### 1. Retrieving parquet files from the Hugging Face Dataset Server
+ First, get the list of Parquet file URLs with a simple HTTP call.
+ ```python
+ r = requests.get("https://datasets-server.huggingface.co/parquet?dataset=blog_authorship_corpus")
+ j = r.json()
+ urls = [f['url'] for f in j['parquet_files'] if f['split'] == 'train']
+ urls
+ ['https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/blog_authorship_corpus-train-00000-of-00002.parquet',
+ 'https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/blog_authorship_corpus-train-00001-of-00002.parquet']
+ ```
+ 
+ #### 2. Read the URLs into a pandas DataFrame
+ 
+ Use the pandas library to read multiple Parquet files from a list of URLs and concatenate
+ them into a single DataFrame:
+ ```python
+ import pandas as pd
+ parts = [pd.read_parquet(url) for url in urls]
+ df = pd.concat(parts, ignore_index=True)
+ ```
+ 
+ #### 3. Addressing out-of-memory & performance issues
+ The pandas library uses in-memory data structures to store and operate on data, so if the
+ dataset you read from Hugging Face is too large to fit in memory, pandas will fail with an error.
+ We therefore use [Xorbits](https://xorbits.io) to handle larger datasets and make better use of the local CPUs.
+ 
+ 
+ The use of Xorbits is as simple as:
+ 
+ ```python
+ import xorbits.pandas as pd
+ import xorbits.numpy as np
+ ```
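+ 
+ Putting it together, a minimal sketch of the Xorbits path looks like the following
+ (the `n_worker`/`n_cpu` values mirror the ones used in this app and are only illustrative):
+ 
+ ```python
+ import xorbits
+ import xorbits.pandas as pd
+ 
+ xorbits.init(n_worker=1, n_cpu=2)                 # start a local Xorbits runtime
+ parts = [pd.read_parquet(url) for url in urls]    # same pandas-like API as above
+ df = pd.concat(parts, ignore_index=True)
+ ```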
+ 
+ ---
+     '''
+     )
+     with st.expander("View raw data"):
+         with st.spinner("Loading..."):
+             datasets = load_dataset(hf_datasets, subset_option, sample_rate_option)
+ 
+         train, test = st.tabs([
+             "Train (%d rows)" % len(datasets['train']),
+             "Test (%d rows)" % len(datasets['test'])
+         ])
+ 
+         train.dataframe(datasets['train'][:20])
+         test.dataframe(datasets['test'][:20])
+ 
+ with tab1:
+     st.header("Junk Data")
+ 
+ 
+     st.markdown('''
+ Large-scale datasets often contain an uneven distribution of text representation, which includes
+ a significant amount of nonsensical and boilerplate text, such as HTML tags.
+ 
+ The presence of such "noise" or irrelevant content in the dataset is detrimental to the
+ training of predictive models, specifically those that operate by predicting the next token based on all previous ones.
+ Therefore, it's crucial to clean the dataset and remove these undesired elements prior to the training phase.
+ 
+ The Python code below calculates a measure of "impurity" in text documents and then computes
+ the proportion of documents that exceed a certain impurity threshold. It defines a compiled regular expression that matches
+ any of the following suspicious characters: `&, #, <, >, {, }, [, ]`.
+ ''')
+ 
+ 
+     metrics, code = st.tabs(['Metrics', 'Code'])
+ 
+     with metrics:
+ 
+         with st.spinner('Calculating impurity ratio...'):
+             df = datasets['train']
+ 
+             import re
+             RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')
+ 
+             def impurity(text, min_len=10):
+                 """returns the share of suspicious characters in a text"""
+                 if text is None or len(text) < min_len:
+                     return 0
+                 else:
+                     return len(RE_SUSPICIOUS.findall(text))/len(text)
+ 
+             df['impurity'] = df['text'].apply(impurity, min_len=10)
+             total_num_docs = len(df)
+             impurity_num_docs = len(df[df['impurity'] > 0.01])
+             impurity_ratio = impurity_num_docs / total_num_docs
+ 
+         col1, col2, col3 = st.columns(3)
+         col1.metric(label="Junk Doc Count", value="%d" % impurity_num_docs)
+         col2.metric(label="Total Doc Count", value="%d" % total_num_docs)
+         col3.metric(label="Junk Doc Ratio", value="%.2f%%" % (impurity_ratio * 100))
+ 
+         st.dataframe(df[['text', 'impurity']].sort_values(by='impurity', ascending=False)[:20])
+     with code:
+         st.code(
+             '''
+ import re
+ 
+ RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')
+ 
+ def impurity(text, min_len=10):
+     """returns the share of suspicious characters in a text"""
+     if text is None or len(text) < min_len:
+         return 0
+     else:
+         return len(RE_SUSPICIOUS.findall(text))/len(text)
+ 
+ 
+ df['impurity'] = df['text'].apply(impurity, min_len=10)
+ total_num_docs = len(df)
+ impurity_num_docs = len(df[df['impurity'] > 0.01])
+ impurity_ratio = impurity_num_docs / total_num_docs
+             '''
+         )
+ 
+ 
+ with tab2:
+     st.header('Contamination')
+ 
+     st.markdown('''
+ Typically, ensuring the segregation of training and testing data is rather straightforward in machine learning.
+ However, things become complicated in the context of large language models
+ where both the training and benchmarking datasets are collected from the internet.
+ 
+ For instance, the performance evaluation of a large language model using benchmark data
+ (like question-answer pairs) can be significantly affected if the benchmark data also features
+ in the model's training set. The procedure of eliminating instances from the training datasets that intersect with
+ the existing benchmarking datasets is called "decontamination".
+ 
+ 
+ The Python code below quantifies the contamination problem in the datasets,
+ i.e., the proportion of documents in the test set that also appear in the training set, using N-grams.
+ 
+ The approach here is from the GPT-3 paper. OpenAI defined a test document as contaminated
+ if any N-gram overlap existed with any training document.
+ (They used a range of N values between 8 and 13 depending on the dataset.)
+ When constructing the WebText dataset, OpenAI researchers decontaminated the data by
+ eliminating all Wikipedia content from the training set. This was necessary as Wikipedia
+ data was heavily used in their benchmark datasets.
+ ''')
+ 
+     metrics, code = st.tabs(['Metrics', 'Code'])
+     with metrics:
+ 
+         with st.spinner('Calculating contamination ratio...'):
+ 
+             train_dataset = datasets['train']
+             test_dataset = datasets['test']
+             from nltk import ngrams
+             def generate_ngrams(text, n=8):
+                 return set(ngrams(text.split(), n))
+ 
+             train_dataset['ngrams'] = train_dataset['text'].apply(generate_ngrams)
+             test_dataset['ngrams'] = test_dataset['text'].apply(generate_ngrams)
+ 
+             # Creating a set of n-grams in the train set
+             train_ngrams = set.union(*train_dataset['ngrams'])
+ 
+             # Creating a boolean mask marking documents in the test set that have appeared in the train set
+             common_docs = test_dataset['ngrams'].apply(lambda x: not x.isdisjoint(train_ngrams))
+             common_docs_count = common_docs.sum()
+ 
+             train_dataset_count = len(train_dataset)
+             test_dataset_count = len(test_dataset)
+             contaminate_ratio = common_docs_count / test_dataset_count
+ 
+         col1, col2, col3, col4 = st.columns(4)
+         col1.metric(label="Train Set Size", value="%d" % train_dataset_count)
+         col2.metric(label="Test Set Size", value="%d" % test_dataset_count)
+         col3.metric(label="Overlapped Docs", value="%d" % common_docs_count)
+         col4.metric(label="Contaminated Ratio", value="%.2f%%" % (contaminate_ratio * 100))
+     with code:
+         st.code(
+             '''
+ from nltk import ngrams
+ def generate_ngrams(text, n=8):
+     return set(ngrams(text.split(), n))
+ 
+ train_dataset['ngrams'] = train_dataset['text'].apply(generate_ngrams)
+ test_dataset['ngrams'] = test_dataset['text'].apply(generate_ngrams)
+ 
+ # Creating a set of n-grams in the train set
+ train_ngrams = set.union(*train_dataset['ngrams'])
+ 
+ # Creating a boolean mask marking documents in the test set that have appeared in the train set
+ common_docs = test_dataset['ngrams'].apply(lambda x: not x.isdisjoint(train_ngrams))
+ common_docs_count = common_docs.sum()
+ 
+ train_dataset_count = len(train_dataset)
+ test_dataset_count = len(test_dataset)
+ contaminate_ratio = common_docs_count / test_dataset_count
+             '''
+         )
+ 
+ with tab3:
+     st.header("Too-Short Documents")
+ 
+     st.markdown('''
+ The aim of language modeling is to master the generation of text based on preceding tokens.
+ In this scenario, eliminating extremely brief documents (text consisting of fewer than approximately
+ 100 tokens) from the corpus can help reduce noise by providing contiguous text with which to
+ model dependencies within the text.
+ 
+ 
+ Use the Hugging Face Transformers library to tokenize text and then calculate the proportion
+ of documents that are "too short" in a dataset. This example converts text into tokens that the BERT
+ model can understand. Choose a tokenizer that matches your model.
+ ''')
+     metrics, code = st.tabs(['Metrics', 'Code'])
+ 
+     with metrics:
+         with st.spinner('Calculating too-short ratio...'):
+             from transformers import BertTokenizer
+ 
+             tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ 
+             df = datasets['train']
+             # Create a new column with the number of tokens for each text
+             df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
+             total_num_docs = len(df)
+             too_short_docs = len(df[df['text_length'] < 100])
+             too_short_doc_ratio = too_short_docs / total_num_docs
+ 
+         col1, col2, col3 = st.columns(3)
+         col1.metric(label="Too-Short Doc Count", value="%d" % too_short_docs)
+         col2.metric(label="Total Doc Count", value="%d" % total_num_docs)
+         col3.metric(label="Too-Short Doc Ratio", value="%.2f%%" % (too_short_doc_ratio * 100))
+ 
+         # col1, _ = st.columns([2, 1])
+ 
+         # import seaborn as sns
+         # import matplotlib.pyplot as plt
+         # fig, ax = plt.subplots(figsize=(10, 5))
+         # ax.set_title('Distribution of text length (in tokens)')
+         # sns.histplot(data=df, x='text_length', ax=ax)
+         # plt.axvline(100, color='r', linestyle='--')
+         # col1.pyplot(fig)
+     with code:
+         st.code(
+             '''
+ from transformers import BertTokenizer
+ 
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ 
+ df = datasets['train']
+ # Create a new column with the number of tokens for each text
+ df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
+ total_num_docs = len(df)
+ too_short_docs = len(df[df['text_length'] < 100])
+ too_short_doc_ratio = too_short_docs / total_num_docs
+             '''
+         )
+ 
+ with tab4:
+     st.header('Toxic Content')
+     st.markdown('''
+ It is crucial in the training of language models to be vigilant and potentially apply tools
+ to exclude toxic content from the pre-training datasets. This practice helps to
+ prevent the models from demonstrating bias or generating detrimental content in subsequent applications.
+ 
+ One approach to address this issue is by scanning the text for **offensive words**.
+ For instance, the creators of the C4 dataset implemented such a filtering
+ mechanism using the open-sourced
+ [word list](https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en).
+ 
+ The following code utilizes the word list to quantify the "biased content ratio" in the dataset.
+ 
+ ''')
+ 
+     metrics, code = st.tabs(['Metrics', 'Code'])
+     with metrics:
+         with st.spinner('Calculating toxic ratio...'):
+             df = datasets['train']
+ 
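+             # Note: the banned-word list file is expected to sit next to app.py; it is not part
+             # of this commit. One option (an assumption, not shown in the original) is to save the
+             # LDNOOBW "en" list from the repository linked above under this file name beforehand.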
+             with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
+                 lines = f.readlines()
+ 
+             banned_words = [line.rstrip('\n') for line in lines]
+             df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
+             df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
+             total_num_docs = len(df)
+             biased_num_docs = df['matches'].sum()
+             biased_content_ratio = biased_num_docs / total_num_docs
+         col1, col2, col3 = st.columns(3)
+ 
+         col1.metric(label="Total Doc Count", value="%d" % total_num_docs)
+         col2.metric(label="Biased Doc Count", value="%d" % biased_num_docs)
+         col3.metric(label="Biased Ratio", value="%.2f%%" % (biased_content_ratio * 100))
+         st.dataframe(df[df['matches']][['text', 'banned_words_in_text']][:20])
+     with code:
+         st.code(
+             '''
+ with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
+     lines = f.readlines()
+ 
+ banned_words = [line.rstrip('\\n') for line in lines]
+ df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
+ total_num_docs = len(df)
+ df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
+ biased_num_docs = df['matches'].sum()
+ biased_content_ratio = biased_num_docs / total_num_docs
+             '''
+         )
+ 
+ 
+ 
+ with tab5:
+     st.header("Duplication")
+ 
+     st.markdown(
+         '''
+ When datasets are created by scraping raw text from the Internet, this will often result
+ in the same sequences being repeated multiple times. [This paper](https://arxiv.org/abs/2107.06499) mentions a single 50-word sequence that is
+ repeated in the C4 dataset 60,000 times.
+ 
+ Deduplication helps prevent models from outputting verbatim training data when
+ there are many duplicates, and makes models less vulnerable to privacy attacks.
+ Deduplication can also improve model training efficiency and prevent benchmark contamination.
+ 
+ ### Tools & Tutorials
+ 
+ The [GPT-3](https://arxiv.org/abs/2005.14165) paper mentions they fuzzily deduplicated documents
+ within each dataset using Spark’s MinHashLSH implementation with 10 hashes.
+ 
+ [deduplicate-text-datasets](https://github.com/google-research/deduplicate-text-datasets)
+ is an ExactSubstr deduplication implementation (written in Rust) along with the scripts to
+ perform ExactSubstr deduplication and inspect the results (written in Python).
+ 
+ [datasketch](https://github.com/ekzhu/datasketch) gives you probabilistic data structures that
+ can process and search very large amounts of data quickly, with little loss of accuracy.
+ 
+ [This article](https://huggingface.co/blog/dedup) provides a MinHash walkthrough to demonstrate
+ how to implement parallel deduplication.
+ 
+ The following code uses the [datasketch](https://github.com/ekzhu/datasketch) library and LSH (Locality Sensitive Hashing)
+ to deduplicate the dataset. For each text in the DataFrame, it creates a query MinHash object
+ and performs a query on the LSH index to find similar documents.
+ 
+ It is worth mentioning that the deduplication process usually requires a lot of computational resources
+ (CPU and RAM) due to the size of web-crawl datasets, and it is therefore recommended to run such
+ computations in a distributed setting.
+         '''
+     )
+ 
+ 
+     metrics, code = st.tabs(['Metrics', 'Code'])
+     with metrics:
+         with st.spinner('Calculating duplication ratio...'):
+             df = datasets['train']
+ 
+             from datasketch import MinHashLSH, MinHash
+ 
+             lsh = MinHashLSH(threshold=0.85, num_perm=128)
+ 
+             # Index every document by its MinHash signature
+             for i, text in enumerate(df['text']):
+                 minhash = MinHash(num_perm=128)
+                 for word in text.split():
+                     minhash.update(word.encode('utf-8'))
+                 lsh.insert(str(i), minhash)
+ 
+             unique_documents = set()
+ 
+             # Query the index and keep one representative id per near-duplicate cluster
+             for i, text in enumerate(df['text']):
+                 query_minhash = MinHash(num_perm=128)
+                 for word in text.split():
+                     query_minhash.update(word.encode('utf-8'))
+                 results = lsh.query(query_minhash)
+                 unique_documents.add(results[0])
+ 
+             total_unique_documents = len(unique_documents)
+             total_documents = len(df)
+             duplication_ratio = (total_documents - total_unique_documents) / total_documents
+ 
+         col1, col2, col3 = st.columns(3)
+         col1.metric(label="Unique Doc Count", value="%d" % total_unique_documents)
+         col2.metric(label="Total Documents", value="%d" % total_documents)
+         col3.metric(label="Duplication Ratio", value="%.2f%%" % (duplication_ratio * 100))
+     with code:
+         st.code(
+             '''
+ from datasketch import MinHashLSH, MinHash
+ 
+ lsh = MinHashLSH(threshold=0.85, num_perm=128)
+ 
+ for i, text in enumerate(df['text']):
+     minhash = MinHash(num_perm=128)
+     for word in text.split():
+         minhash.update(word.encode('utf-8'))
+     lsh.insert(str(i), minhash)
+ 
+ unique_documents = set()
+ 
+ for i, text in enumerate(df['text']):
+     query_minhash = MinHash(num_perm=128)
+     for word in text.split():
+         query_minhash.update(word.encode('utf-8'))
+     results = lsh.query(query_minhash)
+     unique_documents.add(results[0])
+ 
+ total_unique_documents = len(unique_documents)
+ total_documents = len(df)
+ duplication_ratio = (total_documents - total_unique_documents) / total_documents
+             '''
+         )
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ numpy
+ pandas
+ xorbits
+ matplotlib
+ datasketch
+ nltk
+ transformers
+ streamlit