DD0101 committed
Commit df81629 (0 parents)

Duplicate from DD0101/Disfluency-base
.gitattributes ADDED
@@ -0,0 +1,37 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ jdk-8u361-linux-aarch64.rpm filter=lfs diff=lfs merge=lfs -text
+ VnCoreNLP-1.2.jar filter=lfs diff=lfs merge=lfs -text
+ models/postagger/vi-tagger filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Disfluency Base
+ emoji: 😻
+ colorFrom: gray
+ colorTo: pink
+ sdk: gradio
+ sdk_version: 3.23.0
+ app_file: app.py
+ pinned: false
+ duplicated_from: DD0101/Disfluency-base
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
VnCoreNLP-1.2.jar ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e2811cdbc2ddfc71d04be5dc36e185c88dcd1ad4d5d69e4ff2e1369dccf7793
+ size 27412703
VnCoreNLP-master.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8d8e31f5b5293c18691ea0d23eb64e504838222d71f020a84496fa8b5c64ec7f
+ size 168648997
app.py ADDED
@@ -0,0 +1,88 @@
+ import os
+
+ import transformers
+ from transformers import pipeline
+ from transformers.pipelines.token_classification import TokenClassificationPipeline
+ import py_vncorenlp
+
+ os.system('pwd')
+ os.system('sudo update-alternatives --config java')
+
+ # Download the VnCoreNLP models and load the word-segmentation annotator.
+ os.mkdir('/home/user/app/vncorenlp')
+ py_vncorenlp.download_model(save_dir='/home/user/app/vncorenlp')
+ rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='/home/user/app/vncorenlp')
+
+
+ class MyPipeline(TokenClassificationPipeline):
+     def preprocess(self, sentence, offset_mapping=None):
+         truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
+
+         model_inputs = self.tokenizer(
+             sentence,
+             return_tensors=self.framework,
+             truncation=truncation,
+             return_special_tokens_mask=True,
+             return_offsets_mapping=self.tokenizer.is_fast,
+         )
+
+         # The slow BPE tokenizer does not return offset mappings, so build them by hand:
+         # a piece ending in '@@' continues the current word, while any other piece is
+         # followed by a space in the word-segmented input.
+         length = len(model_inputs['input_ids'][0]) - 2  # exclude <s> and </s>
+         tokens = self.tokenizer.tokenize(sentence)
+         seek = 0
+         offset_mapping_list = [[(0, 0)]]  # (0, 0) for the leading special token
+         for i in range(length):
+             if tokens[i][-2:] == '@@':
+                 offset_mapping_list[0].append((seek, seek + len(tokens[i]) - 2))
+                 seek += len(tokens[i]) - 2
+             else:
+                 offset_mapping_list[0].append((seek, seek + len(tokens[i])))
+                 seek += len(tokens[i]) + 1
+         offset_mapping_list[0].append((0, 0))  # (0, 0) for the trailing special token
+
+         # if offset_mapping:
+         #     model_inputs["offset_mapping"] = offset_mapping
+
+         model_inputs['offset_mapping'] = offset_mapping_list
+         model_inputs["sentence"] = sentence
+
+         return model_inputs
+
+
+ model_checkpoint = "DD0101/disfluency-base"
+
+ my_classifier = pipeline(
+     "token-classification", model=model_checkpoint, aggregation_strategy="simple", pipeline_class=MyPipeline)
+
+
+ import gradio as gr
+
+
+ def ner(text):
+     # Word-segment the input with VnCoreNLP before running the classifier.
+     text = " ".join(rdrsegmenter.word_segment(text))
+
+     output = my_classifier(text)
+     for entity in output:
+         # gr.HighlightedText expects the key 'entity' rather than 'entity_group'.
+         entity['entity'] = entity.pop('entity_group')
+
+     return {'text': text, 'entities': output}, text
+
+
+ examples = ['Tôi cần thuê à tôi muốn bay một chuyến khứ hồi từ Đà Nẵng đến Đà Lạt',
+             'Giá vé một chiều à không khứ hồi từ Đà Nẵng đến Vinh dưới 2 triệu đồng giá vé khứ hồi từ Quy Nhơn đến Vinh dưới 3 triệu đồng giá vé khứ hồi từ Buôn Ma Thuột đến Quy Nhơn à đến Vinh dưới 4 triệu rưỡi',
+             'Cho tôi biết các chuyến bay đến Đà Nẵng vào ngày 12 mà không ngày 14 tháng sáu',
+             'Những chuyến bay nào khởi hành từ Thành phố Hồ Chí Minh bay đến Frankfurt mà nối chuyến ở Singapore và hạ cánh trước 10 giờ ý tôi là 9 giờ tối'
+             ]
+
+ demo = gr.Interface(ner,
+                     gr.Textbox(label='Text', placeholder="Enter sentence here..."),
+                     outputs=[gr.HighlightedText(label='Highlighted Output'), gr.Textbox(label='Word-Segmentation Preprocessing')],
+                     examples=examples,
+                     title="Disfluency Detection",
+                     description="An easy-to-use Gradio demo of an NER system that identifies disfluency entities in Vietnamese utterances",
+                     theme=gr.themes.Soft())
+
+ demo.launch()
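Note on the `ner` function above: with `aggregation_strategy="simple"`, the token-classification pipeline returns entries keyed by `entity_group`, while `gr.HighlightedText` expects the key `entity` inside a `{'text': ..., 'entities': [...]}` dict, hence the renaming loop. A minimal sketch of that data flow outside the Space, assuming the same `DD0101/disfluency-base` checkpoint and skipping the VnCoreNLP segmentation step (the input sentence is a hypothetical, already-segmented example):

from transformers import pipeline

# Plain pipeline sketch; the Space itself uses MyPipeline to repair offset mappings
# for the slow BPE tokenizer.
classifier = pipeline("token-classification",
                      model="DD0101/disfluency-base",
                      aggregation_strategy="simple")

text = "Tôi cần thuê à tôi muốn bay một chuyến khứ hồi"  # assumed pre-segmented input
entities = classifier(text)

# Rename 'entity_group' -> 'entity' so gr.HighlightedText can consume the result.
for ent in entities:
    ent["entity"] = ent.pop("entity_group")

highlighted = {"text": text, "entities": entities}  # value for gr.HighlightedText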
jdk-8u361-linux-aarch64.rpm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:316f980932185e2cf38797783bc38f9cf095cb125d11cf37bda91e30aed4ecce
+ size 73402781
models/dep/vi-dep.xz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:266e4a3a55d5edd1607d5f036c2f95b70c0a6c80f58b57fd9962677a6ef331b7
+ size 16048864
models/ner/vi-500brownclusters.xz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d30f9cfdf0af193a69e185d1acda0306a9fbe1321f8a700f7c66557a90f92b8c
+ size 5599844
models/ner/vi-ner.xz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f04c5e752d7f99a6313b758fc2607a2c3906e58b1d60a37eb0192aead73d61f7
+ size 9956876
models/ner/vi-pretrainedembeddings.xz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:00d3d034f1b23a8bfe5168195741fde845808c212e6dfcd4c94bead1665eb0fc
+ size 57313672
models/postagger/vi-tagger ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a95608a5982db89c11353b451154ec396eccc0ff1f5b22874935ecdf4e0ace01
+ size 29709468
models/wordsegmenter/vi-vocab ADDED
Binary file (527 kB).
models/wordsegmenter/wordsegmenter.rdr ADDED
The diff for this file is too large to render.
packages.txt ADDED
@@ -0,0 +1 @@
+ default-jdk
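Note: `default-jdk` is listed because VnCoreNLP runs on the JVM, so `py_vncorenlp` needs a Java runtime available when the Space starts. A small sketch of an optional startup check (a hypothetical helper, not part of this Space):

import shutil

# py_vncorenlp shells out to Java; fail early with a clear message if no JVM is on PATH.
if shutil.which("java") is None:
    raise RuntimeError("No Java runtime found; install a JDK (e.g. the default-jdk system package).")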
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ transformers
+ torch
+ py_vncorenlp