megalaa committed on
Commit
2fddad0
1 Parent(s): 7427730

Upload 13 files

Browse files
Files changed (3) hide show
  1. README.md +1 -5
  2. config.json +1 -1
  3. coptic_english_pipeline.py +151 -0
README.md CHANGED
@@ -1,7 +1,3 @@
1
  ---
2
- license: agpl-3.0
3
- language:
4
- - en
5
- - cop
6
  ---
7
-
 
1
  ---
2
+ license: mit
 
 
 
3
  ---
 
config.json CHANGED
@@ -23,7 +23,7 @@
23
  "pt": "megalaa/mul-cop-en-norm-group-greekified"
24
  }
25
  },
26
- "impl": "__main__.CopticEnglishPipeline",
27
  "pt": [
28
  "AutoModelForSeq2SeqLM"
29
  ],
 
23
  "pt": "megalaa/mul-cop-en-norm-group-greekified"
24
  }
25
  },
26
+ "impl": "coptic_english_pipeline.CopticEnglishPipeline",
27
  "pt": [
28
  "AutoModelForSeq2SeqLM"
29
  ],
coptic_english_pipeline.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict
2
+ import numpy as np
3
+ import torch
4
+ from transformers import Pipeline
5
+ from transformers.utils import ModelOutput
6
+ from transformers import pipeline
7
+ from transformers.pipelines import PIPELINE_REGISTRY
8
+ from transformers import AutoModelForSeq2SeqLM
9
+ from huggingface_hub import Repository
10
+
11
# Single-character dialect markers that `preprocess` prepends to the source text.
SAHIDIC_TAG = "з"
BOHAIRIC_TAG = "б"

from transformers import GenerationConfig

# Generation settings shared by every call to the pipeline's `_forward`.
GENERATION_CONFIG = GenerationConfig(
    # Output length limits.
    # NOTE(review): both `max_length` and `max_new_tokens` are set; the
    # transformers docs say `max_new_tokens` overrides `max_length` — confirm
    # that `max_length=20` is intentional.
    max_length=20,
    min_length=0,
    max_new_tokens=128,
    min_new_tokens=1,
    # Beam search settings.
    num_beams=5,
    num_beam_groups=1,
    diversity_penalty=0.0,
    early_stopping=True,
    # Sampling settings.
    do_sample=True,
    temperature=1.0,
    top_k=50,
    top_p=0.95,
    # `_forward` reads `outputs.sequences` and `outputs.scores`, so generate()
    # must return a dict-style output that includes the scores.
    return_dict_in_generate=True,
    output_scores=True,
)
32
+
33
+
34
class CopticEnglishPipeline(Pipeline):
    """Translate Coptic text into English with a seq2seq model.

    Supported keyword arguments:
        from_bohairic: if true, the input is prefixed with ``BOHAIRIC_TAG``;
            otherwise it is prefixed with ``SAHIDIC_TAG``.
        output_confidence: if true, the result also carries a ``confidence``
            value computed from the generation scores.
    """

    def _sanitize_parameters(self, **kwargs):
        """Split keyword arguments into preprocess/forward/postprocess dicts.

        A flag is forwarded whenever the caller supplied it, including when it
        is falsy. Dropping an explicit ``False`` would make it impossible to
        override, at call time, a value that was fixed when the pipeline was
        constructed.
        """
        preprocess_kwargs = {}
        if "from_bohairic" in kwargs:
            preprocess_kwargs["from_bohairic"] = bool(kwargs["from_bohairic"])

        forward_kwargs = {}
        if "output_confidence" in kwargs:
            forward_kwargs["output_confidence"] = bool(kwargs["output_confidence"])

        # No postprocess parameters are supported.
        return preprocess_kwargs, forward_kwargs, {}

    def preprocess(self, text, from_bohairic=False):
        """Normalise ``text``, prepend the dialect tag and tokenize it.

        Returns the token ids produced by ``self.tokenizer.encode`` with
        ``return_tensors="pt"``.
        """
        text = greekify(text.lower())
        tag = BOHAIRIC_TAG if from_bohairic else SAHIDIC_TAG
        return self.tokenizer.encode(f"{tag} {text}", return_tensors="pt")

    def _forward(self, input_tensors, output_confidence=False) -> tuple:
        """Generate a translation and, optionally, a confidence score.

        Returns:
            A ``(translated_text, confidence)`` tuple; ``confidence`` is
            ``None`` unless ``output_confidence`` is true.
        """
        outputs = self.model.generate(
            # Inputs longer than the tokenizer's maximum length are cut off.
            input_tensors[:, : self.tokenizer.model_max_length],
            generation_config=GENERATION_CONFIG,
        )

        # Only the first returned sequence is decoded.
        translated_text = self.tokenizer.decode(
            outputs.sequences[0], skip_special_tokens=True
        )

        if not output_confidence:
            return translated_text, None

        # Highest softmax probability found in each generation step's scores.
        # NOTE(review): `.max()` is taken over the whole score tensor of a
        # step, not over the token that was actually emitted — confirm this is
        # the intended confidence heuristic when beam search is active.
        confidences = [
            torch.softmax(score, dim=-1).max().item() for score in outputs.scores
        ]
        # Scale the prediction probability by the number of words in the
        # sentence; clamp to 1 so an empty translation cannot divide by zero.
        num_words = max(len(translated_text.split()), 1)
        scaled_probability = np.exp(sum(np.log(confidences)) / num_words)
        return translated_text, scaled_probability

    def postprocess(self, outputs):
        """Turn the ``(text, confidence)`` tuple from ``_forward`` into a dict.

        The ``"confidence"`` key is present only when a confidence was computed.
        """
        text, confidence = outputs
        result = {"translation": text}
        if confidence is not None:
            result["confidence"] = confidence
        return result
87
+
88
+
89
# Lower-case Coptic letter -> replacement character used by `greekify`.
COPTIC_TO_GREEK = {
    # Letters replaced by Greek characters.
    "ⲁ": "α", "ⲃ": "β", "ⲅ": "γ", "ⲇ": "δ", "ⲉ": "ε",
    "ⲋ": "ϛ", "ⲍ": "ζ", "ⲏ": "η", "ⲑ": "θ", "ⲓ": "ι",
    "ⲕ": "κ", "ⲗ": "λ", "ⲙ": "μ", "ⲛ": "ν", "ⲝ": "ξ",
    "ⲟ": "ο", "ⲡ": "π", "ⲣ": "ρ", "ⲥ": "σ", "ⲧ": "τ",
    "ⲩ": "υ", "ⲫ": "φ", "ⲭ": "χ", "ⲯ": "ψ", "ⲱ": "ω",
    # Letters replaced by Latin characters.
    "ϣ": "s", "ϥ": "f", "ϧ": "k", "ϩ": "h",
    "ϫ": "j", "ϭ": "c", "ϯ": "t",
}


def greekify(coptic_text):
    """Return ``coptic_text`` lower-cased with Coptic letters transliterated.

    Each character is lower-cased on its own and then looked up in
    ``COPTIC_TO_GREEK``; characters without an entry are kept in their
    lower-cased form.
    """
    lowered = (ch.lower() for ch in coptic_text)
    return "".join(COPTIC_TO_GREEK.get(ch, ch) for ch in lowered)
131
+
132
+
133
if __name__ == "__main__":
    # Run as a script, the class defined in this file belongs to `__main__`,
    # and the saved config would reference "__main__.CopticEnglishPipeline",
    # which cannot be imported by anyone loading the repository. Re-importing
    # the class under its real module name makes `save_pretrained` record
    # "coptic_english_pipeline.CopticEnglishPipeline" instead.
    from coptic_english_pipeline import CopticEnglishPipeline as ImportablePipeline

    PIPELINE_REGISTRY.register_pipeline(
        "coptic-english-translation",
        pipeline_class=ImportablePipeline,
        pt_model=AutoModelForSeq2SeqLM,
        default={"pt": "megalaa/mul-cop-en-norm-group-greekified"},
        type="text",
    )

    translator = pipeline(
        "coptic-english-translation", model="megalaa/mul-cop-en-norm-group-greekified"
    )
    # Smoke test: translate one sentence and print it with its confidence.
    print(translator("ⲛⲧⲟϥ ⲡⲉ ⲓⲏⲥⲟⲩⲥ ⲡⲉⲭⲣⲓⲥⲧⲟⲥ", from_bohairic=False, output_confidence=True))

    # Clone the model repository locally, then write the pipeline into it.
    repo = Repository(
        "cop-eng-translation",
        clone_from="megalaa/mul-cop-en-norm-group-greekified",
    )
    translator.save_pretrained("cop-eng-translation")