Spaces: Runtime error
khaled5321 committed • Commit f45a763
1 Parent(s): 155c20a
Add application file
Files changed:
- .gitignore +1 -0
- app.py +170 -0
- arabic_vocab.pth +3 -0
- english_vocab.pth +3 -0
- model.pt +3 -0
- requirements.txt +0 -0
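
The three binary artifacts (arabic_vocab.pth, english_vocab.pth, model.pt) are stored through Git LFS, and app.py loads them by relative path, so a clone that skipped LFS leaves only tiny pointer stubs behind. A minimal sketch for checking that the real files are present; the expected sizes come from the pointer files further down:

import os

# Real sizes per the LFS pointers: 197049, 93115 and 82332843 bytes.
# A file of only ~130 bytes is still an un-pulled LFS pointer stub.
for name in ("arabic_vocab.pth", "english_vocab.pth", "model.pt"):
    print(name, os.path.getsize(name), "bytes")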
.gitignore ADDED
@@ -0,0 +1 @@
+/venv
app.py ADDED
@@ -0,0 +1,170 @@
+import gradio as gr
+import random
+import re
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import unicodedata
+import nltk
+from nltk.tokenize.treebank import TreebankWordDetokenizer
+
+nltk.download('punkt')
+
+
+class Encoder(nn.Module):
+    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
+        super().__init__()
+        self.embedding = nn.Embedding(input_dim, emb_dim)
+        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
+        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, src):
+        embedded = self.dropout(self.embedding(src))
+        outputs, hidden = self.rnn(embedded)
+        hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
+
+        return outputs, hidden
+
+
+class Attention(nn.Module):
+    def __init__(self, enc_hid_dim, dec_hid_dim):
+        super().__init__()
+        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
+        self.v = nn.Linear(dec_hid_dim, 1, bias=False)
+
+    def forward(self, hidden, encoder_outputs):
+        batch_size = encoder_outputs.shape[1]
+        src_len = encoder_outputs.shape[0]
+
+        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
+        encoder_outputs = encoder_outputs.permute(1, 0, 2)
+
+        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
+        attention = self.v(energy).squeeze(2)
+
+        return F.softmax(attention, dim=1)
+
+
+class Decoder(nn.Module):
+    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
+        super().__init__()
+        self.output_dim = output_dim
+        self.attention = attention
+        self.embedding = nn.Embedding(output_dim, emb_dim)
+        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
+        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, input, hidden, encoder_outputs):
+        input = input.unsqueeze(0)
+        embedded = self.dropout(self.embedding(input))
+        a = self.attention(hidden, encoder_outputs)
+        a = a.unsqueeze(1)
+        encoder_outputs = encoder_outputs.permute(1, 0, 2)
+        weighted = torch.bmm(a, encoder_outputs)
+        weighted = weighted.permute(1, 0, 2)
+        rnn_input = torch.cat((embedded, weighted), dim=2)
+        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
+
+        assert (output == hidden).all()
+
+        embedded = embedded.squeeze(0)
+        output = output.squeeze(0)
+        weighted = weighted.squeeze(0)
+
+        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
+
+        return prediction, hidden.squeeze(0)
+
+class Seq2Seq(nn.Module):
+    def __init__(self, encoder, decoder, device):
+        super().__init__()
+
+        self.encoder = encoder
+        self.decoder = decoder
+        self.device = device
+
+    def forward(self, src, trg, teacher_forcing_ratio=0.5):
+        batch_size = trg.shape[1]
+        trg_len = trg.shape[0]
+        trg_vocab_size = self.decoder.output_dim
+        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
+
+        encoder_outputs, hidden = self.encoder(src)
+
+        input = trg[0, :]
+
+        for t in range(1, trg_len):
+            output, hidden = self.decoder(input, hidden, encoder_outputs)
+
+            outputs[t] = output
+
+            teacher_force = random.random() < teacher_forcing_ratio
+            top1 = output.argmax(1)
+            input = trg[t] if teacher_force else top1
+
+        return outputs
+
+
+def unicodeToAscii(s):
+    return ''.join(
+        c for c in unicodedata.normalize('NFD', s)
+        if unicodedata.category(c) != 'Mn'
+    )
+
+def tokenize_ar(text):
+    """
+    Tokenizes Arabic text from a string into a list of strings (tokens)
+    """
+    return [tok for tok in nltk.tokenize.wordpunct_tokenize(unicodeToAscii(text))]
+
+src_vocab = torch.load("arabic_vocab.pth")
+trg_vocab = torch.load("english_vocab.pth")
+
+INPUT_DIM = 9790
+OUTPUT_DIM = 5682
+ENC_EMB_DIM = 256
+DEC_EMB_DIM = 256
+ENC_HID_DIM = 512
+DEC_HID_DIM = 512
+ENC_DROPOUT = 0.5
+DEC_DROPOUT = 0.5
+
+attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
+enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
+dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)
+
+model = Seq2Seq(enc, dec, "cpu")
+model.load_state_dict(torch.load('model.pt', map_location=torch.device('cpu')))
+
+
+def infer(text, max_length=50):
+    text = tokenize_ar(text)
+    sequence = []
+    sequence.append(src_vocab['<sos>'])
+    sequence.extend([src_vocab[token] for token in text])
+    sequence.append(src_vocab['<eos>'])
+
+    sequence = torch.Tensor(sequence)
+    sequence = sequence[:, None].to(torch.int64)
+    target = torch.zeros(max_length, 1).to(torch.int64)
+
+    with torch.no_grad():
+        model.eval()
+        output = model(sequence, target, 0)
+        output_dim = output.shape[-1]
+        output = output[1:].view(-1, output_dim)
+
+    prediction = []
+    for i in output:
+        prediction.append(torch.argmax(i).item())
+
+    tokens = trg_vocab.lookup_tokens(prediction)
+    en = TreebankWordDetokenizer().detokenize(tokens).replace('<eos>', "")
+
+    return re.sub(r'[^\w\s]', '', en).strip()
+
+iface = gr.Interface(fn=infer, inputs="text", outputs="text")
+
+iface.launch()
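
The architecture above is a single-layer bidirectional GRU encoder feeding an additive-attention decoder. A quick way to see what Attention.forward returns is to push toy tensors through it; the Attention class comes from app.py above, while all the sizes here are made up for illustration:

import torch

enc_hid_dim, dec_hid_dim, src_len, batch_size = 8, 10, 5, 2  # toy sizes

attn = Attention(enc_hid_dim, dec_hid_dim)
hidden = torch.randn(batch_size, dec_hid_dim)                # decoder state
enc_out = torch.randn(src_len, batch_size, enc_hid_dim * 2)  # bidirectional outputs

weights = attn(hidden, enc_out)
print(weights.shape)       # torch.Size([2, 5]): one weight per source position
print(weights.sum(dim=1))  # every row sums to 1 because of the softmax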
arabic_vocab.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be6f7c887496c6d29ce95ce3a7a8946924164e59bbfc2d6172e147304d182525
+size 197049
english_vocab.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79987ed2b3823a5bfc0715b460e58427a98aea877fe2fb8e6d7eab896c620c2e
+size 93115
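
Both vocabularies are Git LFS objects rather than plain text. Once pulled, a quick consistency check against the hard-coded INPUT_DIM and OUTPUT_DIM in app.py might look like this (assuming, as the lookup_tokens call in infer suggests, they were saved as torchtext Vocab objects):

import torch

src_vocab = torch.load("arabic_vocab.pth")   # Arabic side
trg_vocab = torch.load("english_vocab.pth")  # English side

# Should print 9790 and 5682, matching INPUT_DIM / OUTPUT_DIM in app.py.
print(len(src_vocab), len(trg_vocab))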
model.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58170b33f60b6aae11efed74ae6f46f8de6c8909a1f17abbc3c3089415101144
+size 82332843
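
One quirk of infer in app.py: it reuses Seq2Seq.forward with a zero-filled dummy target and teacher_forcing_ratio=0, so decoding always runs the full max_length - 1 steps and <eos> is only stripped from the detokenized string afterwards. Below is a sketch of an explicit greedy loop that stops at <eos> instead; every name comes from app.py, the early stop is the only behavioural change, and it assumes '<sos>'/'<eos>' exist in trg_vocab as they do in src_vocab:

import torch

def greedy_decode(sequence, max_length=50):
    # sequence: LongTensor of shape [src_len, 1], built exactly as in infer().
    with torch.no_grad():
        model.eval()
        encoder_outputs, hidden = model.encoder(sequence)
        token = torch.tensor([trg_vocab['<sos>']])
        out_ids = []
        for _ in range(max_length):
            logits, hidden = model.decoder(token, hidden, encoder_outputs)
            token = logits.argmax(1)
            if token.item() == trg_vocab['<eos>']:
                break                      # stop at end-of-sentence
            out_ids.append(token.item())
    return trg_vocab.lookup_tokens(out_ids)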
requirements.txt ADDED
Binary file (2.6 kB)