Initial Upload
- Dockerfile +14 -0
- dumps/model.pt +3 -0
- dumps/params.json +1 -0
- dumps/vocab.pt +3 -0
- main.py +31 -0
- requirements.txt +6 -0
- utils/model.py +106 -0
- utils/preprocess.py +19 -0
Dockerfile
ADDED
@@ -0,0 +1,14 @@
+# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+
+FROM python:3.9
+
+WORKDIR /code
+
+COPY ./requirements.txt /code/requirements.txt
+
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+COPY . .
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
dumps/model.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d5d911575429b382c886c2b9764ba3226128fe1d1368ec77bbcca6925014db1
+size 4465302
dumps/params.json
ADDED
@@ -0,0 +1 @@
+[8000, 128, 0, 64, 1, 2, [64, 64], 0.4]
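utils/model.py's get_model later unpacks this list with ClassifierAttention(*params), so the values appear to map positionally onto the constructor arguments. A sketch of that mapping (not part of the commit; argument names taken from the constructor signature in utils/model.py):

params = [8000, 128, 0, 64, 1, 2, [64, 64], 0.4]
# vocab_size         = 8000
# emb_dim            = 128
# padding_idx        = 0
# hidden_size        = 64
# n_layers           = 1
# attention_heads    = 2
# hidden_layer_units = [64, 64]
# dropout            = 0.4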
dumps/vocab.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75ea0d5a228a81d16d25cda6f207ce443f1469431497cdcf914384d2e642907b
+size 131115
main.py
ADDED
@@ -0,0 +1,31 @@
+# ======= PREPARING THE PIPELINE =======
+
+import torch
+import os
+from utils.preprocess import get_preprocess
+from utils.model import get_model
+
+dump_path = "./dumps/"
+vocab_path = os.path.join(dump_path, "vocab.pt")
+model_path = os.path.join(dump_path, "model.pt")
+params_path = os.path.join(dump_path, "params.json")
+
+preprocess = get_preprocess(vocab_path)
+model = get_model(model_path, params_path)
+
+def predict(text):
+    x = preprocess(text)
+    x = torch.tensor([x])
+    y = model(x)
+    y = y.detach().numpy().tolist()[0]
+    return y
+
+# ======= CREATING APP =======
+
+from fastapi import FastAPI
+
+app = FastAPI()
+
+@app.get("/")
+def main(text: str):
+    return predict(text)
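With the container running (the Dockerfile serves this app via uvicorn on port 7860), the single GET route can be exercised with a text query parameter. A minimal client sketch (not part of the commit), assuming a local deployment at http://localhost:7860 and the third-party requests package, which is not listed in requirements.txt:

import requests

# FastAPI maps the ?text=... query parameter to the `text` argument of main().
resp = requests.get("http://localhost:7860/", params={"text": "an example sentence to score"})
print(resp.json())  # a single float in [0, 1], the sigmoid output returned by predict()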
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+fastapi
+uvicorn
+torch
+# json is part of the Python standard library, not a pip package
+torchtext
+# re is part of the Python standard library, not a pip package
utils/model.py
ADDED
@@ -0,0 +1,106 @@
+import torch
+import torch.nn as nn
+import json
+
+def attention(Q, K, V):
+    d = K.shape[-1]
+    QK = Q @ K.transpose(-2, -1)
+    QK_d = QK / (d ** 0.5)
+    weights = torch.softmax(QK_d, axis=-1)
+    outputs = weights @ V
+    return outputs
+
+class Attention(torch.nn.Module):
+    def __init__(self, emb_dim, n_heads):
+        super(Attention, self).__init__()
+
+        self.emb_dim = emb_dim
+        self.n_heads = n_heads
+
+    def forward(self, X):
+
+        batch_size, seq_len, emb_dim = X.size() # (batch_size, seq_len, emb_dim)
+        n_heads = self.n_heads
+        emb_dim_per_head = emb_dim // n_heads
+
+        assert emb_dim == self.emb_dim
+        assert emb_dim_per_head * n_heads == emb_dim
+
+        X = X.transpose(1, 2)
+        output = attention(X, X, X) # (batch_size, n_heads, seq_len, emb_dim_per_head)
+        output = output.transpose(1, 2) # (batch_size, seq_len, n_heads, emb_dim_per_head)
+        output = output.contiguous().view(batch_size, seq_len, emb_dim) # (batch_size, seq_len, emb_dim)
+
+        return output
+
+class ClassifierAttention(nn.Module):
+    def __init__(self, vocab_size, emb_dim, padding_idx, hidden_size, n_layers, attention_heads, hidden_layer_units, dropout):
+        super(ClassifierAttention, self).__init__()
+
+        self.embedding = nn.Embedding(
+            num_embeddings = vocab_size,
+            embedding_dim = emb_dim,
+            padding_idx = padding_idx
+        )
+
+        self.rnn_1 = nn.LSTM(
+            emb_dim,
+            hidden_size,
+            n_layers,
+            bidirectional = False,
+            batch_first = True,
+        )
+
+        self.attention = Attention(hidden_size, attention_heads)
+
+        self.rnn_2 = nn.LSTM(
+            hidden_size,
+            hidden_size,
+            n_layers,
+            bidirectional = False,
+            batch_first = True,
+        )
+
+        self.dropout = nn.Dropout(dropout)
+
+        hidden_layer_units = [hidden_size, *hidden_layer_units]
+        self.hidden_layers = nn.ModuleList([])
+        for in_unit, out_unit in zip(hidden_layer_units[:-1], hidden_layer_units[1:]):
+            self.hidden_layers.append(nn.Linear(in_unit, out_unit))
+            self.hidden_layers.append(nn.ReLU())
+            self.hidden_layers.append(self.dropout)
+        self.hidden_layers.append(nn.Linear(hidden_layer_units[-1], 1))
+
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, x):
+        # x: (batch_size, seq_len)
+
+        out = self.embedding(x) # (batch_size, seq_len, emb_dim)
+        out, (hidden_state, cell_state) = self.rnn_1(out)
+        out = self.attention(out) # (batch_size, seq_len, emb_dim)
+        out = self.dropout(out)
+        output, (hidden_state, cell_state) = self.rnn_2(out)
+        out = hidden_state[-1] # (batch_size, hidden_size)
+        out = self.dropout(out)
+        # (batch_size, seq_len, hidden_dim)
+        # (n_layers*n_direction, batch_size, hidden_size)
+        # (n_layers*n_direction, batch_size, hidden_size)
+
+        for layer in self.hidden_layers:
+            out = layer(out)
+
+        out = self.sigmoid(out) # (batch_size, 1)
+        out = out.squeeze(-1) # (batch_size)
+
+        return out
+
+def get_model(model_path, params_path):
+    with open(params_path, 'rb') as f:
+        params = json.load(f)
+
+    model = ClassifierAttention(*params)
+    model.load_state_dict(torch.load(model_path))
+    model.eval()
+
+    return model
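A quick shape check of the network on random token ids (not part of the commit), assuming the hyperparameters from dumps/params.json map positionally onto the constructor, as get_model does:

import torch
from utils.model import ClassifierAttention

# vocab_size=8000, emb_dim=128, padding_idx=0, hidden_size=64,
# n_layers=1, attention_heads=2, hidden_layer_units=[64, 64], dropout=0.4
model = ClassifierAttention(8000, 128, 0, 64, 1, 2, [64, 64], 0.4)
model.eval()

x = torch.randint(0, 8000, (2, 16))  # (batch_size=2, seq_len=16) of token ids
with torch.no_grad():
    y = model(x)
print(y.shape)  # torch.Size([2]) -- one sigmoid score per sequence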
utils/preprocess.py
ADDED
@@ -0,0 +1,19 @@
+import torch
+import torchtext
+import re
+
+def clean_text(text):
+    # Remove extra spaces
+    text = text.strip()
+    # Convert multiple spaces to single spaces
+    text = re.sub('\s+', ' ', text)
+    # Lowercase the text
+    text = text.lower()
+    # Remove punctuation marks
+    text = re.sub('[^\w\s]', '', text)
+    return text
+
+def get_preprocess(vocab_path):
+    tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
+    vocab = torch.load(vocab_path)
+    return lambda text: vocab(tokenizer(clean_text(text)))
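get_preprocess returns a callable that cleans, tokenizes, and numericalizes raw text with the saved torchtext vocab, producing the list of token ids that main.py wraps in torch.tensor([x]). A small usage sketch (not part of the commit; the printed ids depend on dumps/vocab.pt and are illustrative only):

from utils.preprocess import clean_text, get_preprocess

print(clean_text("  Hello,   WORLD!! "))  # -> "hello world"

preprocess = get_preprocess("./dumps/vocab.pt")
ids = preprocess("Hello, world!")
print(ids)  # e.g. [37, 214] -- ready for torch.tensor([ids])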