import torch
from torch import nn
import re
import numpy as np
import pandas as pd
from collections import OrderedDict
import requests
from bs4 import BeautifulSoup


# Run on the first GPU when one is available, otherwise on the CPU.
# Note: a torch.device never compares equal to the string 'cuda:0',
# so the check is done on device.type.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    torch.cuda.set_device(device)
print(device)

# Fetch a web page and return its visible text; handy for building a corpus
# from an online article instead of the hard-coded excerpt used below.
def extract_text_from_link(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    text = soup.get_text()
    return text

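# Optional (illustrative): the corpus could also be scraped with the helper above, e.g.
#   doc = extract_text_from_link('https://en.wikipedia.org/wiki/Deep_learning')
# The hard-coded excerpt below keeps the script self-contained and offline.
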
doc = """The word "deep" in "deep learning" refers to the number of layers through which the data is transformed. More precisely, |
|
deep learning systems have a substantial credit assignment path (CAP) depth. The CAP is the chain of transformations from input to |
|
output. CAPs describe potentially causal connections between input and output. For a feedforward neural network, the depth of the |
|
CAPs is that of the network and is the number of hidden layers plus one (as the output layer is also parameterized). For recurrent |
|
neural networks, in which a signal may propagate through a layer more than once, the CAP depth is potentially unlimited.[13] No |
|
universally agreed-upon threshold of depth divides shallow learning from deep learning, but most researchers agree that deep |
|
learning involves CAP depth higher than 2. CAP of depth 2 has been shown to be a universal approximator in the sense that it |
|
can emulate any function.[14] Beyond that, more layers do not add to the function approximator ability of the network. Deep |
|
models (CAP > 2) are able to extract better features than shallow models and hence, extra layers help in learning the features |
|
effectively.""" |
|
|
|
|
|
class Text2Words:
    def __init__(self, document):
        # Vocabulary of unique words and the character set built from them.
        self.text_all = re.findall(r'\b[A-Za-z]+\b', document)
        self.text = list(set(self.text_all))
        self.chars_all = ''.join(self.text)
        self.chars = self.unique_chars(self.chars_all)
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {char: ind for ind, char in self.int2char.items()}
        self.maxlen = len(max(self.text, key=len))
        self.update_text()
        # Character-level input/target sequences, their integer-encoded and
        # one-hot-encoded forms.
        self.input_seq_char, self.target_seq_char = self.get_seq_char(self.text)
        self.input_seq_index, self.target_seq_index = self.get_seq(
            self.char2int, self.input_seq_char, self.target_seq_char, len(self.text))
        self.dict_size = len(self.char2int)
        self.seq_len = self.maxlen - 1
        self.batch_size = len(self.text)
        self.input_seq = self.one_hot_encode(self.input_seq_index, self.dict_size, self.seq_len, self.batch_size)

    def one_hot_encode(self, sequence, dict_size, seq_len, batch_size):
        # One dict_size-long indicator vector per character position.
        features = np.zeros((batch_size, seq_len, dict_size), dtype=np.float32)
        for i in range(batch_size):
            for u in range(seq_len):
                features[i, u, sequence[i][u]] = 1
        return features

    def get_seq(self, char2int, input_seq_char, target_seq_char, n):
        x = []
        y = []
        for i in range(n):
            x.append([char2int[character] for character in input_seq_char[i]])
            y.append([char2int[character] for character in target_seq_char[i]])
        return x, y

    def get_seq_char(self, text):
        # Next-character prediction: inputs drop the last character of each word,
        # targets drop the first.
        input_seq = []
        target_seq = []
        for i in range(len(text)):
            input_seq.append(text[i][:-1])
            target_seq.append(text[i][1:])
        return input_seq, target_seq

    def unique_chars(self, chars_all):
        chars = []
        for letter in chars_all:
            if letter not in chars:
                chars.append(letter)
        if ' ' not in chars:
            chars.append(' ')
        return sorted(chars)

    def update_text(self):
        # Pad every word with spaces so all sequences share the same length.
        for i in range(len(self.text)):
            while len(self.text[i]) < self.maxlen:
                self.text[i] += ' '

    def description(self):
        text = {}
        for word in self.text:
            char = word[0]
            if char not in text:
                text[char] = []
            text[char].append(word.strip())
        for k, v in sorted(text.items()):
            print(f'{k} : {sorted(v)}')

    def length_analysis(self):
        # Group the vocabulary by word length.
        text = {}
        words = set(self.text_all)
        for word in words:
            n = len(word)
            if n not in text:
                text[n] = []
            text[n].append(word.strip())
        for k, v in sorted(text.items()):
            print(f'{k} : count = {len(v)} list = {sorted(v)}')
        return None


def create_object(doc):
    return Text2Words(doc)


def get_inputs(obj):
    # Move the one-hot inputs and integer targets onto the selected device.
    input_seq = torch.tensor(obj.input_seq, device=device)
    target_seq_index = torch.tensor(obj.target_seq_index, device=device)
    return input_seq, target_seq_index

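# Illustrative usage of the helpers above:
#   obj = create_object(doc)
#   input_seq, target_seq_index = get_inputs(obj)
#   input_seq.shape         -> (batch_size, maxlen - 1, dict_size)  one-hot inputs
#   target_seq_index.shape  -> (batch_size, maxlen - 1)             integer targets
#   obj.description()       -> prints the vocabulary grouped by first letter

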
class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(Model, self).__init__()

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # A single RNN layer followed by a fully connected layer that maps
        # each hidden state to character scores.
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)
        out, hidden = self.rnn(x, hidden)
        # Flatten the batch and time dimensions so the linear layer scores
        # every character position: (batch * seq_len, hidden_dim).
        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        torch.manual_seed(42)
        hidden = torch.zeros((self.n_layers, batch_size, self.hidden_dim), device=device)
        return hidden


def create_model(obj):
    model = Model(input_size=obj.dict_size, output_size=obj.dict_size, hidden_dim=2 * obj.dict_size, n_layers=1)
    model.to(device)
    lr = 0.01
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    return model, criterion, optimizer


def predict(model, character):
    # Encode the given character sequence with the global Text2Words object `obj`,
    # run it through the model and return the most probable next character.
    character = np.array([[obj.char2int[c] for c in character]])
    character = obj.one_hot_encode(character, obj.dict_size, character.shape[1], 1)
    character = torch.tensor(character, device=device)

    out, hidden = model(character)

    prob = nn.functional.softmax(out[-1], dim=0).data
    char_ind = torch.max(prob, dim=0)[1].item()

    return obj.int2char[char_ind], hidden


def sample(model, out_len, start='h'):
    # Complete a word from the given prefix one character at a time, stopping
    # at a predicted space or once out_len characters have been generated.
    model.eval()
    chars = [ch for ch in start]
    char = chars[-1]
    chars = chars[:-1]

    while char != ' ' and len(chars) < out_len:
        chars.append(char)
        char, h = predict(model, chars)
    return ''.join(chars)


def load_checkpoint(filepath):
    # The checkpoint is expected to hold the pickled model object under 'model'
    # and its weights under 'state_dict'.
    checkpoint = torch.load(filepath, map_location=device)

    model = checkpoint['model']
    model.load_state_dict(checkpoint['state_dict'])

    model.eval()
    return model

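# The script loads 'checkpoint.pth' below but does not show how that file is
# produced. A minimal training-and-saving sketch (epoch count and logging
# interval are illustrative choices): it fits the RNN on the one-hot sequences
# from get_inputs and saves the two keys that load_checkpoint expects.
def train_and_save(obj, n_epochs=200, filepath='checkpoint.pth'):
    model, criterion, optimizer = create_model(obj)
    input_seq, target_seq_index = get_inputs(obj)
    # CrossEntropyLoss expects (N, C) scores and (N,) integer targets, matching
    # the flattened output of Model.forward.
    target = target_seq_index.view(-1).long()
    model.train()
    for epoch in range(1, n_epochs + 1):
        optimizer.zero_grad()
        output, hidden = model(input_seq)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if epoch % 50 == 0:
            print(f'Epoch {epoch}/{n_epochs}  loss: {loss.item():.4f}')
    torch.save({'model': model, 'state_dict': model.state_dict()}, filepath)
    return model
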

obj = create_object(doc)
model = load_checkpoint('checkpoint.pth')
print(sample(model, obj.maxlen, 'ap'))