# app.py (second app by claude)
import gradio as gr
import torch
from transformers import PreTrainedTokenizerFast
from pathlib import Path
import sys
# sys.path.append(str(Path('gpt_model_code.py').resolve()))
# from gpt_model_code import load_model_n_tokenizer, generate
# ==================================================================================
# Could not get the import from gpt_model_code.py to work, so the code is copied here.
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch
#
# This file collects all the relevant code that we covered thus far
# throughout Chapters 2-5.
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin
GPT_CONFIG_124M = {
"vocab_size": 50000, # Vocabulary size
"context_length": 1024, # Context length
"emb_dim": 768, # Embedding dimension
"n_heads": 12, # Number of attention heads
"n_layers": 12, # Number of layers
"drop_rate": 0.1, # Dropout rate
"qkv_bias": False # Query-key-value bias
}
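# This mirrors the GPT-2 "small" (~124M-parameter) layout, with the vocabulary
# sized for the custom 50k-token Nepali tokenizer instead of GPT-2's 50,257.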
#####################################
# Chapter 3
#####################################
class MultiHeadAttention(nn.Module):
def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
self.d_out = d_out
self.num_heads = num_heads
self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim
self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs
self.dropout = nn.Dropout(dropout)
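        # Causal mask: the strict upper triangle marks "future" positions, so each
        # token can attend only to itself and earlier tokens. Registering it as a
        # buffer moves it with the module across devices without making it a parameter.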
self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
def forward(self, x):
b, num_tokens, d_in = x.shape
keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
queries = self.W_query(x)
values = self.W_value(x)
# We implicitly split the matrix by adding a `num_heads` dimension
# Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
values = values.view(b, num_tokens, self.num_heads, self.head_dim)
queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
# Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
keys = keys.transpose(1, 2)
queries = queries.transpose(1, 2)
values = values.transpose(1, 2)
# Compute scaled dot-product attention (aka self-attention) with a causal mask
attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head
# Original mask truncated to the number of tokens and converted to boolean
mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
# Use the mask to fill attention scores
attn_scores.masked_fill_(mask_bool, -torch.inf)
attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
attn_weights = self.dropout(attn_weights)
# Shape: (b, num_tokens, num_heads, head_dim)
context_vec = (attn_weights @ values).transpose(1, 2)
# Combine heads, where self.d_out = self.num_heads * self.head_dim
context_vec = context_vec.reshape(b, num_tokens, self.d_out)
context_vec = self.out_proj(context_vec) # optional projection
return context_vec
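# Illustrative shape check (not part of the app):
#   mha = MultiHeadAttention(d_in=768, d_out=768, context_length=1024,
#                            dropout=0.1, num_heads=12)
#   mha(torch.randn(2, 8, 768)).shape  # -> torch.Size([2, 8, 768])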
#####################################
# Chapter 4
#####################################
class LayerNorm(nn.Module):
def __init__(self, emb_dim):
super().__init__()
self.eps = 1e-5
self.scale = nn.Parameter(torch.ones(emb_dim))
self.shift = nn.Parameter(torch.zeros(emb_dim))
def forward(self, x):
mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, keepdim=True, unbiased=False)
norm_x = (x - mean) / torch.sqrt(var + self.eps)
return self.scale * norm_x + self.shift
class GELU(nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
return 0.5 * x * (1 + torch.tanh(
torch.sqrt(torch.tensor(2.0 / torch.pi)) *
(x + 0.044715 * torch.pow(x, 3))
))
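# GELU above is the tanh approximation used by GPT-2:
#   0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
# which closely tracks the exact form x * Phi(x), where Phi is the standard
# normal CDF.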
class FeedForward(nn.Module):
def __init__(self, cfg):
super().__init__()
self.layers = nn.Sequential(
nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
GELU(),
nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
)
def forward(self, x):
return self.layers(x)
class TransformerBlock(nn.Module):
def __init__(self, cfg):
super().__init__()
self.att = MultiHeadAttention(
d_in=cfg["emb_dim"],
d_out=cfg["emb_dim"],
context_length=cfg["context_length"],
num_heads=cfg["n_heads"],
dropout=cfg["drop_rate"],
qkv_bias=cfg["qkv_bias"])
self.ff = FeedForward(cfg)
self.norm1 = LayerNorm(cfg["emb_dim"])
self.norm2 = LayerNorm(cfg["emb_dim"])
self.drop_shortcut = nn.Dropout(cfg["drop_rate"])
def forward(self, x):
# Shortcut connection for attention block
shortcut = x
x = self.norm1(x)
x = self.att(x) # Shape [batch_size, num_tokens, emb_size]
x = self.drop_shortcut(x)
x = x + shortcut # Add the original input back
# Shortcut connection for feed-forward block
shortcut = x
x = self.norm2(x)
x = self.ff(x)
x = self.drop_shortcut(x)
x = x + shortcut # Add the original input back
return x
class GPTModel(nn.Module,
PyTorchModelHubMixin, # modified to push the model to the hub (https://huggingface.co/docs/hub/en/models-uploading#upload-a-pytorch-model-using-huggingfacehub)
repo_url="https://huggingface.co/Aananda-giri/GPT2-Nepali/",
pipeline_tag="text-generation",
):
def __init__(self, cfg):
super().__init__()
self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
self.drop_emb = nn.Dropout(cfg["drop_rate"])
self.trf_blocks = nn.Sequential(
*[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
self.final_norm = LayerNorm(cfg["emb_dim"])
self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)
def forward(self, in_idx):
batch_size, seq_len = in_idx.shape
tok_embeds = self.tok_emb(in_idx)
pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size]
x = self.drop_emb(x)
x = self.trf_blocks(x)
x = self.final_norm(x)
logits = self.out_head(x)
return logits
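# Illustrative forward pass with random token ids (not part of the app):
#   model = GPTModel(GPT_CONFIG_124M)
#   logits = model(torch.randint(0, GPT_CONFIG_124M["vocab_size"], (1, 8)))
#   logits.shape  # -> torch.Size([1, 8, 50000])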
#####################################
# Chapter 5
#####################################
def text_to_token_ids(text, tokenizer):
encoded = tokenizer.encode(text)
encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
return encoded_tensor
def token_ids_to_text(token_ids, tokenizer):
flat = token_ids.squeeze(0) # remove batch dimension
return tokenizer.decode(flat.tolist())
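# Example round trip (actual token ids depend on the tokenizer):
#   ids = text_to_token_ids("रामले भात", tokenizer)   # tensor of shape (1, n)
#   token_ids_to_text(ids, tokenizer)                  # -> "रामले भात"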
def load_model_n_tokenizer():
model = GPTModel.from_pretrained("Aananda-giri/GPT2-Nepali")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
tokenizer = PreTrainedTokenizerFast.from_pretrained("Aananda-giri/GPT2-Nepali")
return model, tokenizer
def generate(
model,
prompt,
tokenizer,
max_new_tokens,
temperature=0.7,
top_k=50,
top_p=None, # New parameter for nucleus sampling
eos_id=None,
repetition_penalty=1.2,
penalize_len_below=50
):
context_size = GPT_CONFIG_124M['context_length']
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
idx = text_to_token_ids(prompt, tokenizer).to(device)
    if eos_id is None:  # `if not eos_id` would also trigger on a valid id of 0
encoded_endoftext = tokenizer.encode("<|endoftext|>")
eos_id = encoded_endoftext[0] if encoded_endoftext else None
for step in range(max_new_tokens):
idx_cond = idx[:, -context_size:]
with torch.no_grad():
logits = model(idx_cond)
logits = logits[:, -1, :]
        # Apply a CTRL-style repetition penalty to every token already in the
        # context. The sign must be handled explicitly: dividing a *negative*
        # logit by the penalty would make that token more likely, not less.
        for token_id in set(idx[0].tolist()):
            if logits[0, token_id] > 0:
                logits[0, token_id] /= repetition_penalty
            else:
                logits[0, token_id] *= repetition_penalty
        # Suppress the EOS token until the requested minimum length is reached
        # (mask it to -inf, as standard min-length logits processors do)
        if eos_id is not None and step < penalize_len_below:
            logits[0, eos_id] = -torch.inf
# Apply temperature scaling
if temperature > 0.0:
logits = logits / temperature
# Convert logits to probabilities
probs = torch.softmax(logits, dim=-1)
# Apply top-p (nucleus) sampling if specified
if top_p:
sorted_probs, sorted_indices = torch.sort(probs, descending=True)
cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
# Remove tokens with cumulative probability above the threshold
sorted_indices_to_remove = cumulative_probs > top_p
# Shift the indices to the right to keep also the first token above the threshold
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
sorted_indices_to_remove[..., 0] = 0
# Create a mask for indices to remove
indices_to_remove = sorted_indices_to_remove.scatter(dim=-1, index=sorted_indices, src=sorted_indices_to_remove)
probs = probs.masked_fill(indices_to_remove, 0.0)
# Renormalize probabilities
probs = probs / probs.sum(dim=-1, keepdim=True)
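            # Worked example: probs = [0.5, 0.3, 0.15, 0.05] with top_p = 0.9 gives
            # cumulative sums [0.5, 0.8, 0.95, 1.0]; the shift keeps the first token
            # that crosses 0.9, so [0.5, 0.3, 0.15] survive and are renormalized.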
# If top_p is None, apply top-k sampling
elif top_k:
top_probs, top_indices = torch.topk(probs, top_k)
probs = torch.zeros_like(probs).scatter_(-1, top_indices, top_probs)
# Renormalize probabilities
probs = probs / probs.sum(dim=-1, keepdim=True)
# Sample from the filtered distribution
if temperature > 0.0:
idx_next = torch.multinomial(probs, num_samples=1)
else:
idx_next = torch.argmax(probs, dim=-1, keepdim=True)
        if eos_id is not None and idx_next.item() == eos_id:
break
idx = torch.cat((idx, idx_next), dim=1)
text = token_ids_to_text(idx, tokenizer)
return text
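# Illustrative standalone call (parameter values are just examples):
#   model, tokenizer = load_model_n_tokenizer()
#   print(generate(model, "नेपाल एउटा", tokenizer, max_new_tokens=50, top_p=0.9))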
# ==================================================================================
# Load model and tokenizer once at startup
model, tokenizer = load_model_n_tokenizer()
model.eval()
def generate_text(prompt, max_new_tokens, top_k, top_p, temperature, repetition_penalty, penalize_len_below):
# Convert top_k to None if using top_p
if top_p > 0:
top_k = None
else:
top_p = None
    if top_k is not None:
        # Gradio sliders return floats; generate() expects an integer top_k
        top_k = int(top_k)
    output_text = generate(  # generate() already wraps the model call in torch.no_grad()
        model=model,
        prompt=prompt,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        top_p=top_p,    # nucleus sampling takes precedence over top-k when top_p is not None
        top_k=top_k,
        temperature=temperature,  # pass the slider value (was hard-coded to 0.7)
        repetition_penalty=repetition_penalty,
        penalize_len_below=penalize_len_below  # minimum length before EOS is allowed
    )
return output_text
css = """
#bright-textbox {
background-color: #ffeb3b; /* Bright yellow */
color: #000000; /* Black text for contrast */
border: 2px solid #fbc02d; /* Slightly darker yellow for the border */
font-size: 16px;
padding: 10px;
border-radius: 5px;
}
"""
# Create Gradio interface
with gr.Blocks(title="Nepali GPT-2 Text Generator", css=css) as interface:
gr.Markdown("# Nepali GPT-2 Text Generator")
gr.Markdown("Enter Nepali (नेपाली) text to generate content using the custom GPT2-Nepali model.")
with gr.Row():
with gr.Column():
prompt = gr.Textbox(
label="Prompt",
                placeholder="यहाँ नेपाली मा इन्पुट दिनु होस् ... (Please enter Nepali text here...)"
                # value="रामले भात"
)
max_tokens = gr.Slider(minimum=1, maximum=512, value=50, step=1, label="Max New Tokens")
with gr.Row():
with gr.Column():
temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.2, step=0.1, label="Repetition Penalty")
with gr.Column():
                top_k = gr.Slider(minimum=0, maximum=100, value=50, step=1, label="Top K (used only when Top P is 0)")
                top_p = gr.Slider(minimum=0, maximum=1.0, value=0.9, step=0.05, label="Top P (set to 0 to fall back to Top K)")
        min_length = gr.Slider(minimum=1, maximum=200, value=50, step=1, label="Minimum Length (EOS suppressed before this many tokens)")
generate_btn = gr.Button("Generate Text")
with gr.Column():
output = gr.Textbox(label="Generated Text", lines=10)
# Add examples if you have any
gr.Examples(
examples=[
["रामले भात", 50, 50, 0, 0.7, 1.2, 50],
["नेपाल एउटा", 100, 0, 0.9, 0.8, 1.2, 100],
["नेपाल का वर्तमान प्रधानमन्त्री ", 100, 0, 0.9, 0.8, 1.2, 100],
["भारतीय प्रधानमन्त्री ", 100, 0, 0.9, 0.8, 1.2, 100],
["अमिरिकी रास्ट्रपति डोनाल्ड", 100, 0, 0.9, 0.8, 1.2, 100],
],
inputs=[prompt, max_tokens, top_k, top_p, temperature, repetition_penalty, min_length],
outputs=output,
fn=generate_text,
cache_examples=True,
)
    generate_btn.click(
        fn=generate_text,
        # order must match generate_text's signature: top_k comes before top_p
        inputs=[prompt, max_tokens, top_k, top_p, temperature, repetition_penalty, min_length],
        outputs=output
    )
interface.launch()