# -*- coding: utf-8 -*-
"""Generate Tamil text continuations with a locally stored GPT-2 checkpoint.

The script takes one document from the Tamil split of the OSCAR corpus as the
prompt, loads the model and tokenizer found in ``MODEL_DIR`` and prints five
generated continuations.
"""
import locale

from datasets import load_dataset
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, pipeline

# Show the encoding Python will use for console/file I/O; useful when the
# Tamil output below does not render correctly.
print(locale.getpreferredencoding())

MODEL_DIR = "/home/deepak/sources/gpt2-tamil/gpt2-tamil/"

# Index of the OSCAR record used as the prompt.
SAMPLE_INDEX = 232

# The prompt is taken from the dataset for now; it will be replaced by a
# manually typed prompt once rendering of the Tamil font is sorted out.
dataset = load_dataset("oscar", "unshuffled_deduplicated_ta", split="train")
tamil_prompt = dataset[SAMPLE_INDEX]['text']
print(tamil_prompt)

# Load the configuration and the trained weights.  ``from_config`` would only
# build the architecture with randomly initialised parameters, so the
# checkpoint in MODEL_DIR would never be used; ``from_pretrained`` loads it.
# NOTE(review): if MODEL_DIR only contains Flax weights, ``from_flax=True``
# presumably has to be passed as well -- confirm against the directory content.
config = AutoConfig.from_pretrained(MODEL_DIR)
model = AutoModelForCausalLM.from_pretrained(MODEL_DIR, config=config)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# NOTE(review): ``max_length`` includes the prompt tokens; a long OSCAR
# document may leave no room for newly generated text -- verify for this sample.
model_output = generator(tamil_prompt, max_length=30, num_return_sequences=5)
print(model_output)