In [None]:
pip install transformers datasets accelerate

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset

# Load GPT-Neo tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")

# Set pad token to eos token
tokenizer.pad_token = tokenizer.eos_token

# Load the subset dataset
dataset = load_dataset("json", data_files="financial_accounting_subset.json")

# Inspect the first row to verify the structure of the dataset
first_row = next(iter(dataset["train"]))
print("Keys in the dataset:", first_row.keys())

# Define input and output columns based on your dataset structure
# Adjust these as necessary for your task
input_column = "Description"  # Example input: financial transaction description
output_column = "Category"  # Example output: transaction category

# Check if the specified columns exist in the dataset
if input_column not in first_row or output_column not in first_row:
    raise KeyError(
        f"Columns '{input_column}' or '{output_column}' not found in dataset. "
        f"Available keys: {list(first_row.keys())}"
    )

# Define the tokenization function
def tokenize_function(examples):
    inputs = examples[input_column]
    targets = examples[output_column]
    model_inputs = tokenizer(
        inputs,
        truncation=True,
        padding="max_length",
        max_length=512
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            truncation=True,
            padding="max_length",
            max_length=512
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Optional: Save the tokenized dataset for inspection
tokenized_dataset.save_to_disk("./tokenized_dataset")

print("Tokenization complete!")


Keys in the dataset: dict_keys(['Date', 'Account', 'Description', 'Debit', 'Credit', 'Category', 'Transaction_Type', 'Customer_Vendor', 'Payment_Method', 'Reference'])


Map:   0%|          | 0/50001 [00:00<?, ? examples/s]



Saving the dataset (0/1 shards):   0%|          | 0/50001 [00:00<?, ? examples/s]

Tokenization complete!


In [6]:
# Define a custom data collator to handle loss computation
class CustomDataCollator(DataCollatorForSeq2Seq):
    def __call__(self, features):
        # Ensure correct padding and label creation
        batch = super().__call__(features)
        labels = batch['input_ids'].clone()
        labels[labels == tokenizer.pad_token_id] = -100  # Ignore padding tokens in the loss computation
        batch['labels'] = labels
        return batch

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./gpt_fine_tuned_model",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=500,
    do_train=True,
    do_eval=True,
    report_to="none",  # Change to "wandb" if using Weights & Biases
)

# Initialize the Trainer with the custom data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],  # Use the tokenized training dataset
    eval_dataset=tokenized_dataset["train"],   # Optional: if you have a validation set
    data_collator=CustomDataCollator(tokenizer),  # Use custom data collator
)

# Start training
trainer.train()

# Save the fine-tuned model
trainer.save_model("./gpt_fine_tuned_model")


Step,Training Loss
500,5.6296
1000,5.4991
1500,5.4651
2000,5.4455
2500,5.4194
3000,5.4338
3500,5.4172
4000,5.3853
4500,5.3852
5000,5.3869


Step,Training Loss
500,5.6296
1000,5.4991
1500,5.4651
2000,5.4455
2500,5.4194
3000,5.4338
3500,5.4172
4000,5.3853
4500,5.3852
5000,5.3869


In [7]:
from transformers import AutoTokenizer

# Save model and tokenizer
model.save_pretrained("path_to_save_model")
tokenizer.save_pretrained("path_to_save_model")


('path_to_save_model/tokenizer_config.json',
 'path_to_save_model/special_tokens_map.json',
 'path_to_save_model/vocab.json',
 'path_to_save_model/merges.txt',
 'path_to_save_model/added_tokens.json',
 'path_to_save_model/tokenizer.json')

In [None]:
results = trainer.evaluate()
print("Evaluation Results:", results)


In [None]:
test_inputs = [
    "Describe a transaction for a purchase at a retail store.",  # Example prompt
    "Explain a bank deposit transaction.",  # Another example
]

for input_text in test_inputs:
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs, max_length=50)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Input: {input_text}")
    print(f"Generated Output: {generated_text}")

In [None]:
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="path_to_save_model",  # Path where the model is saved
    repo_id="your_username/your_model_name",
    repo_type="model"
)


In [None]:
pip install fastapi uvicorn

In [None]:
from fastapi import FastAPI, Request
from transformers import AutoModelForCausalLM, AutoTokenizer

app = FastAPI()

model = AutoModelForCausalLM.from_pretrained("path_to_save_model")
tokenizer = AutoTokenizer.from_pretrained("path_to_save_model")

@app.post("/generate")
async def generate(request: Request):
    data = await request.json()
    input_text = data.get("input")
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs, max_length=50)
    return {"output": tokenizer.decode(outputs[0], skip_special_tokens=True)}

# Run using uvicorn
# uvicorn your_script_name:app --reload

In [None]:
pip install streamlit

In [None]:
import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("path_to_save_model")
tokenizer = AutoTokenizer.from_pretrained("path_to_save_model")

st.title("Accounting GPT")
input_text = st.text_input("Enter a prompt:")
if st.button("Generate"):
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs, max_length=50)
    st.write(tokenizer.decode(outputs[0], skip_special_tokens=True))
