"""Chainlit app that serves the `lakshyaag/llama38binstruct_summarize` LoRA adapter
on top of NousResearch/Meta-Llama-3-8B-Instruct to turn legal text into plain-language
summaries."""

import os

import chainlit as cl
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)
from peft import PeftModel, PeftConfig

import bitsandbytes as bnb  # noqa: F401  # backend required for 4-bit loading

# Pin the app to the first visible GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

INSTRUCTION_PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Please convert the following legal content into a human-readable summary<|eot_id|><|start_header_id|>user<|end_header_id|>

[LEGAL_DOC]
{input}
[END_LEGAL_DOC]<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

RESPONSE_TEMPLATE = """
{summary}<|eot_id|>
"""

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
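
# With this config the 8B base model is loaded as 4-bit NF4 weights with double
# quantization and fp16 compute, which brings the weights down to roughly 5-6 GB of
# VRAM (the exact footprint depends on the GPU and library versions).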

# The adapter config records which base checkpoint the LoRA weights were trained
# against (config.base_model_name_or_path); it is loaded here mainly as a sanity check.
config = PeftConfig.from_pretrained("lakshyaag/llama38binstruct_summarize")

base_model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Meta-Llama-3-8B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",
)

# Attach the LoRA adapter. `merge_and_unload()` returns the merged model, so the return
# value must be captured; note that merging into a 4-bit-quantized base is not supported
# by every peft version, in which case the un-merged PeftModel can be used as-is.
model = PeftModel.from_pretrained(base_model, "lakshyaag/llama38binstruct_summarize")
model = model.merge_and_unload()

# `device_map="auto"` already placed the quantized weights on the GPU, and calling
# `.to("cuda")` on a bitsandbytes 4-bit model is not supported, so no explicit move is
# performed here.

tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B-Instruct")

# Mirror the fine-tuning setup: reuse EOS as the pad token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


def create_prompt(sample, include_response=False):
    """
    Parameters:
    - sample: dict representing a row of the dataset
    - include_response: bool

    Functionality:
    Builds the Python str `full_prompt`. If `include_response` is True, the reference
    summary and the end-of-text token are appended (useful for training); otherwise the
    prompt stops at the assistant header so the model can generate the summary (useful
    for inference and testing).

    Returns:
    - full_prompt: str
    """

    full_prompt = INSTRUCTION_PROMPT_TEMPLATE.format(input=sample["original_text"])

    if include_response:
        full_prompt += RESPONSE_TEMPLATE.format(summary=sample["reference_summary"])
        # Only a complete (prompt + response) training example is closed with the
        # end-of-text token; at inference the model must generate past the assistant
        # header, so the token is omitted there.
        full_prompt += "<|end_of_text|>"

    return full_prompt
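
# Example (hypothetical document text) of how the helper is used for inference:
#
#   prompt = create_prompt({"original_text": "The Licensee shall not redistribute..."})
#
# which yields the system and user turns followed by an open assistant header, ready to
# be completed by `model.generate`.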


@cl.on_message
async def main(message: cl.Message):
    # Treat the incoming chat message as the legal text to summarize.
    prompt = create_prompt({"original_text": message.content}, include_response=False)

    encoded_input = tokenizer(prompt, return_tensors="pt")

    # Move the input ids and attention mask to the same device as the model.
    model_inputs = encoded_input.to("cuda" if torch.cuda.is_available() else "cpu")

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=256,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
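
    # `do_sample=True` gives varied summaries across runs; for reproducible output one
    # could switch to greedy decoding (do_sample=False) or set a fixed seed, at the
    # cost of some diversity.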

    decoded_output = tokenizer.batch_decode(generated_ids)

    # Keep only the text after the final header token (the assistant's turn) and strip
    # the trailing special tokens from the decoded string.
    response = (
        decoded_output[0]
        .split("<|end_header_id|>")[-1]
        .replace("<|eot_id|>", "")
        .replace("<|end_of_text|>", "")
        .strip()
    )

    msg = cl.Message(content=response)

    await msg.send()


if __name__ == "__main__":
    # Normally launched with `chainlit run <this file>`; this starts the same server.
    from chainlit.cli import run_chainlit

    run_chainlit(__file__)