Spaces:

jordyvl
/

ask_my_thesis

Paused

App Files Files Community

ask_my_thesis / app.py

jordyvl

tryng with llama3

50a7785 8 months ago

raw

history blame

7.17 kB

	# TODO: return all pages used to form answer
	# TODO: question samples
	# TEST: with and without GPU instance
	# TODO: visual questions on page image (in same app)?
	# expose more parameters

	import torch
	from llama_index.llms.huggingface import HuggingFaceLLM
	from llama_index.embeddings.huggingface import HuggingFaceEmbedding
	from llama_index.core import SimpleDirectoryReader
	from llama_index.core import VectorStoreIndex, SummaryIndex
	from llama_index.core.prompts import PromptTemplate
	from llama_index.core import Settings
	from PIL import Image

	import gradio as gr

	CHEAPMODE = torch.cuda.is_available()

	# LLM = "HuggingFaceH4/zephyr-7b-alpha" if not CHEAPMODE else "microsoft/phi-2"

	config = {
	"LLM": "meta-llama/Meta-Llama-3-8B",
	# "LLM": "microsoft/phi-2",
	"embeddings": "BAAI/bge-small-en-v1.5",
	"similarity_top_k": 2,
	"context_window": 4048,
	"max_new_tokens": 150,
	"temperature": 0.7,
	"top_k": 5,
	"top_p": 0.95,
	"chunk_size": 512,
	"chunk_overlap": 50,
	}


	def center_element(el):
	return f"<div style='text-align: center;'>{el}</div>"


	title = "Ask my thesis: Intelligent Automation for AI-Driven Document Understanding"
	title = center_element(title)
	description = """Chat with the thesis manuscript by asking questions and receive answers with reference to the page.

	<div class="span1">
	<a href="https://jordy-vl.github.io/assets/phdthesis/VanLandeghem_Jordy_PhD-thesis.pdf">
	<img src="https://ideogram.ai/api/images/direct/cc3Um6ClQkWJpVdXx6pWVA.png"
	title="Thesis.pdf" alt="Ideogram image generated with prompt engineering"/></a>
	</div>

	Technology used: [Llama-index](https://www.llamaindex.ai/), OS LLMs from HuggingFace

	Spoiler: a RAG application with a >1B LLM and online vector store can be quite slow on a 290 page document ⏳
	"""
	# width="250"
	description = center_element(description)

	def messages_to_prompt(messages):
	prompt = ""
	for message in messages:
	if message.role == "system":
	m = "You are an expert in the research field of document understanding, bayesian deep learning and neural networks."
	prompt += f"<\|system\|>\n{m}</s>\n"
	elif message.role == "user":
	prompt += f"<\|user\|>\n{message.content}</s>\n"
	elif message.role == "assistant":
	prompt += f"<\|assistant\|>\n{message.content}</s>\n"

	# ensure we start with a system prompt, insert blank if needed
	if not prompt.startswith("<\|system\|>\n"):
	prompt = "<\|system\|>\n</s>\n" + prompt

	# add final assistant prompt
	prompt = prompt + "<\|assistant\|>\n"

	return prompt


	def load_RAG_pipeline(config):
	# LLM
	quantization_config = None # dirty fix for CPU/GPU support
	if torch.cuda.is_available():
	from transformers import BitsAndBytesConfig

	quantization_config = BitsAndBytesConfig(
	load_in_4bit=True,
	bnb_4bit_compute_dtype=torch.float16,
	bnb_4bit_quant_type="nf4",
	bnb_4bit_use_double_quant=True,
	)

	llm = HuggingFaceLLM(
	model_name=config["LLM"],
	tokenizer_name=config["LLM"],
	query_wrapper_prompt=PromptTemplate("<\|system\|>\n</s>\n<\|user\|>\n{query_str}</s>\n<\|assistant\|>\n"),
	context_window=config["context_window"],
	max_new_tokens=config["max_new_tokens"],
	model_kwargs={"quantization_config": quantization_config},
	# tokenizer_kwargs={},
	generate_kwargs={"temperature": config["temperature"], "top_k": config["top_k"], "top_p": config["top_p"]},
	messages_to_prompt=messages_to_prompt,
	device_map="auto",
	)

	# Llama-index
	Settings.llm = llm
	Settings.embed_model = HuggingFaceEmbedding(model_name=config["embeddings"])
	Settings.chunk_size = config["chunk_size"]
	Settings.chunk_overlap = config["chunk_overlap"]

	# raw data
	documents = SimpleDirectoryReader("assets/txts").load_data()
	vector_index = VectorStoreIndex.from_documents(documents)
	# summary_index = SummaryIndex.from_documents(documents)

	# vector_index.persist(persist_dir="vectors")
	# https://docs.llamaindex.ai/en/v0.10.17/understanding/storing/storing.html

	query_engine = vector_index.as_query_engine(response_mode="compact", similarity_top_k=config["similarity_top_k"])
	return query_engine


	default_query_engine = load_RAG_pipeline(config)


	# These are placeholder functions to simulate the behavior of the RAG setup.
	# You would need to implement these with the actual logic to retrieve and generate answers based on the document.
	def get_answer(question, temperature, nucleus_sampling, max_tokens, query_engine=default_query_engine):
	# Here you should implement the logic to generate an answer based on the question and the document.
	# For example, you could use a machine learning model for RAG.
	# answer = "This is a placeholder answer."
	# https://docs.llamaindex.ai/en/stable/module_guides/supporting_modules/settings/#setting-local-configurations

	# if temperature or nucleus sampling or max_tokens != as in config, recall query engine
	if (
	temperature != config["temperature"]
	or nucleus_sampling != config["top_p"]
	or max_tokens != config["max_new_tokens"]
	):
	config["temperature"] = temperature
	config["top_p"] = nucleus_sampling
	config["max_new_tokens"] = max_tokens
	query_engine = load_RAG_pipeline(config)
	response = query_engine.query(question)
	return response


	def get_answer_page(response):
	# Implement logic to retrieve the page number or an image of the page with the answer.
	# best image
	best_match = response.source_nodes[0].metadata["file_path"]
	answer_page = int(best_match[-8:-4])
	image = Image.open(best_match.replace("txt", "png"))
	return image, f"Navigate to page {answer_page}"


	# Create the gr.Interface function
	def ask_my_thesis(
	question, temperature=config["temperature"], nucleus_sampling=config["top_p"], max_tokens=config["max_new_tokens"]
	):
	print(f"Got Q: {question}")
	answer = get_answer(question, temperature, nucleus_sampling, max_tokens)
	image, answer_page = get_answer_page(answer)
	return answer, image, answer_page


	# Set up the interface options based on the design in the image.
	output_image = gr.Image(label="Answer Page")

	# examples
	examples = [["Who is Jordy Van Landeghem"], []]

	iface = gr.Interface(
	fn=ask_my_thesis,
	inputs=[gr.Textbox(label="Question", placeholder="Type your question here...")],
	additional_inputs=[
	gr.Slider(0, 1, value=0.7, label="Temperature"),
	gr.Slider(0, 1, value=0.95, label="Nucleus Sampling"),
	gr.Slider(1, 500, value=150, label="Max Generated Number of Tokens"),
	],
	outputs=[gr.Textbox(label="Answer"), output_image, gr.Label()],
	title=title,
	description=description,
	allow_flagging="never",
	)
	# https://github.com/gradio-app/gradio/issues/4309

	# https://discuss.huggingface.co/t/add-background-image/16381/4 background image
	# Start the application.
	if __name__ == "__main__":
	iface.launch()