# TODO: return all pages used to form answer
import torch
from transformers import BitsAndBytesConfig
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex, SummaryIndex
from llama_index.core.prompts import PromptTemplate
from llama_index.core import Settings
from PIL import Image
import gradio as gr


def messages_to_prompt(messages):
    """Render chat messages into the ``<|role|>`` tagged prompt format.

    Each message must expose ``role`` and ``content``. The content of
    system messages is deliberately ignored and replaced by a fixed
    domain-specific instruction. Messages with any other role than
    system/user/assistant are skipped.

    Returns:
        The prompt string, guaranteed to start with a system section and
        to end with an open assistant section.
    """
    system_text = "You are an expert in the research field of document understanding, bayesian deep learning and neural networks."
    parts = []
    for message in messages:
        if message.role == "system":
            parts.append(f"<|system|>\n{system_text}\n")
        elif message.role == "user":
            parts.append(f"<|user|>\n{message.content}\n")
        elif message.role == "assistant":
            parts.append(f"<|assistant|>\n{message.content}\n")
    prompt = "".join(parts)

    # ensure we start with a system prompt, insert blank if needed
    if not prompt.startswith("<|system|>\n"):
        prompt = "<|system|>\n\n" + prompt

    # add final assistant prompt
    return prompt + "<|assistant|>\n"


def load_RAG_pipeline():
    """Build the retrieval-augmented query engine over ``assets/txts``.

    Loads a 4-bit quantized Zephyr model as the LLM, registers it together
    with a BGE embedding model on the global LlamaIndex ``Settings``,
    indexes every document found in ``assets/txts`` and returns a query
    engine that retrieves the 3 most similar chunks per question.
    """
    # LLM
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )
    llm = HuggingFaceLLM(
        model_name="HuggingFaceH4/zephyr-7b-alpha",
        tokenizer_name="HuggingFaceH4/zephyr-7b-alpha",
        query_wrapper_prompt=PromptTemplate("<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n"),
        context_window=3900,
        max_new_tokens=256,
        model_kwargs={"quantization_config": quantization_config},
        generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
        messages_to_prompt=messages_to_prompt,
        device_map="auto",
    )

    # Llama-index global configuration; chunk_size / chunk_overlap are left
    # at the library defaults.
    Settings.llm = llm
    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

    # raw data
    documents = SimpleDirectoryReader("assets/txts").load_data()
    # The index is rebuilt on every start; for persisting it see
    # https://docs.llamaindex.ai/en/v0.10.17/understanding/storing/storing.html
    vector_index = VectorStoreIndex.from_documents(documents)
    query_engine = vector_index.as_query_engine(response_mode="compact", similarity_top_k=3)
    return query_engine


query_engine = load_RAG_pipeline()


def get_answer(question, temperature, nucleus_sampling, max_tokens):
    """Query the RAG pipeline using the generation settings from the UI.

    Args:
        question: Natural-language question to answer.
        temperature: Sampling temperature; ``0`` switches sampling off.
        nucleus_sampling: ``top_p`` value used for nucleus sampling.
        max_tokens: Maximum number of newly generated tokens.

    Returns:
        The response object produced by ``query_engine.query``.
    """
    # https://docs.llamaindex.ai/en/stable/module_guides/supporting_modules/settings/#setting-local-configurations
    # The settings are applied to the shared LLM registered in
    # load_RAG_pipeline, so they persist until the next call overrides them.
    llm = Settings.llm
    temperature = float(temperature)
    use_sampling = temperature > 0
    llm.generate_kwargs["do_sample"] = use_sampling
    llm.generate_kwargs["top_p"] = float(nucleus_sampling)
    if use_sampling:
        llm.generate_kwargs["temperature"] = temperature
    else:
        # Sampling needs a strictly positive temperature, so a temperature
        # of 0 is expressed by disabling sampling and dropping the key.
        llm.generate_kwargs.pop("temperature", None)
    llm.max_new_tokens = int(max_tokens)

    return query_engine.query(question)


def get_answer_page(response):
    """Return the page image and page number of the best-matching source.

    Only the first source node is used. The page number is parsed from the
    four characters that precede the file extension of the node's
    ``file_path``; the image path is derived by replacing every ``txt`` in
    that path with ``png`` (so ``assets/txts/x.txt`` maps to
    ``assets/pngs/x.png``).

    Returns:
        ``(image, page_number)``, or ``(None, None)`` when the response
        carries no source nodes.
    """
    if not response.source_nodes:
        return None, None

    # best image
    best_match = response.source_nodes[0].metadata["file_path"]
    answer_page = float(int(best_match[-8:-4]))
    image = Image.open(best_match.replace("txt", "png"))
    return image, answer_page


# Create the gr.Interface function
def ask_my_thesis(question, temperature, nucleus_sampling, max_tokens):
    """Gradio callback: answer ``question`` and locate its source page.

    Returns:
        ``(answer_text, page_image, page_number)`` matching the three
        output components of the interface.
    """
    answer = get_answer(question, temperature, nucleus_sampling, max_tokens)
    image, answer_page = get_answer_page(answer)
    # Hand plain text to the Textbox instead of the response object.
    return str(answer), image, answer_page


# Set up the interface options based on the design in the image.
# Output component that shows the page the answer was taken from.
output_image = gr.Image(label="Answer Page")

# examples
# Input widgets, in the positional order expected by ask_my_thesis.
question_box = gr.Textbox(label="Question", placeholder="Type your question here...")
temperature_slider = gr.Slider(0, 1, value=0.7, label="Temperature")
nucleus_slider = gr.Slider(0, 1, value=0.9, label="Nucleus Sampling")
max_tokens_slider = gr.Slider(1, 500, value=100, label="Max Generated Number of Tokens")

# Output widgets: answer text, page image, page number.
answer_box = gr.Textbox(label="Answer")
page_label = gr.Label()

iface = gr.Interface(
    fn=ask_my_thesis,
    inputs=[question_box, temperature_slider, nucleus_slider, max_tokens_slider],
    outputs=[answer_box, output_image, page_label],
    title="Ask my thesis: Intelligent Automation for AI-Driven Document Understanding",
    description=r"""Chat with the thesis manuscript: ask questions and receive answers with multimodal references (WIP). Spoiler: RAG application with LLM and embedding vector store can be quite slow on a 290 page document ;D """,
    allow_flagging="never",
)

# https://github.com/gradio-app/gradio/issues/4309
# https://discuss.huggingface.co/t/add-background-image/16381/4 background image

# Start the application.
if __name__ == "__main__":
    iface.launch()