jordyvl committed
Commit 54bbae9
Parent: 0b94be0

Fixed README.md for preloading models when building; adjustable hyperparameters

Files changed (2):
  1. README.md +3 -0
  2. app.py +43 -16
README.md CHANGED
@@ -7,6 +7,9 @@ sdk: gradio
 sdk_version: 4.26.0
 app_file: app.py
 pinned: false
+preload_from_hub:
+- "microsoft/phi-2"
+- "BAAI/bge-small-en-v1.5"
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
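
The new `preload_from_hub` entries tell the Space build step to download `microsoft/phi-2` and `BAAI/bge-small-en-v1.5` into the local Hugging Face cache, so app.py no longer has to fetch them on first start-up. As a rough illustration (not part of the commit), a check along these lines should resolve both repos from the build-time cache without triggering a download:

# Hypothetical sanity check, not in the commit: with preload_from_hub in place,
# snapshot_download should return the cached snapshot path for each repo
# instead of downloading it when the app starts.
from huggingface_hub import snapshot_download

for repo_id in ("microsoft/phi-2", "BAAI/bge-small-en-v1.5"):
    local_path = snapshot_download(repo_id)
    print(f"{repo_id} -> {local_path}")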
app.py CHANGED
@@ -2,6 +2,7 @@
 # TODO: question samples
 # TEST: with and without GPU instance
 # TODO: visual questions on page image (in same app)?
+## locally check timings of start-up code and see if I cannot pass the parameters to creating vector engine
 
 import torch
 from llama_index.llms.huggingface import HuggingFaceLLM
@@ -14,8 +15,20 @@ from PIL import Image
 
 import gradio as gr
 
-CHEAPMODE=True
-LLM = "HuggingFaceH4/zephyr-7b-alpha" if not CHEAPMODE else "microsoft/phi-2"
+CHEAPMODE = torch.cuda.is_available()
+
+# LLM = "HuggingFaceH4/zephyr-7b-alpha" if not CHEAPMODE else "microsoft/phi-2"
+
+config = {
+    "LLM": "microsoft/phi-2",
+    "embeddings": "BAAI/bge-small-en-v1.5",
+    "similarity_top_k": 2,
+    "context_window": 2048,
+    "max_new_tokens": 150,
+    "temperature": 0.7,
+    "top_k": 5,
+    "top_p": 0.95,
+}
 
 
 def messages_to_prompt(messages):
@@ -39,7 +52,7 @@ def messages_to_prompt(messages):
     return prompt
 
 
-def load_RAG_pipeline():
+def load_RAG_pipeline(config):
     # LLM
     quantization_config = None # dirty fix for CPU/GPU support
     if torch.cuda.is_available():
@@ -53,45 +66,57 @@ def load_RAG_pipeline():
     )
 
     llm = HuggingFaceLLM(
-        model_name=LLM,
-        tokenizer_name=LLM,
+        model_name=config["LLM"],
+        tokenizer_name=config["LLM"],
         query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
-        context_window=3900,
-        max_new_tokens=256,
+        context_window=config["context_window"],
+        max_new_tokens=config["max_new_tokens"],
         model_kwargs={"quantization_config": quantization_config},
         # tokenizer_kwargs={},
-        generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
+        generate_kwargs={"temperature": config["temperature"], "top_k": config["top_k"], "top_p": config["top_p"]},
         messages_to_prompt=messages_to_prompt,
         device_map="auto",
     )
 
     # Llama-index
     Settings.llm = llm
-    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
+    Settings.embed_model = HuggingFaceEmbedding(model_name=config["embeddings"])
     # Settings.chunk_size = 512
     # Settings.chunk_overlap = 50
 
     # raw data
     documents = SimpleDirectoryReader("assets/txts").load_data()
     vector_index = VectorStoreIndex.from_documents(documents)
+
     # vector_index.persist(persist_dir="vectors")
     # https://docs.llamaindex.ai/en/v0.10.17/understanding/storing/storing.html
 
     # summary_index = SummaryIndex.from_documents(documents)
-    query_engine = vector_index.as_query_engine(response_mode="compact", similarity_top_k=3)
+    query_engine = vector_index.as_query_engine(response_mode="compact", similarity_top_k=config["similarity_top_k"])
     return query_engine
 
 
-query_engine = load_RAG_pipeline()
+default_query_engine = load_RAG_pipeline(config)
 
 
 # These are placeholder functions to simulate the behavior of the RAG setup.
 # You would need to implement these with the actual logic to retrieve and generate answers based on the document.
-def get_answer(question, temperature, nucleus_sampling, max_tokens):
+def get_answer(question, temperature, nucleus_sampling, max_tokens, query_engine=default_query_engine):
     # Here you should implement the logic to generate an answer based on the question and the document.
     # For example, you could use a machine learning model for RAG.
     # answer = "This is a placeholder answer."
     # https://docs.llamaindex.ai/en/stable/module_guides/supporting_modules/settings/#setting-local-configurations
+
+    # if temperature or nucleus sampling or max_tokens != as in config, recall query engine
+    if (
+        temperature != config["temperature"]
+        or nucleus_sampling != config["top_p"]
+        or max_tokens != config["max_new_tokens"]
+    ):
+        config["temperature"] = temperature
+        config["top_p"] = nucleus_sampling
+        config["max_new_tokens"] = max_tokens
+        query_engine = load_RAG_pipeline(config)
     response = query_engine.query(question)
     return response
 
@@ -107,6 +132,7 @@ def get_answer_page(response):
 
 # Create the gr.Interface function
 def ask_my_thesis(question, temperature, nucleus_sampling, max_tokens):
+    print(f"Got Q: {question}")
     answer = get_answer(question, temperature, nucleus_sampling, max_tokens)
     image, answer_page = get_answer_page(answer)
     return answer, image, answer_page
@@ -122,15 +148,16 @@ iface = gr.Interface(
     inputs=[
         gr.Textbox(label="Question", placeholder="Type your question here..."),
         gr.Slider(0, 1, value=0.7, label="Temperature"),
-        gr.Slider(0, 1, value=0.9, label="Nucleus Sampling"),
-        gr.Slider(1, 500, value=100, label="Max Generated Number of Tokens"),
+        gr.Slider(0, 1, value=0.95, label="Nucleus Sampling"),
+        gr.Slider(1, 500, value=150, label="Max Generated Number of Tokens"),
     ],
     outputs=[gr.Textbox(label="Answer"), output_image, gr.Label()],
     title="Ask my thesis: Intelligent Automation for AI-Driven Document Understanding",
-    description=r"""Chat with the thesis manuscript: ask questions and receive answers with multimodal references (WIP).
+    description=r"""Chat with the thesis manuscript by asking questions and receive answers with multimodal references.
 
-    Spoiler: RAG application with LLM and embedding vector store can be quite slow on a 290 page document ;D
+    Spoiler: a RAG application with a >1B LLM and vector store can be quite slow on a 290 page document :hourglass:
     """,
+    css="body { background-image: url('https://ideogram.ai/api/images/direct/cc3Um6ClQkWJpVdXx6pWVA.png'); }",
     allow_flagging="never",
 )
 # https://github.com/gradio-app/gradio/issues/4309
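
One side note on the new `get_answer` logic: whenever a slider value differs from `config`, it calls `load_RAG_pipeline(config)` again, which re-embeds and re-indexes all pages just to change sampling settings; that is the cost the new TODO about start-up timings points at. A lighter alternative is sketched below. It is a hypothetical helper, not part of the commit, and it assumes llama-index 0.10.x, where `Settings` lives in `llama_index.core` and `HuggingFaceLLM` exposes `max_new_tokens` and `generate_kwargs` as mutable fields:

# Hypothetical helper (not in the commit): update only the generation hyperparameters
# on the already-loaded HuggingFaceLLM and keep reusing the existing vector index,
# instead of rebuilding the whole RAG pipeline on every slider change.
from llama_index.core import Settings


def update_generation_params(temperature, nucleus_sampling, max_tokens):
    llm = Settings.llm  # the HuggingFaceLLM created in load_RAG_pipeline
    llm.max_new_tokens = int(max_tokens)
    llm.generate_kwargs.update({"temperature": temperature, "top_p": nucleus_sampling})
    # keep the module-level config dict in sync so get_answer's comparison still holds
    config.update({"temperature": temperature, "top_p": nucleus_sampling, "max_new_tokens": int(max_tokens)})

Combined with un-commenting the `vector_index.persist(persist_dir="vectors")` line and reloading the persisted index at start-up (as described in the linked llama-index storing docs), this would keep slider changes from paying the full indexing cost again.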