Spaces:

jordyvl
/

ask_my_thesis

Paused

App Files Community

jordyvl committed on Apr 19

Commit

50a7785

•

1 Parent(s): 54bbae9

trying with llama3

Browse files

Files changed (1) hide show

app.py +39 -16

app.py CHANGED Viewed

@@ -2,7 +2,7 @@
 # TODO: question samples
 # TEST: with and without GPU instance
 # TODO: visual questions on page image (in same app)?
-## locally check timings of start-up code and see whether the parameters can be passed when creating the vector engine
 import torch
 from llama_index.llms.huggingface import HuggingFaceLLM
@@ -20,17 +20,41 @@ CHEAPMODE = torch.cuda.is_available()
 # LLM = "HuggingFaceH4/zephyr-7b-alpha" if not CHEAPMODE else "microsoft/phi-2"
 config = {
-    "LLM": "microsoft/phi-2",
     "embeddings": "BAAI/bge-small-en-v1.5",
     "similarity_top_k": 2,
-    "context_window": 2048,
     "max_new_tokens": 150,
     "temperature": 0.7,
     "top_k": 5,
     "top_p": 0.95,
 }
 def messages_to_prompt(messages):
     prompt = ""
     for message in messages:
@@ -54,7 +78,7 @@ def messages_to_prompt(messages):
 def load_RAG_pipeline(config):
     # LLM
-    quantization_config = None # dirty fix for CPU/GPU support
     if torch.cuda.is_available():
         from transformers import BitsAndBytesConfig
@@ -81,17 +105,17 @@ def load_RAG_pipeline(config):
     # Llama-index
     Settings.llm = llm
     Settings.embed_model = HuggingFaceEmbedding(model_name=config["embeddings"])
-    # Settings.chunk_size = 512
-    # Settings.chunk_overlap = 50
     # raw data
     documents = SimpleDirectoryReader("assets/txts").load_data()
     vector_index = VectorStoreIndex.from_documents(documents)
     # vector_index.persist(persist_dir="vectors")
     # https://docs.llamaindex.ai/en/v0.10.17/understanding/storing/storing.html
-    # summary_index = SummaryIndex.from_documents(documents)
     query_engine = vector_index.as_query_engine(response_mode="compact", similarity_top_k=config["similarity_top_k"])
     return query_engine
@@ -131,7 +155,9 @@ def get_answer_page(response):
 # Create the gr.Interface function
-def ask_my_thesis(question, temperature, nucleus_sampling, max_tokens):
     print(f"Got Q: {question}")
     answer = get_answer(question, temperature, nucleus_sampling, max_tokens)
     image, answer_page = get_answer_page(answer)
@@ -142,22 +168,19 @@ def ask_my_thesis(question, temperature, nucleus_sampling, max_tokens):
 output_image = gr.Image(label="Answer Page")
 # examples
 iface = gr.Interface(
     fn=ask_my_thesis,
-    inputs=[
-        gr.Textbox(label="Question", placeholder="Type your question here..."),
         gr.Slider(0, 1, value=0.7, label="Temperature"),
         gr.Slider(0, 1, value=0.95, label="Nucleus Sampling"),
         gr.Slider(1, 500, value=150, label="Max Generated Number of Tokens"),
     ],
     outputs=[gr.Textbox(label="Answer"), output_image, gr.Label()],
-    title="Ask my thesis: Intelligent Automation for AI-Driven Document Understanding",
-    description=r"""Chat with the thesis manuscript by asking questions and receive answers with multimodal references.
-    Spoiler: a RAG application with a >1B LLM and vector store can be quite slow on a 290 page document :hourglass:
-    """,
-    css="body { background-image: url('https://ideogram.ai/api/images/direct/cc3Um6ClQkWJpVdXx6pWVA.png'); }",
     allow_flagging="never",
 )
 # https://github.com/gradio-app/gradio/issues/4309

 # TODO: question samples
 # TEST: with and without GPU instance
 # TODO: visual questions on page image (in same app)?
+# expose more parameters
 import torch
 from llama_index.llms.huggingface import HuggingFaceLLM
 # LLM = "HuggingFaceH4/zephyr-7b-alpha" if not CHEAPMODE else "microsoft/phi-2"
 config = {
+    "LLM": "meta-llama/Meta-Llama-3-8B",
+    # "LLM": "microsoft/phi-2",
     "embeddings": "BAAI/bge-small-en-v1.5",
     "similarity_top_k": 2,
+    "context_window": 4048,
     "max_new_tokens": 150,
     "temperature": 0.7,
     "top_k": 5,
     "top_p": 0.95,
+    "chunk_size": 512,
+    "chunk_overlap": 50,
 }
+def center_element(el):
+    return f"<div style='text-align: center;'>{el}</div>"
+title = "Ask my thesis: Intelligent Automation for AI-Driven Document Understanding"
+title = center_element(title)
+description = """Chat with the thesis manuscript by asking questions and receive answers with reference to the page.
+    <div class="span1">
+    <a href="https://jordy-vl.github.io/assets/phdthesis/VanLandeghem_Jordy_PhD-thesis.pdf">
+        <img src="https://ideogram.ai/api/images/direct/cc3Um6ClQkWJpVdXx6pWVA.png"
+              title="Thesis.pdf" alt="Ideogram image generated with prompt engineering"/></a>
+    </div>
+    Technology used: [Llama-index](https://www.llamaindex.ai/), OS LLMs from HuggingFace
+    Spoiler: a RAG application with a >1B LLM and online vector store can be quite slow on a 290 page document ⏳
+    """
+# width="250"
+description = center_element(description)
 def messages_to_prompt(messages):
     prompt = ""
     for message in messages:
 def load_RAG_pipeline(config):
     # LLM
+    quantization_config = None  # dirty fix for CPU/GPU support
     if torch.cuda.is_available():
         from transformers import BitsAndBytesConfig
     # Llama-index
     Settings.llm = llm
     Settings.embed_model = HuggingFaceEmbedding(model_name=config["embeddings"])
+    Settings.chunk_size = config["chunk_size"]
+    Settings.chunk_overlap = config["chunk_overlap"]
     # raw data
     documents = SimpleDirectoryReader("assets/txts").load_data()
     vector_index = VectorStoreIndex.from_documents(documents)
+    # summary_index = SummaryIndex.from_documents(documents)
     # vector_index.persist(persist_dir="vectors")
     # https://docs.llamaindex.ai/en/v0.10.17/understanding/storing/storing.html
     query_engine = vector_index.as_query_engine(response_mode="compact", similarity_top_k=config["similarity_top_k"])
     return query_engine
 # Create the gr.Interface function
+def ask_my_thesis(
+    question, temperature=config["temperature"], nucleus_sampling=config["top_p"], max_tokens=config["max_new_tokens"]
+):
     print(f"Got Q: {question}")
     answer = get_answer(question, temperature, nucleus_sampling, max_tokens)
     image, answer_page = get_answer_page(answer)
 output_image = gr.Image(label="Answer Page")
 # examples
+examples = [["Who is Jordy Van Landeghem"], []]
 iface = gr.Interface(
     fn=ask_my_thesis,
+    inputs=[gr.Textbox(label="Question", placeholder="Type your question here...")],
+    additional_inputs=[
         gr.Slider(0, 1, value=0.7, label="Temperature"),
         gr.Slider(0, 1, value=0.95, label="Nucleus Sampling"),
         gr.Slider(1, 500, value=150, label="Max Generated Number of Tokens"),
     ],
     outputs=[gr.Textbox(label="Answer"), output_image, gr.Label()],
+    title=title,
+    description=description,
     allow_flagging="never",
 )
 # https://github.com/gradio-app/gradio/issues/4309