README.md
CHANGED
@@ -10,6 +10,7 @@ pinned: false
 preload_from_hub:
 - "microsoft/phi-2"
 - "BAAI/bge-small-en-v1.5"
+- "HuggingFaceH4/zephyr-7b-alpha"
+- "meta-llama/Meta-Llama-3-8B"
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
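For context: `preload_from_hub` tells the Space builder to download the listed Hub repositories into the image cache at build time, so the first request does not pay the model-download cost. A rough sketch of the equivalent manual prefetch with `huggingface_hub` (the cache location is the library default, and the gated meta-llama repo would additionally need an access token; neither detail is part of this commit):

    # Sketch: prefetch the same checkpoints that preload_from_hub lists in README.md.
    from huggingface_hub import snapshot_download

    for repo_id in [
        "microsoft/phi-2",
        "BAAI/bge-small-en-v1.5",
        "HuggingFaceH4/zephyr-7b-alpha",
        "meta-llama/Meta-Llama-3-8B",
    ]:
        snapshot_download(repo_id=repo_id)  # downloads into the default HF cache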
app.py
CHANGED
@@ -20,12 +20,13 @@ CHEAPMODE = torch.cuda.is_available()
 # LLM = "HuggingFaceH4/zephyr-7b-alpha" if not CHEAPMODE else "microsoft/phi-2"
 
 config = {
-    "LLM": "meta-llama/Meta-Llama-3-8B",
-
+    # "LLM": "meta-llama/Meta-Llama-3-8B",
+    "LLM": "microsoft/phi-2",
+    # "LLM": "HuggingFaceH4/zephyr-7b-alpha",
     "embeddings": "BAAI/bge-small-en-v1.5",
     "similarity_top_k": 2,
     "context_window": 4048,
-    "max_new_tokens":
+    "max_new_tokens": 200,
     "temperature": 0.7,
     "top_k": 5,
     "top_p": 0.95,
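The `config` dict above (with phi-2 now the default LLM) is what `load_RAG_pipeline(config)` consumes further down in app.py. The body of that function is not part of this diff, so the following is only a sketch of how such a config could be mapped onto llama-index's `HuggingFaceLLM`; the import path assumes the llama-index 0.10+ package split (`llama-index-llms-huggingface`), and `device_map="auto"` is an assumption rather than something this commit sets:

    # Sketch only: build an LLM from the config dict; not the Space's actual load_RAG_pipeline.
    from llama_index.llms.huggingface import HuggingFaceLLM

    def build_llm(config):
        return HuggingFaceLLM(
            model_name=config["LLM"],
            tokenizer_name=config["LLM"],
            context_window=config["context_window"],
            max_new_tokens=config["max_new_tokens"],
            generate_kwargs={
                "temperature": config["temperature"],
                "top_k": config["top_k"],
                "top_p": config["top_p"],
                "do_sample": True,
            },
            device_map="auto",  # assumption; the Space may pin a device or use its CHEAPMODE logic
        )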
@@ -42,17 +43,17 @@ title = "Ask my thesis: Intelligent Automation for AI-Driven Document Understand
 title = center_element(title)
 description = """Chat with the thesis manuscript by asking questions and receive answers with reference to the page.
 
-<div class="
+<div class="center">
 <a href="https://jordy-vl.github.io/assets/phdthesis/VanLandeghem_Jordy_PhD-thesis.pdf">
 <img src="https://ideogram.ai/api/images/direct/cc3Um6ClQkWJpVdXx6pWVA.png"
-title="Thesis.pdf" alt="Ideogram image generated with prompt engineering"/></a>
-</div>
+title="Thesis.pdf" alt="Ideogram image generated with prompt engineering" width="500" class="center"/></a>
+</div> Click the visual above to be redirected to the PDF of the manuscript.
 
 Technology used: [Llama-index](https://www.llamaindex.ai/), OS LLMs from HuggingFace
 
-Spoiler: a RAG application with a >1B LLM and online vector store can be quite slow on a 290 page document ⏳
+Spoiler: a quickly hacked together RAG application with a >1B LLM and online vector store can be quite slow on a 290 page document ⏳ (10s+)
 """
-
+
 description = center_element(description)
 
 def messages_to_prompt(messages):
@@ -105,6 +106,7 @@ def load_RAG_pipeline(config):
     # Llama-index
     Settings.llm = llm
     Settings.embed_model = HuggingFaceEmbedding(model_name=config["embeddings"])
+    print(Settings)
     Settings.chunk_size = config["chunk_size"]
     Settings.chunk_overlap = config["chunk_overlap"]
 
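The hunk above sits inside `load_RAG_pipeline(config)`, which pushes the LLM, embedding model and chunking settings into llama-index's global `Settings` before building an index over the thesis PDF. The rest of the function is not shown in this diff, so the sketch below uses a plain in-memory `VectorStoreIndex` and a hypothetical `thesis.pdf` path; the Space's description mentions an online vector store instead, so treat this as the generic pattern, not the actual implementation:

    # Sketch of the usual llama-index setup around the Settings lines above.
    from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
    from llama_index.embeddings.huggingface import HuggingFaceEmbedding

    def build_query_engine(config, llm, pdf_path="thesis.pdf"):  # pdf_path is hypothetical
        Settings.llm = llm
        Settings.embed_model = HuggingFaceEmbedding(model_name=config["embeddings"])
        Settings.chunk_size = config["chunk_size"]
        Settings.chunk_overlap = config["chunk_overlap"]

        documents = SimpleDirectoryReader(input_files=[pdf_path]).load_data()
        index = VectorStoreIndex.from_documents(documents)
        return index.as_query_engine(similarity_top_k=config["similarity_top_k"])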
@@ -125,23 +127,16 @@ default_query_engine = load_RAG_pipeline(config)
 
 # These are placeholder functions to simulate the behavior of the RAG setup.
 # You would need to implement these with the actual logic to retrieve and generate answers based on the document.
-def get_answer(question,
+def get_answer(question, config, query_engine=default_query_engine):
     # Here you should implement the logic to generate an answer based on the question and the document.
     # For example, you could use a machine learning model for RAG.
     # answer = "This is a placeholder answer."
     # https://docs.llamaindex.ai/en/stable/module_guides/supporting_modules/settings/#setting-local-configurations
 
     # if temperature or nucleus sampling or max_tokens != as in config, recall query engine
-    if (
-        temperature != config["temperature"]
-        or nucleus_sampling != config["top_p"]
-        or max_tokens != config["max_new_tokens"]
-    ):
-        config["temperature"] = temperature
-        config["top_p"] = nucleus_sampling
-        config["max_new_tokens"] = max_tokens
-        query_engine = load_RAG_pipeline(config)
+
     response = query_engine.query(question)
+    print(f"A: {response}")
     return response
 
 
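With this change `get_answer` returns the raw llama-index `Response` object instead of a plain string, which is why the last hunk below unwraps `answer.response` for the textbox and passes the whole object to `get_answer_page`. A rough sketch of the fields involved; the `page_label` metadata key is an assumption about how `get_answer_page` (not shown in this diff) finds the page to display:

    # Sketch: what a llama-index Response exposes for the answer text and page lookup.
    response = default_query_engine.query("What is ANLS?")

    answer_text = response.response            # generated answer as a string
    for node_with_score in response.source_nodes:
        meta = node_with_score.node.metadata   # e.g. {"page_label": "42", "file_name": "..."}
        page = meta.get("page_label")          # assumption: the PDF loader stores page labels here
        print(page, node_with_score.score)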
@@ -156,32 +151,87 @@ def get_answer_page(response):
 
 # Create the gr.Interface function
 def ask_my_thesis(
-    question,
+    question,
+    LLM=config["LLM"],
+    embeddings=config["embeddings"],
+    similarity_top_k=config["similarity_top_k"],
+    context_window=config["context_window"],
+    max_new_tokens=config["max_new_tokens"],
+    temperature=config["temperature"],
+    top_k=config["top_k"],
+    top_p=config["top_p"],
+    chunk_size=config["chunk_size"],
+    chunk_overlap=config["chunk_overlap"],
 ):
     print(f"Got Q: {question}")
-
+    query_engine = default_query_engine
+
+    # if any change in kwargs
+    # Check if any of the kwargs have changed
+    if (
+        temperature != config["temperature"]
+        or top_p != config["top_p"]
+        or max_new_tokens != config["max_new_tokens"]
+        or LLM != config["LLM"]
+        or embeddings != config["embeddings"]
+        or similarity_top_k != config["similarity_top_k"]
+        or context_window != config["context_window"]
+        or top_k != config["top_k"]
+        or chunk_size != config["chunk_size"]
+        or chunk_overlap != config["chunk_overlap"]
+    ):
+        # Update the config dictionary with the new values
+        config["temperature"] = temperature
+        config["top_p"] = top_p
+        config["max_new_tokens"] = max_new_tokens
+        # config["LLM"] = LLM
+        # config["embeddings"] = embeddings
+        config["similarity_top_k"] = similarity_top_k
+        config["context_window"] = context_window
+        config["top_k"] = top_k
+        config["chunk_size"] = chunk_size
+        config["chunk_overlap"] = chunk_overlap
+        query_engine = load_RAG_pipeline(config)
+
+    answer = get_answer(question, config, query_engine=query_engine)
     image, answer_page = get_answer_page(answer)
-    return answer, image, answer_page
+    return answer.response, image, answer_page
 
 
 # Set up the interface options based on the design in the image.
 output_image = gr.Image(label="Answer Page")
 
 # examples
-examples = [
+examples = [
+    ["What model is state-of-the-art on DUDE?"],
+    ["Why is knowledge distillation interesting?"],
+    ["What is ANLS?"],
+]
+# Define additional Gradio input elements
+additional_inputs = [
+    # gr.Input("text", label="Question"),
+    # gr.Input("text", label="LLM", value=config["LLM"]),
+    # gr.Input("text", label="Embeddings", value=config["embeddings"]),
+    gr.Slider(1, 5, value=config["similarity_top_k"], label="Similarity Top K"),
+    gr.Slider(512, 8048, value=config["context_window"], label="Context Window"),
+    gr.Slider(20, 250, value=config["max_new_tokens"], label="Max New Tokens"),
+    gr.Slider(0, 1, value=config["temperature"], label="Temperature"),
+    gr.Slider(1, 10, value=config["top_k"], label="Top K"),
+    gr.Slider(0, 1, value=config["top_p"], label="Nucleus Sampling"),
+    gr.Slider(128, 4024, value=config["chunk_size"], label="Chunk Size"),
+    gr.Slider(0, 200, value=config["chunk_overlap"], label="Chunk Overlap"),
+]
 
 iface = gr.Interface(
     fn=ask_my_thesis,
     inputs=[gr.Textbox(label="Question", placeholder="Type your question here...")],
-    additional_inputs=
-        gr.Slider(0, 1, value=0.7, label="Temperature"),
-        gr.Slider(0, 1, value=0.95, label="Nucleus Sampling"),
-        gr.Slider(1, 500, value=150, label="Max Generated Number of Tokens"),
-    ],
+    additional_inputs=additional_inputs,
     outputs=[gr.Textbox(label="Answer"), output_image, gr.Label()],
+    examples=examples,
     title=title,
     description=description,
-    allow_flagging="
+    allow_flagging="auto",
+    cache_examples=True,
 )
 # https://github.com/gradio-app/gradio/issues/4309
 
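Two notes on the Gradio wiring in the last hunk. `gr.Interface` passes the values of `inputs` followed by `additional_inputs` to the function positionally, so with the LLM and embeddings textboxes commented out of `additional_inputs`, the first two sliders would bind to the `LLM` and `embeddings` keyword parameters of `ask_my_thesis`; the parameter order is worth double-checking. Also, `cache_examples=True` runs `ask_my_thesis` on every example at startup, which is expensive with a >1B model on CPU. The launch call itself is outside the hunks shown; a minimal sketch, assuming the default queueing behaviour:

    # Sketch; the Space's real launch line is not part of this diff.
    if __name__ == "__main__":
        iface.queue().launch()  # queue() serializes requests, which matters for slow generation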