jordyvl committed
Commit: 50a7785
Parent: 54bbae9

trying with llama3

Files changed (1)
  1. app.py +39 -16
app.py CHANGED
@@ -2,7 +2,7 @@
 # TODO: question samples
 # TEST: with and without GPU instance
 # TODO: visual questions on page image (in same app)?
-## locally check timings of start-up code and see if I cannot pass the parameters to creating vector engine
+# expose more parameters
 
 import torch
 from llama_index.llms.huggingface import HuggingFaceLLM
@@ -20,17 +20,41 @@ CHEAPMODE = torch.cuda.is_available()
 # LLM = "HuggingFaceH4/zephyr-7b-alpha" if not CHEAPMODE else "microsoft/phi-2"
 
 config = {
-    "LLM": "microsoft/phi-2",
+    "LLM": "meta-llama/Meta-Llama-3-8B",
+    # "LLM": "microsoft/phi-2",
     "embeddings": "BAAI/bge-small-en-v1.5",
     "similarity_top_k": 2,
-    "context_window": 2048,
+    "context_window": 4048,
     "max_new_tokens": 150,
     "temperature": 0.7,
    "top_k": 5,
    "top_p": 0.95,
+    "chunk_size": 512,
+    "chunk_overlap": 50,
 }
 
 
+def center_element(el):
+    return f"<div style='text-align: center;'>{el}</div>"
+
+
+title = "Ask my thesis: Intelligent Automation for AI-Driven Document Understanding"
+title = center_element(title)
+description = """Chat with the thesis manuscript by asking questions and receive answers with reference to the page.
+
+<div class="span1">
+    <a href="https://jordy-vl.github.io/assets/phdthesis/VanLandeghem_Jordy_PhD-thesis.pdf">
+        <img src="https://ideogram.ai/api/images/direct/cc3Um6ClQkWJpVdXx6pWVA.png"
+        title="Thesis.pdf" alt="Ideogram image generated with prompt engineering"/></a>
+</div>
+
+Technology used: [Llama-index](https://www.llamaindex.ai/), OS LLMs from HuggingFace
+
+Spoiler: a RAG application with a >1B LLM and online vector store can be quite slow on a 290 page document ⏳
+"""
+# width="250"
+description = center_element(description)
+
 def messages_to_prompt(messages):
     prompt = ""
     for message in messages:
@@ -54,7 +78,7 @@ def messages_to_prompt(messages):
 
 def load_RAG_pipeline(config):
     # LLM
-    quantization_config = None  # dirty fix for CPU/GPU support
+    quantization_config = None  # dirty fix for CPU/GPU support
     if torch.cuda.is_available():
         from transformers import BitsAndBytesConfig
 
@@ -81,17 +105,17 @@ def load_RAG_pipeline(config):
     # Llama-index
     Settings.llm = llm
     Settings.embed_model = HuggingFaceEmbedding(model_name=config["embeddings"])
-    # Settings.chunk_size = 512
-    # Settings.chunk_overlap = 50
+    Settings.chunk_size = config["chunk_size"]
+    Settings.chunk_overlap = config["chunk_overlap"]
 
     # raw data
     documents = SimpleDirectoryReader("assets/txts").load_data()
     vector_index = VectorStoreIndex.from_documents(documents)
+    # summary_index = SummaryIndex.from_documents(documents)
 
     # vector_index.persist(persist_dir="vectors")
     # https://docs.llamaindex.ai/en/v0.10.17/understanding/storing/storing.html
 
-    # summary_index = SummaryIndex.from_documents(documents)
     query_engine = vector_index.as_query_engine(response_mode="compact", similarity_top_k=config["similarity_top_k"])
     return query_engine
 
@@ -131,7 +155,9 @@ def get_answer_page(response):
 
 
 # Create the gr.Interface function
-def ask_my_thesis(question, temperature, nucleus_sampling, max_tokens):
+def ask_my_thesis(
+    question, temperature=config["temperature"], nucleus_sampling=config["top_p"], max_tokens=config["max_new_tokens"]
+):
     print(f"Got Q: {question}")
     answer = get_answer(question, temperature, nucleus_sampling, max_tokens)
     image, answer_page = get_answer_page(answer)
@@ -142,22 +168,19 @@ def ask_my_thesis(question, temperature, nucleus_sampling, max_tokens):
 output_image = gr.Image(label="Answer Page")
 
 # examples
+examples = [["Who is Jordy Van Landeghem"], []]
 
 iface = gr.Interface(
     fn=ask_my_thesis,
-    inputs=[
-        gr.Textbox(label="Question", placeholder="Type your question here..."),
+    inputs=[gr.Textbox(label="Question", placeholder="Type your question here...")],
+    additional_inputs=[
         gr.Slider(0, 1, value=0.7, label="Temperature"),
         gr.Slider(0, 1, value=0.95, label="Nucleus Sampling"),
         gr.Slider(1, 500, value=150, label="Max Generated Number of Tokens"),
     ],
     outputs=[gr.Textbox(label="Answer"), output_image, gr.Label()],
-    title="Ask my thesis: Intelligent Automation for AI-Driven Document Understanding",
-    description=r"""Chat with the thesis manuscript by asking questions and receive answers with multimodal references.
-
-    Spoiler: a RAG application with a >1B LLM and vector store can be quite slow on a 290 page document :hourglass:
-    """,
-    css="body { background-image: url('https://ideogram.ai/api/images/direct/cc3Um6ClQkWJpVdXx6pWVA.png'); }",
+    title=title,
+    description=description,
     allow_flagging="never",
 )
 # https://github.com/gradio-app/gradio/issues/4309
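
Editor's note on the model swap in the second hunk: `meta-llama/Meta-Llama-3-8B` is the base checkpoint, while the chat-style prompt built by `messages_to_prompt` (whose body is cut off in this diff) generally targets the `-Instruct` variant and its dedicated template. A hedged sketch of what a Llama-3-Instruct-style formatter could look like, in case the Instruct checkpoint is used; the actual function in app.py is not visible here and may differ:

```python
def llama3_messages_to_prompt(messages):
    # Hypothetical formatter following the Meta-Llama-3-*-Instruct chat template.
    prompt = "<|begin_of_text|>"
    for message in messages:
        # llama-index ChatMessage exposes .role (enum) and .content
        prompt += f"<|start_header_id|>{message.role.value}<|end_header_id|>\n\n{message.content}<|eot_id|>"
    # Cue the model to continue as the assistant.
    prompt += "<|start_header_id|>assistant<|end_header_id|>\n\n"
    return prompt
```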
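The third hunk keeps `quantization_config = None  # dirty fix for CPU/GPU support` and imports `BitsAndBytesConfig`, but the diff truncates before the GPU branch that presumably builds the config and hands it to `HuggingFaceLLM`. A minimal sketch of that wiring, assuming 4-bit NF4 quantization and reusing the values from the config dict above; the exact arguments in app.py are not shown in this commit:

```python
import torch
from transformers import BitsAndBytesConfig
from llama_index.llms.huggingface import HuggingFaceLLM

MODEL = "meta-llama/Meta-Llama-3-8B"  # from config["LLM"]; note the repo is gated on the Hub

# Hypothetical reconstruction of the GPU branch: 4-bit NF4 quantization, else no quantization on CPU.
quantization_config = None
if torch.cuda.is_available():
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

llm = HuggingFaceLLM(
    model_name=MODEL,
    tokenizer_name=MODEL,
    context_window=4048,   # from config["context_window"]
    max_new_tokens=150,    # from config["max_new_tokens"]
    generate_kwargs={"temperature": 0.7, "top_k": 5, "top_p": 0.95},
    model_kwargs={"quantization_config": quantization_config},
    device_map="auto",
)
```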
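The fourth hunk still carries the commented `# vector_index.persist(persist_dir="vectors")` and the llama-index storing docs link, so every launch re-embeds the full 290-page thesis (the start-up cost the deleted first-hunk comment complained about). A hedged sketch of how that could be cached with llama-index's storage context, assuming a local `vectors/` directory that this commit does not actually add:

```python
import os

from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)

PERSIST_DIR = "vectors"  # hypothetical local cache directory

if os.path.isdir(PERSIST_DIR):
    # Reload the previously built index instead of re-embedding all pages.
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    vector_index = load_index_from_storage(storage_context)
else:
    # First run: embed the thesis pages and persist the index to disk.
    documents = SimpleDirectoryReader("assets/txts").load_data()
    vector_index = VectorStoreIndex.from_documents(documents)
    vector_index.storage_context.persist(persist_dir=PERSIST_DIR)
```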