jordyvl committed
Commit 54bbae9
Parent: 0b94be0

Fixed README.md for preloading models when building; adjustable hyperparameters

Files changed (2):
  1. README.md +3 -0
  2. app.py +43 -16
README.md CHANGED
@@ -7,6 +7,9 @@ sdk: gradio
 sdk_version: 4.26.0
 app_file: app.py
 pinned: false
+preload_from_hub:
+- "microsoft/phi-2"
+- "BAAI/bge-small-en-v1.5"
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
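
The new `preload_from_hub` entries tell the Space build step to download `microsoft/phi-2` and `BAAI/bge-small-en-v1.5` into the local Hugging Face cache, so app.py no longer has to fetch them on first start-up. As a rough illustration (not part of the commit), a check along these lines should resolve both repos from the build-time cache without triggering a download:

# Hypothetical sanity check, not in the commit: with preload_from_hub in place,
# snapshot_download should return the cached snapshot path for each repo
# instead of downloading it when the app starts.
from huggingface_hub import snapshot_download

for repo_id in ("microsoft/phi-2", "BAAI/bge-small-en-v1.5"):
    local_path = snapshot_download(repo_id)
    print(f"{repo_id} -> {local_path}")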
app.py CHANGED
@@ -2,6 +2,7 @@
 # TODO: question samples
 # TEST: with and without GPU instance
 # TODO: visual questions on page image (in same app)?
+## locally check timings of start-up code and see if I cannot pass the parameters to creating vector engine
 
 import torch
 from llama_index.llms.huggingface import HuggingFaceLLM
@@ -14,8 +15,20 @@ from PIL import Image
 
 import gradio as gr
 
-CHEAPMODE=True
-LLM = "HuggingFaceH4/zephyr-7b-alpha" if not CHEAPMODE else "microsoft/phi-2"
+CHEAPMODE = torch.cuda.is_available()
+
+# LLM = "HuggingFaceH4/zephyr-7b-alpha" if not CHEAPMODE else "microsoft/phi-2"
+
+config = {
+    "LLM": "microsoft/phi-2",
+    "embeddings": "BAAI/bge-small-en-v1.5",
+    "similarity_top_k": 2,
+    "context_window": 2048,
+    "max_new_tokens": 150,
+    "temperature": 0.7,
+    "top_k": 5,
+    "top_p": 0.95,
+}
 
 
 def messages_to_prompt(messages):
@@ -39,7 +52,7 @@ def messages_to_prompt(messages):
     return prompt
 
 
-def load_RAG_pipeline():
+def load_RAG_pipeline(config):
     # LLM
     quantization_config = None # dirty fix for CPU/GPU support
     if torch.cuda.is_available():
@@ -53,45 +66,57 @@ def load_RAG_pipeline():
     )
 
     llm = HuggingFaceLLM(
-        model_name=LLM,
-        tokenizer_name=LLM,
+        model_name=config["LLM"],
+        tokenizer_name=config["LLM"],
         query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
-        context_window=3900,
-        max_new_tokens=256,
+        context_window=config["context_window"],
+        max_new_tokens=config["max_new_tokens"],
         model_kwargs={"quantization_config": quantization_config},
         # tokenizer_kwargs={},
-        generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
+        generate_kwargs={"temperature": config["temperature"], "top_k": config["top_k"], "top_p": config["top_p"]},
         messages_to_prompt=messages_to_prompt,
         device_map="auto",
     )
 
     # Llama-index
     Settings.llm = llm
-    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
+    Settings.embed_model = HuggingFaceEmbedding(model_name=config["embeddings"])
     # Settings.chunk_size = 512
     # Settings.chunk_overlap = 50
 
     # raw data
     documents = SimpleDirectoryReader("assets/txts").load_data()
     vector_index = VectorStoreIndex.from_documents(documents)
+
     # vector_index.persist(persist_dir="vectors")
     # https://docs.llamaindex.ai/en/v0.10.17/understanding/storing/storing.html
 
     # summary_index = SummaryIndex.from_documents(documents)
-    query_engine = vector_index.as_query_engine(response_mode="compact", similarity_top_k=3)
+    query_engine = vector_index.as_query_engine(response_mode="compact", similarity_top_k=config["similarity_top_k"])
     return query_engine
 
 
-query_engine = load_RAG_pipeline()
+default_query_engine = load_RAG_pipeline(config)
 
 
 # These are placeholder functions to simulate the behavior of the RAG setup.
 # You would need to implement these with the actual logic to retrieve and generate answers based on the document.
-def get_answer(question, temperature, nucleus_sampling, max_tokens):
+def get_answer(question, temperature, nucleus_sampling, max_tokens, query_engine=default_query_engine):
     # Here you should implement the logic to generate an answer based on the question and the document.
     # For example, you could use a machine learning model for RAG.
     # answer = "This is a placeholder answer."
     # https://docs.llamaindex.ai/en/stable/module_guides/supporting_modules/settings/#setting-local-configurations
+
+    # if temperature or nucleus sampling or max_tokens != as in config, recall query engine
+    if (
+        temperature != config["temperature"]
+        or nucleus_sampling != config["top_p"]
+        or max_tokens != config["max_new_tokens"]
+    ):
+        config["temperature"] = temperature
+        config["top_p"] = nucleus_sampling
+        config["max_new_tokens"] = max_tokens
+        query_engine = load_RAG_pipeline(config)
     response = query_engine.query(question)
     return response
 
@@ -107,6 +132,7 @@ def get_answer_page(response):
 
 # Create the gr.Interface function
 def ask_my_thesis(question, temperature, nucleus_sampling, max_tokens):
+    print(f"Got Q: {question}")
     answer = get_answer(question, temperature, nucleus_sampling, max_tokens)
     image, answer_page = get_answer_page(answer)
     return answer, image, answer_page
@@ -122,15 +148,16 @@ iface = gr.Interface(
     inputs=[
         gr.Textbox(label="Question", placeholder="Type your question here..."),
         gr.Slider(0, 1, value=0.7, label="Temperature"),
-        gr.Slider(0, 1, value=0.9, label="Nucleus Sampling"),
-        gr.Slider(1, 500, value=100, label="Max Generated Number of Tokens"),
+        gr.Slider(0, 1, value=0.95, label="Nucleus Sampling"),
+        gr.Slider(1, 500, value=150, label="Max Generated Number of Tokens"),
     ],
     outputs=[gr.Textbox(label="Answer"), output_image, gr.Label()],
     title="Ask my thesis: Intelligent Automation for AI-Driven Document Understanding",
-    description=r"""Chat with the thesis manuscript: ask questions and receive answers with multimodal references (WIP).
+    description=r"""Chat with the thesis manuscript by asking questions and receive answers with multimodal references.
 
-    Spoiler: RAG application with LLM and embedding vector store can be quite slow on a 290 page document ;D
+    Spoiler: a RAG application with a >1B LLM and vector store can be quite slow on a 290 page document :hourglass:
     """,
+    css="body { background-image: url('https://ideogram.ai/api/images/direct/cc3Um6ClQkWJpVdXx6pWVA.png'); }",
     allow_flagging="never",
 )
 # https://github.com/gradio-app/gradio/issues/4309
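
One side note on the new `get_answer` logic: whenever a slider value differs from `config`, it calls `load_RAG_pipeline(config)` again, which re-embeds and re-indexes all pages just to change sampling settings; that is the cost the new TODO about start-up timings points at. A lighter alternative is sketched below. It is a hypothetical helper, not part of the commit, and it assumes llama-index 0.10.x, where `Settings` lives in `llama_index.core` and `HuggingFaceLLM` exposes `max_new_tokens` and `generate_kwargs` as mutable fields:

# Hypothetical helper (not in the commit): update only the generation hyperparameters
# on the already-loaded HuggingFaceLLM and keep reusing the existing vector index,
# instead of rebuilding the whole RAG pipeline on every slider change.
from llama_index.core import Settings


def update_generation_params(temperature, nucleus_sampling, max_tokens):
    llm = Settings.llm  # the HuggingFaceLLM created in load_RAG_pipeline
    llm.max_new_tokens = int(max_tokens)
    llm.generate_kwargs.update({"temperature": temperature, "top_p": nucleus_sampling})
    # keep the module-level config dict in sync so get_answer's comparison still holds
    config.update({"temperature": temperature, "top_p": nucleus_sampling, "max_new_tokens": int(max_tokens)})

Combined with un-commenting the `vector_index.persist(persist_dir="vectors")` line and reloading the persisted index at start-up (as described in the linked llama-index storing docs), this would keep slider changes from paying the full indexing cost again.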