vinmay1234 committed on
Commit
94ffe87
1 Parent(s): 5dad372

Create app.py

Files changed (1)
app.py +334 -0
app.py ADDED
@@ -0,0 +1,334 @@
!pip install datasets transformers accelerate peft bitsandbytes

from datasets import load_dataset

# Load 70% of the Wikipedia dataset
# dataset = load_dataset('wikimedia/wikipedia', "20231101.en", split='train[:70%]')
dataset = load_dataset('lucadiliello/wikipedia_512_pretraining', split='train[:70%]')

# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
#
# # Define the quantization configuration for 4-bit
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,                # Enable 4-bit precision
#     bnb_4bit_quant_type="nf4",        # Use the NF4 quantization type (good for reducing memory)
#     bnb_4bit_use_double_quant=True,   # Enable double quantization to improve accuracy
#     bnb_4bit_compute_dtype="float16"  # Use float16 for faster computation
# )
#
# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
#
# # Load the model with the quantization configuration
# model = AutoModelForCausalLM.from_pretrained(
#     'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
#     quantization_config=quantization_config,  # Apply the 4-bit quantization config
#     device_map='auto'                         # Automatically map the model to available devices (GPU/CPU)
# )
#
# # Enable gradient checkpointing to reduce memory usage during training
# model.gradient_checkpointing_enable()


############################################ gpt2 ############################################
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Define the quantization configuration for 4-bit
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                # Enable 4-bit precision
    bnb_4bit_quant_type="nf4",        # Use the NF4 quantization type (good for reducing memory)
    bnb_4bit_use_double_quant=True,   # Enable double quantization to improve accuracy
    bnb_4bit_compute_dtype="float16"  # Use float16 for faster computation
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Load the model with the quantization configuration
model = AutoModelForCausalLM.from_pretrained(
    'gpt2',
    quantization_config=quantization_config,  # Apply the 4-bit quantization config
    device_map='auto'                         # Automatically map the model to available devices (GPU/CPU)
)

# Enable gradient checkpointing to reduce memory usage during training
model.gradient_checkpointing_enable()
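
# Optional illustrative sketch (not in the original app.py): when fine-tuning a
# quantized model with LoRA, PEFT's prepare_model_for_kbit_training() is commonly
# called before attaching adapters; it enables input gradients and casts a few
# layers for k-bit training stability.
#
# from peft import prepare_model_for_kbit_training
# model = prepare_model_for_kbit_training(model)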

from peft import LoraConfig, get_peft_model
import bitsandbytes as bnb

# Configure LoRA adapters on top of the 4-bit model
# (the q_proj/v_proj names apply to Llama-style models; GPT-2 uses c_attn/c_fc/c_proj)
# lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")
lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["attn.c_attn", "mlp.c_fc", "mlp.c_proj"], lora_dropout=0.05, bias="none")
peft_model = get_peft_model(model, lora_config)
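
# Optional illustrative sketch (not in the original app.py): a quick sanity check
# that only the LoRA adapters are trainable after wrapping the model.
#
# peft_model.print_trainable_parameters()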

# Set the pad token (using eos_token or adding a new special token)
if tokenizer.pad_token is None:
    # Option 1: Use eos_token as pad_token
    tokenizer.pad_token = tokenizer.eos_token

    # Option 2: Add [PAD] as a new pad token if needed
    # tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize the dataset with optimized settings
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=150)

tokenized_dataset = dataset.select(range(100000)).map(tokenize_function, batched=True)

def prepare_labels(batch):
    batch["labels"] = batch["input_ids"].copy()  # Copy input_ids as labels for language modeling
    return batch

# Apply the transformation to add labels
tokenized_dataset = tokenized_dataset.map(prepare_labels, batched=True)
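
# Optional illustrative sketch (not in the original app.py): the script prepares
# the quantized GPT-2, the LoRA adapters, and a labeled tokenized dataset, but
# does not include a training step. One way these pieces could be wired into
# transformers.Trainer; the output directory and hyperparameters are illustrative
# assumptions only.
#
# from transformers import Trainer, TrainingArguments
#
# training_args = TrainingArguments(
#     output_dir="gpt2-lora-wikipedia",  # hypothetical output path
#     per_device_train_batch_size=8,
#     num_train_epochs=1,
#     fp16=True,
#     logging_steps=100,
# )
# trainer = Trainer(model=peft_model, args=training_args, train_dataset=tokenized_dataset)
# trainer.train()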

# Step 1: Install FAISS for the vector database
!pip install faiss-cpu
!pip install faiss-gpu
!pip install sentence_transformers

import torch
from datasets import Dataset
from transformers import AutoModel, AutoTokenizer
import faiss
import numpy as np
from tqdm import tqdm  # Import tqdm for the progress bar

# Load the embedding tokenizer and model
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
embedding_model = AutoModel.from_pretrained(embedding_model_name)
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_model.to(device)

# Function to generate embeddings in batches
def embed_text_batch(texts, batch_size=16):
    all_embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch_texts = texts[i:i + batch_size]

        # Tokenize and move inputs to the GPU
        inputs = embedding_tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt").to(device)

        with torch.no_grad():
            # Generate embeddings (mean pooling) and move them back to the CPU
            embeddings = embedding_model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()

        all_embeddings.extend(embeddings)

    return np.array(all_embeddings)

# Step 1: Process the dataset in batches
texts = tokenized_dataset["text"]
batch_size = 16  # Adjust based on Colab memory
embeddings = embed_text_batch(texts, batch_size=batch_size)

# Step 2: Add embeddings as a new column to the dataset
tokenized_dataset = tokenized_dataset.add_column("embeddings", embeddings.tolist())

# Step 3: Create the FAISS index
dimension = embeddings.shape[1]  # Dimension of the embeddings
faiss_index = faiss.IndexFlatL2(dimension)

# Step 4: Add the embeddings to the FAISS index
faiss_index.add(embeddings)

# Step 5: Save the dataset and the FAISS index
tokenized_dataset.save_to_disk("wikipedia_dataset_with_embeddings")
faiss.write_index(faiss_index, "wikipedia_faiss.index")

print("FAISS index and dataset saved successfully.")

def embed_query(query):
    # Tokenize and embed the query
    inputs = embedding_tokenizer([query], padding=True, truncation=True, return_tensors="pt").to(device)

    with torch.no_grad():
        query_embedding = embedding_model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()

    return query_embedding

def search_faiss(query_embedding, faiss_index, top_k=5):
    # Search the FAISS index
    distances, indices = faiss_index.search(query_embedding, top_k)
    return distances, indices

def get_top_answer(indices, dataset):
    # Retrieve the top answer from the dataset based on the indices
    return dataset["text"][indices[0][0]]  # Assumes the top result; adjust for more answers
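
# Optional illustrative sketch (not in the original app.py): chaining the three
# retrieval helpers above; the query string is hypothetical.
#
# query_embedding = embed_query("What is the capital of France?")
# distances, indices = search_faiss(query_embedding, faiss_index, top_k=5)
# top_answer = get_top_answer(indices, tokenized_dataset)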

import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForSeq2SeqLM
import faiss
import numpy as np

# Assuming embeddings and faiss_index are already created as above

# Load the pre-trained LLM for generation (you can replace it with a different one)
llm_model_name = "facebook/bart-large-cnn"  # Example: you can use BART, T5, etc.
llm_model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name)
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_name)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
llm_model.to(device)

# Embedding model used for creating the vector database (the same one used to embed the dataset)
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)
embedding_model.to(device)

# Function to embed a query (same as before)
def embed_query(query):
    inputs = embedding_tokenizer([query], padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        query_embedding = embedding_model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()
    return query_embedding

# Function to search the FAISS index and retrieve the top k results
def search_faiss(query_embedding, faiss_index, top_k=5):
    distances, indices = faiss_index.search(query_embedding, top_k)
    return distances, indices

# Function to generate an answer using the LLM based on the retrieved documents
def generate_answer(query, retrieved_texts):
    # Combine the query and the retrieved texts into a single input
    context = " ".join(retrieved_texts)
    input_text = f"Question: {query}\nContext: {context}\nAnswer:"

    # Tokenize and pass to the LLM
    inputs = llm_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    with torch.no_grad():
        generated_ids = llm_model.generate(inputs['input_ids'], max_length=150)

    # Decode the generated response
    answer = llm_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return answer

# Function to retrieve the texts from the dataset based on FAISS index results
def get_retrieved_texts(indices, dataset, top_k=5):
    retrieved_texts = []
    for idx in indices[0][:top_k]:  # Take the top K results
        retrieved_texts.append(dataset['text'][idx])  # Assumes 'text' is the relevant field in the dataset
    return retrieved_texts

# End-to-end retrieval-augmented generation pipeline
def rag_pipeline(question, faiss_index, dataset, top_k=3):
    # Step 1: Embed the query
    query_embedding = embed_query(question)

    # Step 2: Search the FAISS index for the top K similar documents
    distances, indices = search_faiss(query_embedding, faiss_index, top_k=top_k)

    # Step 3: Retrieve the top K relevant documents from the dataset
    retrieved_texts = get_retrieved_texts(indices, dataset, top_k=top_k)

    # Step 4: Generate the answer using the retrieved texts and the LLM
    answer = generate_answer(question, retrieved_texts)

    return answer
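
# Optional illustrative sketch (not in the original app.py): calling the RAG
# pipeline defined above; the question string is hypothetical.
#
# print(rag_pipeline("Why is the sky blue?", faiss_index, tokenized_dataset, top_k=3))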

!pip install ollama langchain-community
!ollama pull llama2
# Note: the langchain_community Ollama wrapper talks to a locally running Ollama
# server, so the `ollama` binary must be installed and serving before this runs.

# Import the necessary modules
from langchain_community.llms import Ollama

# Load the Ollama model
gen_model = Ollama(model="llama2")

# Define a function to get predefined responses for specific queries
def get_predefined_response(question):
    predefined_responses = {
        "hi": "Hello! How can I assist you today?",
        "hello": "Hi there! 😊 What can I help you with?",
        "who made you?": "I was created by Vinmay and his team.",
        "what is your purpose?": "I'm here to assist you with educational queries and provide information.",
        # Add more predefined responses as needed
    }

    # Normalize the question to make matching case-insensitive
    normalized_question = question.lower()

    return predefined_responses.get(normalized_question, None)

# Modify the generate_response function to check for predefined responses
def generate_response(markdown, question, user_instructions=None, max_new_tokens=250, temperature=0.9, top_p=0.95):
    # Check for a predefined response first
    predefined_response = get_predefined_response(question)
    if predefined_response:
        return predefined_response

    instruction_text = f" Please follow these instructions: {user_instructions}" if user_instructions else ""

    prompt = (
        f"Using the provided context, please generate a unique and insightful answer that directly addresses the question:\n\n"
        f"Context:\n{markdown}\n\n"
        f"Question: {question}\n"
        f"{instruction_text}\n"
        f"If a personal query is asked, refer to {predefined_response} and, based upon it, generate your own answer.\n"
        f"Please synthesize your response by integrating the information with your own understanding: "
    )

    # Call the Ollama model using the `invoke` method
    response = gen_model.invoke(prompt, max_tokens=max_new_tokens, temperature=temperature, top_p=top_p)

    # Check if the response is a string (direct generated text) or a dictionary (with metadata)
    if isinstance(response, str):
        return response  # Return the raw text if it's a string
    elif isinstance(response, dict) and "choices" in response:
        return response["choices"][0]["text"]  # Extract the text from the structured response
    else:
        return "Unexpected response format."

# # Example usage
# markdown = "The sky appears blue due to the scattering of light by the atmosphere."
# question = "Hi"
# response = generate_response(markdown, question)

# print(f"Model Response: {response}")

import gradio as gr
from langchain_community.llms import Ollama

# Load the Ollama model
gen_model = Ollama(model="llama2")

# Define the manual responses
manual_responses = {
    "hi": "Hello! How can I assist you today?",
    "hello": "Hi there! What would you like to know?",
    "who made you?": "I was created by OpenAI.",
    "what is your purpose?": "I'm here to assist with educational queries!"
}

# Function to generate responses
def generate_response(user_input):
    # Normalize user input for matching
    normalized_input = user_input.lower().strip()

    # Check for manual responses
    if normalized_input in manual_responses:
        return manual_responses[normalized_input]

    # For other questions, generate a response using the model
    prompt = f"Please provide a detailed answer to the following question:\n\nQuestion: {user_input}\n"

    response = gen_model.invoke(prompt)
    return response.strip()

# Create the Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Ask a Question"),
    outputs=gr.Textbox(label="Response"),
    title="Q&A System",
    description="Ask me anything and I will respond accordingly."
)

# Launch the Gradio app
if __name__ == "__main__":
    iface.launch(share=True, inline=False)  # Use share=True to make it public if needed