FridayMaster committed on
Commit
3d00632
·
verified ·
1 Parent(s): b4ca8bc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -21
app.py CHANGED
@@ -1,32 +1,78 @@
1
-
2
- import gradio as gr
3
- from transformers import AutoModelForCausalLM, AutoTokenizer
 
4
  import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
# Load the model and tokenizer
model_name = 'FridayMaster/fine_tune_embedding'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)  # Use the appropriate class


# Define a function to generate responses
def generate_response(prompt):
    """Generate a text reply for ``prompt`` with the module-level model.

    Args:
        prompt: The user message to tokenize and feed to ``model.generate``.

    Returns:
        The first generated sequence decoded to a string with special
        tokens removed.
    """
    # Tokenize the input prompt (truncated to at most 512 tokens).
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        # Pass the attention mask along with the ids so padded positions are
        # ignored, and bound the number of *new* tokens: a total-length cap
        # of 150 would leave no room for output once the prompt itself is
        # longer than 150 tokens.
        outputs = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=150,
            num_return_sequences=1,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response
20
 
21
# Build the Gradio UI: one text box in, one text box out.
_message_box = gr.Textbox(label="Enter your message", placeholder="Type something here...")
_reply_box = gr.Textbox(label="Response")

iface = gr.Interface(
    fn=generate_response,
    inputs=_message_box,
    outputs=_reply_box,
    title="Chatbot Interface",
    description="Interact with the fine-tuned chatbot model.",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
 
 
1
+ import pandas as pd
2
+ import fitz # PyMuPDF for PDF extraction
3
+ import spacy
4
+ from langchain.vectorstores import FAISS
5
  import torch
6
+ from transformers import AutoTokenizer, AutoModel
7
+ import gradio as gr
8
+
9
+ # Load and preprocess PDF text
10
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        One string holding the text of all pages in page order (an empty
        string when the document has no pages).
    """
    with fitz.open(pdf_path) as pdf_document:
        # Iterate the pages directly and join once, instead of indexing by
        # page number and growing a string page by page.
        return "".join(page.get_text() for page in pdf_document)
17
+
18
# Read the whole manual into one string.
pdf_text = extract_text_from_pdf(
    pdf_path='Getting_Started_with_Ubuntu_16.04.pdf',  # Replace with your PDF path
)

# Hold the text in a single-row DataFrame with one 'text' column.
df = pd.DataFrame(data={'text': [pdf_text]})
23
+
24
+ # Define your custom embedding model
25
class CustomEmbeddingModel:
    """Text embedder that mean-pools a Hugging Face encoder's hidden states."""

    def __init__(self, model_name):
        """Load the tokenizer and encoder identified by ``model_name``.

        Args:
            model_name: Checkpoint name or path handed to
                ``AutoTokenizer.from_pretrained`` and
                ``AutoModel.from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def embed_text(self, text):
        """Embed ``text`` and return the vector of the first sequence.

        The input is tokenized with padding and truncation to 512 tokens,
        run through the encoder without gradient tracking, and the last
        hidden states are averaged over the token axis.

        Args:
            text: The text to embed, passed to the tokenizer as-is.

        Returns:
            A NumPy array holding the pooled embedding of the first
            sequence in the batch.
        """
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            hidden = self.model(**inputs).last_hidden_state
            mask = inputs.get("attention_mask")
            if mask is None:
                # No mask available: fall back to a plain mean over tokens.
                pooled = hidden.mean(dim=1)
            else:
                # Weight by the attention mask so padding positions do not
                # dilute the average when sequences are padded.
                weights = mask.unsqueeze(-1).to(hidden.dtype)
                token_counts = weights.sum(dim=1).clamp(min=1)
                pooled = (hidden * weights).sum(dim=1) / token_counts
        return pooled[0].numpy()
35
 
36
# Shared embedder instance; replace the checkpoint with your model name.
embedding_model = CustomEmbeddingModel(model_name='distilbert-base-uncased')

# spaCy English pipeline used by the preprocessing step.
nlp = spacy.load("en_core_web_sm")
40
+
41
def preprocess_text(text):
    """Reduce ``text`` to space-separated, lower-cased lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``; only
    alphabetic tokens that are not stop words are kept.

    Stop words are detected with spaCy's ``token.is_stop`` flag. The earlier
    ``stopwords.words('english')`` lookup used a name that none of the
    visible imports provide, and it would also have rebuilt the stop-word
    list for every token.

    Args:
        text: Raw input text.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        no token survives the filter).
    """
    doc = nlp(text)
    lemmas = [
        token.lemma_.lower()
        for token in doc
        if token.is_alpha and not token.is_stop
    ]
    return ' '.join(lemmas)
45
+
46
# Apply preprocessing and embedding
df['text'] = df['text'].apply(preprocess_text)
df['text_embeddings'] = df['text'].apply(embedding_model.embed_text)

# Create FAISS vector store.
# The vectors are already computed, so each text is paired with its vector
# and handed to ``FAISS.from_embeddings``. ``FAISS.from_documents`` expects
# Document objects plus an embedding object, not raw strings and a list of
# vectors.
documents = df['text'].tolist()
embeddings = df['text_embeddings'].tolist()
vector_store = FAISS.from_embeddings(
    text_embeddings=[
        (text, vector.tolist()) for text, vector in zip(documents, embeddings)
    ],
    # NOTE(review): LangChain prefers an ``Embeddings`` instance here; a
    # plain callable is accepted by the releases checked but logs a warning.
    embedding=embedding_model.embed_text,
)


# Function to generate a response
def generate_response(query):
    """Return the stored text closest to ``query``.

    The query goes through the same preprocessing and embedding as the
    indexed text, then the single nearest entry is looked up by vector.

    Args:
        query: The user's question as plain text.

    Returns:
        The text of the nearest stored document, or
        ``"No relevant information found."`` when the search returns
        nothing.
    """
    preprocessed_query = preprocess_text(query)
    query_embedding = embedding_model.embed_text(preprocessed_query)
    # Search by vector: the query is already embedded, and the store's
    # generic ``search`` method takes a query string and a search type.
    matches = vector_store.similarity_search_by_vector(
        query_embedding.tolist(), k=1  # k=1 for the closest document
    )
    if matches:
        response = matches[0].page_content
    else:
        response = "No relevant information found."
    return response
66
 
67
# Build the Gradio UI: a query box in, a response box out.
_query_box = gr.Textbox(label="Enter your query", placeholder="Ask about Ubuntu...")
_response_box = gr.Textbox(label="Response")

iface = gr.Interface(
    fn=generate_response,
    inputs=_query_box,
    outputs=_response_box,
    title="Ubuntu Manual Chatbot",
    description="Ask questions about the Ubuntu manual.",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
78
+