IntelliLearn / utils.py
Narayana02's picture
Rename utilities.py to utils.py
6f169ed verified
raw
history blame
2.38 kB
import PyPDF2
import json
import networkx as nx
from sentence_transformers import SentenceTransformer, util
import openai
# Sentence-embedding model used by hybrid_retrieval() to encode both the
# query and the candidate sections. It is instantiated once, at import time,
# so importing this module constructs the "all-MiniLM-L6-v2" model.
model = SentenceTransformer("all-MiniLM-L6-v2")
# 1. Extract Text from PDF
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages, in page order, separated by newlines. A page
        from which no text could be extracted contributes an empty string.
    """
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # Guard against a page yielding None/"" (e.g. a page without a text
        # layer): concatenating None to a str would raise TypeError.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # Join with a newline so the last line of one page does not fuse with the
    # first line of the next; otherwise a line-anchored marker such as a
    # "Chapter ..." heading at the top of a page would be hidden mid-line.
    return "\n".join(page_texts)
# 2. Build Hierarchical Tree
def build_hierarchical_tree(text, textbook_title):
    """Group the lines of a text into chapters and their sections.

    A line whose stripped form starts with "Chapter" opens a new chapter.
    Every later non-blank line (stripped) is recorded as a section of the
    most recently opened chapter. Lines that appear before the first chapter
    heading are ignored.

    Args:
        text: Raw text, with lines separated by "\\n".
        textbook_title: Title stored at the root of the tree.

    Returns:
        A dict of the form
        {"title": textbook_title,
         "chapters": [{"title": str, "sections": [str, ...]}, ...]}.
    """
    chapters = []
    active = None  # chapter currently collecting sections, if any
    for raw_line in text.split("\n"):
        stripped = raw_line.strip()
        if stripped.startswith("Chapter"):
            active = {"title": stripped, "sections": []}
            chapters.append(active)
            continue
        if active and stripped:
            active["sections"].append(stripped)
    return {"title": textbook_title, "chapters": chapters}
def save_tree(tree, path):
    """Write the hierarchical tree to ``path`` as JSON.

    Args:
        tree: JSON-serializable tree structure to store.
        path: Destination file path; an existing file is overwritten.

    The output is pretty-printed with a four-space indent.
    """
    with open(path, "w") as out_file:
        json.dump(obj=tree, fp=out_file, indent=4)
# 3. Hybrid Retrieval
def hybrid_retrieval(
    query,
    openai_api_key,
    tree_path="hierarchical_trees/example_tree.json",
    top_k=3,
):
    """Return the stored sections most similar to a query.

    Loads a hierarchical tree from a JSON file, flattens the sections of all
    chapters into one list, embeds the query and the sections with the
    module-level ``model``, and ranks the sections by cosine similarity.

    Args:
        query: The question or search text.
        openai_api_key: Not used by this function; kept so existing callers
            that pass it keep working.
        tree_path: Path of the tree JSON file to search. Defaults to the
            previously hard-coded example tree.
        top_k: Maximum number of sections to return. Defaults to 3.

    Returns:
        The best-matching sections joined by single spaces, or an empty
        string when the tree contains no sections or ``top_k`` is not
        positive.
    """
    with open(tree_path) as f:
        tree = json.load(f)
    all_sections = [
        section for chapter in tree["chapters"] for section in chapter["sections"]
    ]
    # Nothing to rank: avoid encoding an empty list and calling topk on it.
    if not all_sections or top_k <= 0:
        return ""
    query_embedding = model.encode(query, convert_to_tensor=True)
    section_embeddings = model.encode(all_sections, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(query_embedding, section_embeddings)
    # topk raises if k exceeds the number of candidates, so clamp it.
    k = min(top_k, len(all_sections))
    top_indices = similarities[0].topk(k).indices.tolist()
    return " ".join(all_sections[i] for i in top_indices)
# 4. RAG Answer Generation
def rag_answer(query, context, openai_api_key):
    """Generate an answer to a question using Retrieval-Augmented Generation.

    Builds a prompt from the retrieved context and the question, sends it to
    the OpenAI completions endpoint, and returns the generated text.

    Args:
        query: The user's question.
        context: Retrieved text the answer should be based on.
        openai_api_key: OpenAI API key used for the request.

    Returns:
        The text of the first completion choice, stripped of surrounding
        whitespace.
    """
    # Kept for backward compatibility: the original set the module-level key.
    openai.api_key = openai_api_key
    prompt = f"Answer the question based on the context below:\n\nContext: {context}\n\nQuestion: {query}\n\nAnswer:"
    # "text-davinci-003" has been retired by OpenAI; "gpt-3.5-turbo-instruct"
    # is its designated replacement on the completions endpoint.
    model_name = "gpt-3.5-turbo-instruct"
    if hasattr(openai, "OpenAI"):
        # openai>=1.0 removed openai.Completion; requests go through a client.
        client = openai.OpenAI(api_key=openai_api_key)
        response = client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=150,
        )
    else:
        # Legacy (<1.0) SDK; `model=` supersedes the deprecated `engine=`.
        response = openai.Completion.create(
            model=model_name,
            prompt=prompt,
            max_tokens=150,
        )
    return response.choices[0].text.strip()