# Spaces: Sleeping
# (Page-header residue from the web extraction; not part of the program.)
import json

import networkx as nx
import openai
import PyPDF2
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model used for retrieval; instantiated once when the
# module is imported.
model = SentenceTransformer("all-MiniLM-L6-v2")
# 1. Extract Text from PDF
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF as a single string.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # Join inside the ``with`` so pages are read while the file is open.
        # ``or ""`` guards against a page yielding None instead of a string
        # (NOTE(review): reported for some PyPDF2 versions / image-only
        # pages -- confirm against the installed version).
        return "".join(page.extract_text() or "" for page in reader.pages)
# 2. Build Hierarchical Tree
def build_hierarchical_tree(text, textbook_title):
    """Build a chapter -> sections outline from raw textbook text.

    The text is split on newline characters and each line is stripped of
    surrounding whitespace. A line that starts with ``"Chapter"`` opens a new
    chapter; every other non-blank line is appended to the sections of the
    most recently opened chapter. Non-blank lines that appear before the
    first chapter heading are discarded.

    Args:
        text: Full text of the textbook.
        textbook_title: Title stored at the root of the tree.

    Returns:
        A dict of the form
        ``{"title": textbook_title,
           "chapters": [{"title": str, "sections": [str, ...]}, ...]}``.
    """
    tree = {"title": textbook_title, "chapters": []}
    current_chapter = None
    for raw_line in text.split("\n"):
        line = raw_line.strip()  # strip once; reused by every check below
        if line.startswith("Chapter"):
            current_chapter = {"title": line, "sections": []}
            tree["chapters"].append(current_chapter)
        elif current_chapter is not None and line:
            current_chapter["sections"].append(line)
    return tree
def save_tree(tree, path):
    """Write the hierarchical tree to ``path`` as JSON indented by 4 spaces.

    Args:
        tree: JSON-serialisable tree structure to store.
        path: Destination file path; an existing file is overwritten.
    """
    # Explicit encoding keeps the output independent of the platform default.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(tree, f, indent=4)
# 3. Hybrid Retrieval
def hybrid_retrieval(
    query,
    openai_api_key,
    tree_path="hierarchical_trees/example_tree.json",
    top_k=3,
):
    """Return the stored sections most similar to ``query``.

    Loads a hierarchical tree from ``tree_path``, flattens the sections of
    all chapters into one list, embeds the query and every section with the
    module-level ``model``, and ranks the sections by cosine similarity to
    the query. Only embedding similarity is used for ranking.

    Args:
        query: Question or search string.
        openai_api_key: Accepted for interface compatibility; not used here.
        tree_path: JSON file holding a tree with a ``"chapters"`` list whose
            items each have a ``"sections"`` list.
        top_k: Maximum number of sections to return.

    Returns:
        The best-matching sections joined with single spaces, most similar
        first, or ``""`` when the tree has no sections or ``top_k`` is not
        positive.
    """
    with open(tree_path, encoding="utf-8") as f:
        tree = json.load(f)
    all_sections = [
        section for chapter in tree["chapters"] for section in chapter["sections"]
    ]
    # Nothing to rank: return before invoking the embedding model.
    if not all_sections or top_k <= 0:
        return ""
    query_embedding = model.encode(query, convert_to_tensor=True)
    section_embeddings = model.encode(all_sections, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(query_embedding, section_embeddings)
    # topk() raises if asked for more items than exist, so clamp k.
    k = min(top_k, len(all_sections))
    top_indices = similarities[0].topk(k).indices.tolist()
    return " ".join(all_sections[i] for i in top_indices)
# 4. RAG Answer Generation
def rag_answer(
    query,
    context,
    openai_api_key,
    model_name="gpt-3.5-turbo-instruct",
    max_tokens=150,
):
    """Generate an answer to ``query`` grounded in ``context``.

    Sends one prompt containing the context and the question to the OpenAI
    completions endpoint and returns the first completion.

    Args:
        query: The user's question.
        context: Retrieved text the answer should be based on.
        openai_api_key: API key used to authenticate the request.
        model_name: Completion model to use. The default replaces the retired
            ``text-davinci-003`` (NOTE(review): confirm the model is enabled
            for the account in use).
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The text of the first completion choice, stripped of surrounding
        whitespace.
    """
    prompt = f"Answer the question based on the context below:\n\nContext: {context}\n\nQuestion: {query}\n\nAnswer:"
    # Kept so callers relying on the module-level key keep working.
    openai.api_key = openai_api_key
    if hasattr(openai, "OpenAI"):
        # openai>=1.0: the legacy ``openai.Completion`` class no longer exists.
        client = openai.OpenAI(api_key=openai_api_key)
        response = client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=max_tokens,
        )
    else:
        # Pre-1.0 SDK.
        response = openai.Completion.create(
            model=model_name,
            prompt=prompt,
            max_tokens=max_tokens,
        )
    return response.choices[0].text.strip()