import PyPDF2
import json
from sentence_transformers import SentenceTransformer, util
import openai

# Model for embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")

# 1. Extract Text from PDF
def extract_text_from_pdf(file_path):
    """Extract text from a PDF."""
    text = ""
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text() or ""  # extract_text() can return None for image-only pages
    return text

# 2. Build Hierarchical Tree
def build_hierarchical_tree(text, textbook_title):
    """Create a hierarchical tree structure."""
    lines = text.split("\n")
    tree = {"title": textbook_title, "chapters": []}
    current_chapter = None

    for line in lines:
        if line.strip().startswith("Chapter"):
            current_chapter = {"title": line.strip(), "sections": []}
            tree["chapters"].append(current_chapter)
        elif current_chapter and line.strip():
            current_chapter["sections"].append(line.strip())
    return tree
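
# For reference, build_hierarchical_tree produces a structure of the form below
# (the chapter and section values are illustrative, not taken from a real textbook):
# {
#     "title": "Example Textbook",
#     "chapters": [
#         {"title": "Chapter 1: Introduction", "sections": ["First line of text...", "..."]},
#         {"title": "Chapter 2: ...", "sections": ["..."]}
#     ]
# }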

def save_tree(tree, path):
    """Save the hierarchical tree."""
    with open(path, "w") as f:
        json.dump(tree, f, indent=4)

# 3. Hybrid Retrieval
def hybrid_retrieval(query, tree_path="hierarchical_trees/example_tree.json"):
    """Retrieve the most relevant sections for a query via embedding similarity."""
    with open(tree_path) as f:
        tree = json.load(f)

    all_sections = [
        section for chapter in tree["chapters"] for section in chapter["sections"]
    ]
    query_embedding = model.encode(query, convert_to_tensor=True)
    section_embeddings = model.encode(all_sections, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(query_embedding, section_embeddings)

    top_k = min(3, len(all_sections))  # guard against trees with fewer than 3 sections
    top_indices = similarities[0].topk(top_k).indices.tolist()
    return " ".join(all_sections[i] for i in top_indices)

# 4. RAG Answer Generation
def rag_answer(query, context, openai_api_key):
    """Generate an answer using Retrieval-Augmented Generation."""
    # The legacy Completion endpoint and text-davinci-003 are retired;
    # use the openai>=1.0 client with a current chat model instead.
    client = openai.OpenAI(api_key=openai_api_key)
    prompt = (
        "Answer the question based on the context below:\n\n"
        f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    )
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=150,
    )
    return response.choices[0].message.content.strip()
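
# Example end-to-end usage. This is a minimal sketch: the PDF path, textbook
# title, tree path, sample query, and the OpenAI API key are placeholders,
# not values from the original script.
if __name__ == "__main__":
    raw_text = extract_text_from_pdf("textbooks/example.pdf")
    tree = build_hierarchical_tree(raw_text, "Example Textbook")
    save_tree(tree, "hierarchical_trees/example_tree.json")

    question = "What topics does Chapter 1 cover?"
    context = hybrid_retrieval(question)
    print(rag_answer(question, context, openai_api_key="YOUR_OPENAI_API_KEY"))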