File size: 3,516 Bytes
b35a32c
 
 
 
 
 
 
 
a1e6c4a
 
 
b35a32c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1e6c4a
b35a32c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1e6c4a
 
 
 
 
b35a32c
a1e6c4a
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from llama_cpp import Llama

def load_llm_model():
    """Instantiate the local Llama model from its GGUF weights file.

    Returns the ready ``Llama`` instance.  On failure (e.g. the model
    file is missing) the error is printed and the exception re-raised.
    """
    try:
        model = Llama(
            model_path="Llama-3.2-1B-Instruct-Q8_0.gguf",
            n_gpu_layers=-1,  # -1 offloads every layer to the GPU when available
            n_ctx=100000,     # large context window for whole-paper prompts
            n_batch=4096,
        )
    except Exception as err:
        print(f"Error loading LLM model: {err}")
        raise
    print("LLM model loaded successfully")
    return model

def get_text_from_pdf(file):
    """Extract a paper's body text from a PDF, stopping at the references.

    Loads the PDF, splits it into overlapping 250-character chunks, and
    concatenates chunk contents until a chunk that starts with
    "REFERENCES" is reached.  The result is capped at 10,000 characters
    to bound the downstream prompt size.

    :param file: path (or file-like object) accepted by ``PyPDFLoader``.
    :return: concatenated body text, at most 10,000 characters.
    """
    loader = PyPDFLoader(file)
    pages = loader.load_and_split()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
    texts = text_splitter.split_documents(pages)
    # Collect chunk contents up to (but not including) the references
    # section.  ``str.join`` replaces the original quadratic ``+=``
    # concatenation.  The original's second pass — which iterated the
    # assembled string character-by-character and tested each single
    # character with startswith("REFERENCES") — could never match and
    # merely copied the string in O(n^2); it was dead code and has been
    # removed (output is unchanged).
    parts = []
    for chunk in texts:
        if chunk.page_content.startswith("REFERENCES"):
            break
        parts.append(chunk.page_content)
    research_paper = "".join(parts)
    return research_paper[:10000]

def generate_prompt(research_paper):
    """Build the mind-map generation prompt around *research_paper*.

    The template text — including its literal ``\\"`` and ``\\n`` escape
    sequences — is reproduced exactly as the downstream model expects.
    """
    return f'''

    As a text script expert, please help me to write a short text script with the topic \\"{research_paper}\\".Your output should only and strictly use the following template:\\n# {{Title}}\\n## {{Subtitle01}}\\n- {{Emoji01}} Bulletpoint01\\n- {{Emoji02}} Bulletpoint02\\n## {{Subtitle02}}\\n- {{Emoji03}} Bulletpoint03\\n- {{Emoji04}} Bulletpoint04\\n\\nSummarize the giving topic to generate a mind map (as many subtitles as possible, with a minimum of three subtitles) structure markdown.\\n Do not include anything in the response, that is not the part of mindmap.\\n  Importantly your output must use language \\"English\\""

    '''

def generate_mindmap_structure(llm, prompt):
    """Ask the model to produce mind-map markdown for *prompt*.

    Sends one system turn plus the user *prompt* to the Llama chat
    endpoint and returns the text of the first completion choice.
    """
    system_turn = {
        'role': 'system',
        'content': 'You are a helpful research assistant for generating well-formatted mindmaps in MarkDown format from scientific research papers.',
    }
    user_turn = {'role': 'user', 'content': prompt}
    response = llm.create_chat_completion(
        messages=[system_turn, user_turn],
        temperature=0.7,
        top_k=200,
        # NOTE(review): top_p > 1.0 effectively disables nucleus
        # sampling in llama-cpp-python — confirm this is intentional.
        top_p=3.0,
    )
    return response['choices'][0]['message']['content']

def generate_markdown(llm, file):
    """Produce mind-map markdown for the PDF at *file* using *llm*.

    Pipeline: extract the paper text, wrap it in the mind-map prompt,
    query the model, then normalise the model's bold markers into
    headings.

    :param llm: loaded ``Llama`` instance (see ``load_llm_model``).
    :param file: PDF path accepted by ``get_text_from_pdf``.
    :return: mind-map markdown string.
    """
    final_text = get_text_from_pdf(file)
    prompt = generate_prompt(final_text)
    mindmap_markdown = generate_mindmap_structure(llm, prompt)
    # Normalise bold bullets into level-3 headings.  Order matters:
    # "- **" must be rewritten before the bare "**" markers are
    # stripped.  str.replace is a no-op when the marker is absent, so
    # the original `if "**" in ...: ... else: pass` guard was redundant
    # and has been removed (output is unchanged).
    mindmap_markdown = mindmap_markdown.replace("- **", "### ")
    mindmap_markdown = mindmap_markdown.replace("**", "")
    return mindmap_markdown

def sanitize_markdown(llm, mindmap_markdown):
    """Run a second model pass to coerce *mindmap_markdown* into the template.

    Wraps the draft markdown in a reformatting prompt (reproduced
    byte-for-byte, including its literal escape sequences) and returns
    the model's cleaned-up output.
    """
    reformat_prompt = f'''

    As an experienced coder and programmer, help me convert the text \\"{mindmap_markdown}\\" into a well-formatted markdown. Your output should only and strictly use the following template:\\n# {{Title}}\\n## {{Subtitle01}}\\n- {{Emoji01}} Bulletpoint01\\n- {{Emoji02}} Bulletpoint02\\n## {{Subtitle02}}\\n- {{Emoji03}} Bulletpoint03\\n- {{Emoji04}} Bulletpoint04\\n\\nDo not include anything in the response, that is not the part of mindmap."

    '''
    return generate_mindmap_structure(llm, reformat_prompt)