raannakasturi committed
Commit
1641226
1 Parent(s): e64a083

Upload 4 files

Files changed (4)
  1. app.py +69 -0
  2. generate_markdown.py +95 -0
  3. generate_mindmap.py +108 -0
  4. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,69 @@
+ import os
+ import sys
+ from generate_markdown import load_llm_model, generate_markdown
+ from generate_mindmap import generate_mindmap_svg
+ import gradio as gr
+ import subprocess
+
+ # NOTE: this runs at import time, before the __main__ block below downloads
+ # the model file and installs llama-cpp-python.
+ llm = load_llm_model()
+
+ def generate(file):
+     summary = "This is a summary of the research paper"
+     mindmap_markdown = generate_markdown(llm, file)
+     mindmap_svg = generate_mindmap_svg(mindmap_markdown)
+     return summary, mindmap_markdown, mindmap_svg
+
+ theme = gr.themes.Soft(
+     primary_hue="purple",
+     secondary_hue="cyan",
+     neutral_hue="slate",
+     font=[gr.themes.GoogleFont('Syne'), gr.themes.GoogleFont('Poppins')],
+ )
+
+ with gr.Blocks(theme=theme, title="Binary Biology") as app:
+     file = gr.File(file_count='single', label='Upload Research Paper PDF file')
+     summary = gr.TextArea(label='Summary', lines=5, interactive=False, show_copy_button=True)
+     markdown_mindmap = gr.Textbox(label='Mindmap', lines=5, interactive=False, show_copy_button=True)
+     graphical_mindmap = gr.Image(label='Graphical Mindmap', interactive=False, show_download_button=True)
+     submit = gr.Button(value='Submit')
+
+     submit.click(generate,
+                  inputs=[file],
+                  outputs=[summary, markdown_mindmap, graphical_mindmap],
+                  scroll_to_output=True,
+                  show_progress=True,
+                  queue=True,
+     )
+
+ if __name__ == "__main__":
+     # Download the model (-O saves the download as model.gguf; lowercase -o would only redirect wget's log there)
+     subprocess.run(['wget', '-O', 'model.gguf', 'https://huggingface.co/featherless-ai-quants/amazon-MegaBeam-Mistral-7B-300k-GGUF/resolve/main/amazon-MegaBeam-Mistral-7B-300k-Q6_K.gguf?download=true'])
+     # Equivalent to running: CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python
+     try:
+         env = os.environ.copy()
+         env["CMAKE_ARGS"] = "-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"
+         cmd = ["pip", "install", "llama-cpp-python"]
+         subprocess.run(cmd, env=env, check=True)
+     except Exception:
+         # Fall back to a plain install if the OpenBLAS build fails
+         cmd = ["pip", "install", "llama-cpp-python"]
+         subprocess.run(cmd)
+     try:
+         try:
+             subprocess.run(['apt', 'install', '-y', 'graphviz'], check=True)
+             print("Graphviz installed successfully")
+         except Exception:
+             subprocess.run(['sudo', 'apt', 'install', '-y', 'graphviz'], check=True)
+             print("Graphviz installed successfully")
+     except Exception:
+         print("Graphviz installation failed")
+         sys.exit(1)
+     llm = load_llm_model()
+     print("Model loaded successfully")
+     app.queue(default_concurrency_limit=5).launch(show_error=True)
+     # summary, markdown_mindmap, graphical_mindmap = generate("cr1c00107.pdf")
+     # print(summary)
+     # print("\n\n")
+     # print(markdown_mindmap)
+     # print("\n\n")
+     # print(graphical_mindmap)
generate_markdown.py ADDED
@@ -0,0 +1,95 @@
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import PyPDFLoader
+ from llama_cpp import Llama
+ import llama_cpp
+
+ def load_llm_model():
+     llm = Llama(
+         # NOTE: app.py downloads its checkpoint as model.gguf; model_path must
+         # point at the GGUF file that is actually on disk.
+         model_path="Llama-3.2-3B-Instruct-Q8_0.gguf",
+         # n_gpu_layers=20,  # Uncomment for GPU
+         n_ctx=200000,
+         n_threads=16,
+         n_batch=512,
+         split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
+         pooling_type=llama_cpp.LLAMA_POOLING_TYPE_RANK,
+         rope_scaling_type=llama_cpp.LLAMA_ROPE_SCALING_TYPE_LINEAR,
+         # main_gpu=0,  # Uncomment for GPU
+     )
+     return llm
+
+ def get_text_from_pdf(file):
+     loader = PyPDFLoader(file)
+     pages = loader.load_and_split()
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
+     texts = text_splitter.split_documents(pages)
+     final_text = ""
+     for text in texts:
+         final_text = final_text + text.page_content
+     print(f"Length of final text: {len(final_text)}")
+     with open("final_text.txt", "w") as f:
+         f.write(final_text)
+     research_paper = ""
+     for line in final_text.split("\n"):
+         if line.startswith("REFERENCES"):
+             break
+         else:
+             research_paper = research_paper + line + " "
+     with open("research_paper.txt", "w") as f:
+         f.write(research_paper)
+     print(f"Length of research paper: {len(research_paper)}")
+     return research_paper
+
+ def generate_prompt(final_text):
+     prompt = f'''
+ You have been provided with a research paper in text format. Your task is to generate a mindmap structure in markdown format that summarizes the research paper.
+ Your output should use the language \"en\" and be about 0.3 times the length of the original research paper. Do not include anything in the response that is not part of the mindmap, and use the following template (no node in the mindmap should exceed 10-12 words; also generate additional headings that aren't present in the document if required for an elaborative explanation):
+ # {{Title}} (should be the title of the research paper)
+ ## {{Subtitle01}} (as required and as many as required in markdown format)
+ - {{Emoji01}} Bulletpoint01 (as required and as many as required in markdown format)
+   - {{Emoji01.1}} Bulletpoint01.1 (as required and as many as sub levels required in markdown format)
+     - {{Emoji01.1.1}} Bulletpoint01.1.1 (as required and as many as sub levels required in markdown format)
+     - {{Emoji01.1.2}} Bulletpoint01.1.2 (as required and as many as sub levels required in markdown format)
+   - {{Emoji01.2}} Bulletpoint01.2 (as required and as many as sub levels required in markdown format)
+ - {{Emoji02}} Bulletpoint02 (as required and as many as required in markdown format)
+   - {{Emoji02.1}} Bulletpoint02.1 (as required and as many as sub levels required in markdown format)
+   - {{Emoji02.2}} Bulletpoint02.2 (as required and as many as sub levels required in markdown format)
+     - {{Emoji02.2.1}} Bulletpoint02.2.1 (as required and as many as sub levels required in markdown format)
+     - {{Emoji02.2.2}} Bulletpoint02.2.2 (as required and as many as sub levels required in markdown format)
+     - {{Emoji02.2.3}} Bulletpoint02.2.3 (as required and as many as sub levels required in markdown format)
+     - {{Emoji02.2.4}} Bulletpoint02.2.4 (as required and as many as sub levels required in markdown format)
+ ## {{Subtitle02}} (as required and as many as required in markdown format)
+ - {{Emoji03}} Bulletpoint03 (as required and as many as required in markdown format)
+   - {{Emoji03.1}} Bulletpoint03.1 (as required and as many as sub levels required in markdown format)
+   - {{Emoji03.2}} Bulletpoint03.2 (as required and as many as sub levels required in markdown format)
+     - {{Emoji03.2.1}} Bulletpoint03.2.1 (as required and as many as sub levels required in markdown format)
+     - {{Emoji03.2.2}} Bulletpoint03.2.2 (as required and as many as sub levels required in markdown format)
+ - {{Emoji04}} Bulletpoint04 (as required and as many as required in markdown format)
+   - {{Emoji04.1}} Bulletpoint04.1 (as required and as many as sub levels required in markdown format)
+     - {{Emoji04.1.1}} Bulletpoint04.1.1 (as required and as many as sub levels required in markdown format)
+     - {{Emoji04.1.2}} Bulletpoint04.1.2 (as required and as many as sub levels required in markdown format)
+   - {{Emoji04.2}} Bulletpoint04.2 (as required and as many as sub levels required in markdown format)
+     - {{Emoji04.2.1}} Bulletpoint04.2.1 (as required and as many as sub levels required in markdown format)
+     - {{Emoji04.2.2}} Bulletpoint04.2.2 (as required and as many as sub levels required in markdown format)
+ Summarize the text \"{final_text}\" into an elaborated hierarchical mindmap structure in markdown (no node in the mindmap should exceed 10-12 words; also generate additional headings that aren't present in the document if required for an elaborative explanation), using the \"en\" language and about 0.3 times the length of the original research paper. Do not include anything in the response that is not part of the mindmap.
+     '''
+     return prompt
+
+ def generate_mindmap_structure(llm, prompt):
+     response = llm.create_chat_completion(
+         messages=[
+             {'role': 'user',
+              'content': prompt}
+         ],
+         temperature=0.7,
+         top_k=200,
+         top_p=3.0,  # note: values >= 1.0 effectively disable nucleus sampling
+     )
+     mindmap_data = response['choices'][0]['message']['content']
+     print(mindmap_data)
+     return mindmap_data
+
+ def generate_markdown(llm, file):
+     final_text = get_text_from_pdf(file)
+     prompt = generate_prompt(final_text)
+     mindmap_markdown = generate_mindmap_structure(llm, prompt)
+     return mindmap_markdown
generate_mindmap.py ADDED
@@ -0,0 +1,108 @@
+ from graphviz import Digraph
+ import re
+ import random
+
+ def parse_markdown_to_dict(md_text):
+     lines = md_text.strip().splitlines()
+     mindmap = {}
+     stack = []
+     for line in lines:
+         heading_match = re.match(r'^(#{1,6})\s+(.*)', line)
+         bullet_match = re.match(r'^\s*-\s+(.*)', line)
+         if heading_match:
+             level = len(heading_match.group(1))
+             title = heading_match.group(2).strip()
+             node = {'title': title, 'children': []}
+             while len(stack) >= level:
+                 stack.pop()
+             if stack:
+                 stack[-1]['children'].append(node)
+             else:
+                 mindmap = node
+             stack.append(node)
+         elif bullet_match and stack:
+             stack[-1]['children'].append({'title': bullet_match.group(1), 'children': []})
+     return mindmap
+
+ generated_colors = set()
+
+ def generate_random_color():
+     """Generate a random color that hasn't been generated before."""
+     while True:
+         # Generate a random color in hex format
+         color = "#{:02x}{:02x}{:02x}".format(random.randint(128, 255), random.randint(128, 255), random.randint(128, 255))
+         # If the color is not in the set, it's unique
+         if color not in generated_colors:
+             generated_colors.add(color)  # Add the color to the set of generated colors
+             return color  # Return the unique color
+         else:
+             continue  # Try again
+
+ def brighten_color(color, factor=0.15):
+     """Brighten the color by the given factor (default 15%)."""
+     # Remove the '#' symbol
+     color = color.lstrip('#')
+
+     # Convert hex to RGB
+     r, g, b = [int(color[i:i+2], 16) for i in (0, 2, 4)]
+
+     # Increase each component by the factor, but clamp to 255
+     r = min(255, int(r * (1 + factor)))
+     g = min(255, int(g * (1 + factor)))
+     b = min(255, int(b * (1 + factor)))
+
+     # Convert back to hex
+     return "#{:02x}{:02x}{:02x}".format(r, g, b)
+
+ def add_nodes_to_graph(graph, node, parent_id=None, font_size=9, parent_color=None):
+     node_id = str(id(node))
+     title = node['title']
+     if parent_color is None:
+         node_color = "#ADD8E6"  # Light blue fill for the root (main heading)
+         border_color = "#000000"  # Black border for the root
+         parent_color = "#ADD8E6"
+     elif parent_color == "#ADD8E6":
+         node_color = generate_random_color()
+         border_color = "#808080"
+         parent_color = node_color
+     else:
+         # Descendants reuse the parent's color, slightly brightened at each level
+         node_color = brighten_color(parent_color, factor=0.15)
+         border_color = "#808080"
+     # Check for markdown links
+     url_match = re.search(r'\[(.*?)\]\((.*?)\)', title)
+     if url_match:
+         prefix_text = title[:url_match.start()].strip()
+         display_text = url_match.group(1)
+         url = url_match.group(2)
+
+         label = f'{prefix_text} {display_text}'
+         graph.node(node_id, label=label, shape="box", style="rounded,filled", color=border_color, fontcolor="black", fillcolor=node_color, href=url, tooltip=title, fontsize=str(font_size))
+     else:
+         graph.node(node_id, title, shape="box", style="rounded,filled", color=border_color, fontcolor="black", fillcolor=node_color, tooltip=title, fontsize=str(font_size))
+
+     if parent_id:
+         graph.edge(parent_id, node_id)
+
+     # Recurse into children: children of the root pick fresh random colors,
+     # deeper descendants inherit and brighten their parent's color
+     for child in node.get('children', []):
+         add_nodes_to_graph(graph, child, node_id, font_size=max(8, font_size - 1), parent_color=parent_color)
+
+ def generate_mindmap_svg(md_text, output_filename=None):
+     mindmap_dict = parse_markdown_to_dict(md_text)
+     root_title = mindmap_dict.get('title', 'Mindmap')
+     sanitized_title = re.sub(r'[^a-zA-Z0-9_\-]', '', root_title.replace(" ", ""))
+     if output_filename is None:
+         output_filename = sanitized_title
+     graph = Digraph(format='svg')
+     graph.attr(rankdir='LR', size='10,10!', pad="0.5", margin="0.2", ratio="auto")
+     graph.attr('node', fontname="Arial", fontsize="9")
+     add_nodes_to_graph(graph, mindmap_dict)
+     svg_content = graph.pipe(format='svg').decode('utf-8')
+     # Graphviz titles an unnamed graph "%3" in the SVG; replace it with the root title
+     svg_content = svg_content.replace("%3", root_title)
+     # Save the modified SVG content to a file
+     with open(f'{output_filename}.svg', 'w') as f:
+         f.write(svg_content)
+     return f"{output_filename}.svg"
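For reference, a minimal sketch (using a hypothetical two-heading input that is not part of this commit) of the nested dict that parse_markdown_to_dict builds and add_nodes_to_graph then walks:

from generate_mindmap import parse_markdown_to_dict

sample_md = """# Paper Title
## Methods
- 🧪 Experiment setup
"""
# Headings nest by level; bullets become leaf children of the nearest heading:
# {'title': 'Paper Title', 'children': [
#     {'title': 'Methods', 'children': [
#         {'title': '🧪 Experiment setup', 'children': []}]}]}
print(parse_markdown_to_dict(sample_md))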
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio==5.5.0
+ langchain==0.3.7
+ langchain-community==0.3.7
+ graphviz==0.20.3
+ llama-cpp-python==0.3.1
+ pypdf  # required by langchain_community's PyPDFLoader