Spaces:
Runtime error
Runtime error
raannakasturi
committed on
Commit
•
a1e6c4a
1
Parent(s):
e785663
Upload 3 files
Browse files- app.py +90 -60
- generate_markdown.py +16 -5
- generate_mindmap.py +48 -43
app.py
CHANGED
@@ -1,60 +1,90 @@
|
|
1 |
-
import os
|
2 |
-
import sys
|
3 |
-
from generate_markdown import load_llm_model, generate_markdown
|
4 |
-
from generate_mindmap import
|
5 |
-
import gradio as gr
|
6 |
-
import subprocess
|
7 |
-
|
8 |
-
def generate(file):
|
9 |
-
print(f"Generating mindmap for {file}")
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
print(
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
)
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
from generate_markdown import load_llm_model, generate_markdown, sanitize_markdown
|
4 |
+
from generate_mindmap import generate_mindmap
|
5 |
+
import gradio as gr
|
6 |
+
import subprocess
|
7 |
+
|
8 |
+
# Upper bound on LLM clean-up passes before a request is abandoned.
_MAX_SANITIZE_ATTEMPTS = 5


def generate(file):
    """Build a mindmap from an uploaded research-paper PDF.

    The markdown returned by ``generate_markdown`` is accepted once it starts
    with ``#`` and contains at least one ``-``. Otherwise it is sent through
    ``sanitize_markdown`` and checked again, at most
    ``_MAX_SANITIZE_ATTEMPTS`` times, so a model that never produces the
    expected shape cannot hang the request forever.

    Args:
        file: The uploaded file object supplied by the ``gr.File`` input; its
            ``name`` attribute is logged.

    Returns:
        A 6-tuple ``(summary, mindmap_markdown, svg, svg, pdf, pdf)`` matching
        the six output components wired to the submit button.

    Raises:
        gr.Error: If the markdown is still malformed after the last attempt.
    """
    print(f"Generating mindmap for {file.name}")
    summary = "This is a summary of the research paper"
    # `llm` is the module-level model handle assigned in the __main__ block.
    mindmap_markdown = generate_markdown(llm, file)
    print('mindmap_markdown:', mindmap_markdown)

    attempts = 0
    while not (mindmap_markdown.startswith("#") and '-' in mindmap_markdown):
        if attempts >= _MAX_SANITIZE_ATTEMPTS:
            raise gr.Error(
                "The model did not return well-formed mindmap markdown; "
                "please try again."
            )
        mindmap_markdown = sanitize_markdown(llm, mindmap_markdown)
        attempts += 1

    mindmap_svg, mindmap_pdf = generate_mindmap(mindmap_markdown)
    print("Mindmap generated successfully")
    # Each artefact is returned twice: once for the preview component and once
    # for the matching download component.
    return summary, mindmap_markdown, mindmap_svg, mindmap_svg, mindmap_pdf, mindmap_pdf
|
25 |
+
|
26 |
+
# Visual theme shared by the whole interface.
theme = gr.themes.Soft(
    primary_hue="purple",
    secondary_hue="cyan",
    neutral_hue="slate",
    font=[gr.themes.GoogleFont('Syne'), gr.themes.GoogleFont('Poppins')],
)

# Page layout: upload row, text results row, then SVG/PDF result columns.
# NOTE(review): the nesting of the layout containers was reconstructed from
# statement order - confirm against the intended layout.
with gr.Blocks(theme=theme, title="Binary Biology") as app:
    with gr.Row():
        file = gr.File(file_count='single', label='Upload Research Paper PDF file', file_types=['.pdf'])
        with gr.Column():
            submit = gr.Button(value='Submit')
            clear = gr.ClearButton(value='Clear')

    with gr.Row():
        summary = gr.TextArea(label='Summary', lines=5, interactive=False, show_copy_button=True)
        markdown_mindmap = gr.Textbox(label='Mindmap', lines=5, interactive=False, show_copy_button=True)

    with gr.Row():
        with gr.Column():
            svg_mindmap = gr.File(label='Graphical SVG Mindmap', interactive=False)
            download_svg_mindmap = gr.File(label='Download SVG Mindmap', interactive=False)

        with gr.Column():
            pdf_mindmap = gr.File(label='Graphical PDF Mindmap', interactive=False)
            download_pdf_mindmap = gr.File(label='Download PDF Mindmap', interactive=False)

    # Run the pipeline on the uploaded PDF and fan the results out to the six
    # output components, in the order `generate` returns them.
    submit.click(
        generate,
        inputs=[file],
        outputs=[summary, markdown_mindmap, svg_mindmap, download_svg_mindmap, pdf_mindmap, download_pdf_mindmap],
        scroll_to_output=True,
        show_progress='minimal',
        queue=True,
    )

    # Reset every output component. The handler takes no arguments, so no
    # inputs are wired: passing `file` here would make Gradio call the
    # zero-argument lambda with one positional argument and fail.
    clear.click(
        lambda: (None, None, None, None, None, None),
        inputs=None,
        outputs=[summary, markdown_mindmap, svg_mindmap, download_svg_mindmap, pdf_mindmap, download_pdf_mindmap],
    )
|
67 |
+
|
68 |
+
if __name__ == "__main__":
    # Local import: only this start-up path needs it.
    import shutil

    # Best-effort build of llama-cpp-python against OpenBLAS. `check=True`
    # makes a non-zero pip exit raise, so the failure message is actually
    # printed; start-up continues either way.
    try:
        env = os.environ.copy()
        env["CMAKE_ARGS"] = "-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "llama-cpp-python"],
            env=env,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"Failed to install llama-cpp-python: {e}")

    # Install Graphviz only when its `dot` executable is not already on PATH.
    # Try plain apt first, then sudo apt; abort only if both attempts fail.
    if shutil.which("dot") is None:
        install_attempts = (
            (['apt', 'install', '-y', 'graphviz'],
             "Graphviz installed successfully"),
            (['sudo', 'apt', 'install', '-y', 'graphviz'],
             "Graphviz installed successfully using sudo"),
        )
        for command, success_message in install_attempts:
            try:
                # Without check=True a failed install would be reported as a
                # success and the fallback would never run.
                subprocess.run(command, check=True)
            except (subprocess.CalledProcessError, OSError):
                continue
            print(success_message)
            break
        else:
            print("Graphviz installation failed")
            sys.exit(1)

    print("Graphviz loaded successfully")
    # Module-level handle read by `generate` when a request comes in.
    llm = load_llm_model()
    print("Model loaded successfully")
    app.queue(default_concurrency_limit=1).launch(show_error=True)
|
generate_markdown.py
CHANGED
@@ -6,10 +6,9 @@ def load_llm_model():
|
|
6 |
try:
|
7 |
llm = Llama(
|
8 |
model_path="Llama-3.2-1B-Instruct-Q8_0.gguf",
|
9 |
-
|
10 |
-
n_ctx=
|
11 |
-
n_batch=
|
12 |
-
# main_gpu=0
|
13 |
)
|
14 |
print("LLM model loaded successfully")
|
15 |
return llm
|
@@ -34,7 +33,7 @@ def get_text_from_pdf(file):
|
|
34 |
break
|
35 |
else:
|
36 |
research_paper = research_paper + text
|
37 |
-
return research_paper[:
|
38 |
|
39 |
def generate_prompt(research_paper):
|
40 |
prompt = f'''
|
@@ -61,4 +60,16 @@ def generate_markdown(llm, file):
|
|
61 |
final_text = get_text_from_pdf(file)
|
62 |
prompt = generate_prompt(final_text)
|
63 |
mindmap_markdown = generate_mindmap_structure(llm, prompt)
|
|
|
|
|
|
|
|
|
|
|
64 |
return mindmap_markdown
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
try:
|
7 |
llm = Llama(
|
8 |
model_path="Llama-3.2-1B-Instruct-Q8_0.gguf",
|
9 |
+
n_gpu_layers = -1,
|
10 |
+
n_ctx=100000,
|
11 |
+
n_batch=4096,
|
|
|
12 |
)
|
13 |
print("LLM model loaded successfully")
|
14 |
return llm
|
|
|
33 |
break
|
34 |
else:
|
35 |
research_paper = research_paper + text
|
36 |
+
return research_paper[:10000]
|
37 |
|
38 |
def generate_prompt(research_paper):
|
39 |
prompt = f'''
|
|
|
60 |
final_text = get_text_from_pdf(file)
|
61 |
prompt = generate_prompt(final_text)
|
62 |
mindmap_markdown = generate_mindmap_structure(llm, prompt)
|
63 |
+
if "**" in mindmap_markdown:
|
64 |
+
mindmap_markdown = mindmap_markdown.replace("- **", "### ")
|
65 |
+
mindmap_markdown = mindmap_markdown.replace("**", "")
|
66 |
+
else:
|
67 |
+
pass
|
68 |
return mindmap_markdown
|
69 |
+
|
70 |
+
def sanitize_markdown(llm, mindmap_markdown):
    """Ask the model to rewrite malformed output into the mindmap template.

    The prompt is assembled with real quotes and line breaks. The previous
    double-escaped text sent literal backslash sequences and a stray closing
    quote to the model.

    Args:
        llm: Model handle forwarded to ``generate_mindmap_structure``.
        mindmap_markdown: The text to be reformatted.

    Returns:
        Whatever ``generate_mindmap_structure`` returns for the prompt
        (presumably the reformatted markdown string - verify in that helper).
    """
    template = (
        "# {Title}\n"
        "## {Subtitle01}\n"
        "- {Emoji01} Bulletpoint01\n"
        "- {Emoji02} Bulletpoint02\n"
        "## {Subtitle02}\n"
        "- {Emoji03} Bulletpoint03\n"
        "- {Emoji04} Bulletpoint04\n"
    )
    prompt = (
        "\nAs an experienced coder and programmer, help me convert the text "
        f'"{mindmap_markdown}" into a well-formatted markdown. '
        "Your output should only and strictly use the following template:\n"
        f"{template}\n"
        "Do not include anything in the response, that is not the part of mindmap.\n"
    )
    sanitized_markdown = generate_mindmap_structure(llm, prompt)
    return sanitized_markdown
|
generate_mindmap.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
from graphviz import Digraph
|
|
|
2 |
import re
|
3 |
import random
|
4 |
|
@@ -89,6 +90,11 @@ def add_nodes_to_graph(graph, node, parent_id=None, font_size=9, parent_color=No
|
|
89 |
# Assign a random color to each child node (no inheritance from parent)
|
90 |
add_nodes_to_graph(graph, child, node_id, font_size=max(8, font_size - 1), parent_color=parent_color)
|
91 |
|
|
|
|
|
|
|
|
|
|
|
92 |
def generate_mindmap_svg(md_text):
|
93 |
mindmap_dict = parse_markdown_to_dict(md_text)
|
94 |
root_title = mindmap_dict.get('title', 'Mindmap')
|
@@ -102,63 +108,62 @@ def generate_mindmap_svg(md_text):
|
|
102 |
# Replace %3 with the sanitized filename in the SVG content
|
103 |
svg_content = svg_content.replace("%3", root_title)
|
104 |
# Save the modified SVG content to a file
|
105 |
-
with open(
|
106 |
f.write(svg_content)
|
107 |
-
return
|
108 |
|
|
|
|
|
|
|
|
|
109 |
|
110 |
# md = '''
|
111 |
-
#
|
112 |
-
|
113 |
-
# **I. Introduction**
|
114 |
-
|
115 |
-
# * Machine learning (ML) poised to transform chemical sciences
|
116 |
-
# * Combining ML and CompChem for predictive insights
|
117 |
-
|
118 |
-
# **II. Computational Chemistry (CompChem)**
|
119 |
-
|
120 |
-
# * Computational quantum chemistry (CQChem)
|
121 |
-
# * Methods for generating data sets (e.g., wavefunction theory, correlated wavefunction methods, density functional theory)
|
122 |
-
# * Representations of systems (e.g., simple, complex, ambiguous)
|
123 |
|
124 |
-
#
|
125 |
|
126 |
-
#
|
127 |
-
#
|
128 |
-
#
|
129 |
-
#
|
130 |
|
131 |
-
#
|
132 |
|
133 |
-
#
|
134 |
-
#
|
135 |
-
#
|
136 |
-
#
|
|
|
|
|
137 |
|
138 |
-
#
|
139 |
|
140 |
-
#
|
141 |
-
#
|
142 |
-
#
|
143 |
-
#
|
|
|
|
|
144 |
|
145 |
-
#
|
146 |
|
147 |
-
#
|
148 |
-
#
|
149 |
-
#
|
|
|
|
|
|
|
150 |
|
151 |
-
#
|
152 |
|
153 |
-
#
|
154 |
-
#
|
155 |
-
#
|
156 |
-
# * Predicting drug design and development
|
157 |
|
158 |
-
#
|
159 |
|
160 |
-
#
|
161 |
-
#
|
162 |
-
#
|
163 |
-
# '''
|
164 |
# generate_mindmap_svg(md)
|
|
|
1 |
from graphviz import Digraph
|
2 |
+
from cairosvg import svg2pdf
|
3 |
import re
|
4 |
import random
|
5 |
|
|
|
90 |
# Assign a random color to each child node (no inheritance from parent)
|
91 |
add_nodes_to_graph(graph, child, node_id, font_size=max(8, font_size - 1), parent_color=parent_color)
|
92 |
|
93 |
+
def generate_mindmap_pdf(svg_file):
    """Convert an SVG file to a PDF written next to it.

    Args:
        svg_file: Path of the SVG file to convert.

    Returns:
        Path of the PDF that was written: ``svg_file`` with a trailing
        ``.svg`` swapped for ``.pdf`` (or ``.pdf`` appended when the path has
        no ``.svg`` suffix, so the source file is never overwritten).
    """
    suffix = ".svg"
    # Only the trailing extension is swapped; `str.replace` would also rewrite
    # any ".svg" appearing earlier in the path.
    stem = svg_file[:-len(suffix)] if svg_file.endswith(suffix) else svg_file
    pdf_file = stem + ".pdf"
    # The context manager closes the SVG handle once conversion is done.
    with open(svg_file, "rb") as svg_handle:
        svg2pdf(file_obj=svg_handle, write_to=pdf_file)
    return pdf_file
|
97 |
+
|
98 |
def generate_mindmap_svg(md_text):
|
99 |
mindmap_dict = parse_markdown_to_dict(md_text)
|
100 |
root_title = mindmap_dict.get('title', 'Mindmap')
|
|
|
108 |
# Replace %3 with the sanitized filename in the SVG content
|
109 |
svg_content = svg_content.replace("%3", root_title)
|
110 |
# Save the modified SVG content to a file
|
111 |
+
with open(output_filename, 'w') as f:
|
112 |
f.write(svg_content)
|
113 |
+
return output_filename
|
114 |
|
115 |
+
def generate_mindmap(md_text):
    """Render markdown to an SVG mindmap and a PDF made from that SVG.

    Args:
        md_text: Mindmap markdown passed to ``generate_mindmap_svg``.

    Returns:
        A ``(svg, pdf)`` pair: the value returned by ``generate_mindmap_svg``
        and the value ``generate_mindmap_pdf`` returns for it.
    """
    svg_path = generate_mindmap_svg(md_text)
    return svg_path, generate_mindmap_pdf(svg_path)
|
119 |
|
120 |
# md = '''
|
121 |
+
# # Nucleic Acids Research: Updates to SAbDab and SAbDab-nano
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
|
123 |
+
# ## Introduction
|
124 |
|
125 |
+
# - **Antibodies in the Age of Biotherapeutics**
|
126 |
+
# - Antibodies are fundamental components of the immune system
|
127 |
+
# - Represent the largest class of biotherapeutics
|
128 |
+
# - Ability to bind to antigen targets with high affinity and specificity makes them promising candidates for development of therapeutic antibodies against various targets, including cancer, virus, and other diseases.
|
129 |
|
130 |
+
# ## Updates to Data Annotation
|
131 |
|
132 |
+
# - **Search Interface**
|
133 |
+
# - SAbDab can now be searched via a Flask app served by a fast SQL backend
|
134 |
+
# - Structures can be searched based on experimental method, resolution, species, type of antigen, presence of affinity values, and presence of amino acid residues at specific sequence positions defined using the Chothia numbering scheme.
|
135 |
+
# - **Auxiliary Databases**
|
136 |
+
# - Thera-SAbDab and CoV-AbDab databases contain antibody sequence information linking to relevant entries in SAbDab
|
137 |
+
# - These databases contain antibody sequence information, linking to relevant entries in SAbDab where structures of the antibody in question (CoV-AbDab) or structures with at least 95% sequence identity (Thera-SAbDab) exist.
|
138 |
|
139 |
+
# ## Updates to Data Access
|
140 |
|
141 |
+
# - **Search Features**
|
142 |
+
# - New search features were added to improve the ability to create task-specific nanobody and antibody datasets
|
143 |
+
# - Free-text keyword query can be performed over certain annotation fields (antigen, species, publication, and structure title)
|
144 |
+
# - **Download Options**
|
145 |
+
# - Structures matching search queries can be downloaded in bulk as a zipped archive and a summary .csv file containing annotation data
|
146 |
+
# - Individual structures can be accessed via the structure viewer interface
|
147 |
|
148 |
+
# ## Updates to SAbDab-nano
|
149 |
|
150 |
+
# - **Development of SAbDab-nano**
|
151 |
+
# - SAbDab-nano is a subset of SAbDab containing nanobody structures
|
152 |
+
# - Entries for which at least one antibody has a heavy chain variable domain, but no light chain variable domain are added to SAbDab-nano
|
153 |
+
# - **Composition and Growth**
|
154 |
+
# - SAbDab-nano contains 823 structures (492 nanobodies) with non-redundant CDR sequences
|
155 |
+
# - Average 3.8 structures per week are added to SAbDab-nano over the first 36 weeks of 2021
|
156 |
|
157 |
+
# ## Comparison to Other Nanobody Resources
|
158 |
|
159 |
+
# - **Other Databases**
|
160 |
+
# - There are other databases compiling nanobody data, but no other resource provides nanobody structures in a continuously updated and comprehensively annotated format
|
161 |
+
# - Several databases compiling nanobody sequences (but not structures) from a variety of data sources
|
|
|
162 |
|
163 |
+
# ## Conclusion
|
164 |
|
165 |
+
# - **SAbDab and SAbDab-nano**
|
166 |
+
# - SAbDab continues to be updated weekly and represents the most thoroughly annotated antibody structure database from which researchers can quickly create custom datasets
|
167 |
+
# - Searching SAbDab is now more powerful and faster, with new connections to auxiliary databases that catalogue therapeutic and antigen-specific antibodies
|
168 |
+
#     - SAbDab and SAbDab-nano can be accessed freely online under a CC-BY 4.0 license at opig.stats.ox.ac.uk/webapps/newsabdab/ or at opig.stats.ox.ac.uk/webapps/newsabdab/nano respectively'''
|
169 |
# generate_mindmap_svg(md)
|