raannakasturi committed on
Commit
a1e6c4a
1 Parent(s): e785663

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +90 -60
  2. generate_markdown.py +16 -5
  3. generate_mindmap.py +48 -43
app.py CHANGED
@@ -1,60 +1,90 @@
1
- import os
2
- import sys
3
- from generate_markdown import load_llm_model, generate_markdown
4
- from generate_mindmap import generate_mindmap_svg
5
- import gradio as gr
6
- import subprocess
7
-
8
- def generate(file):
9
- print(f"Generating mindmap for {file}")
10
- summary = "This is a summary of the research paper"
11
- mindmap_markdown = generate_markdown(llm, file)
12
- mindmap_svg = generate_mindmap_svg(mindmap_markdown)
13
- print("Mindmap generated successfully")
14
- return summary, mindmap_markdown, mindmap_svg
15
-
16
- theme = gr.themes.Soft(
17
- primary_hue="purple",
18
- secondary_hue="cyan",
19
- neutral_hue="slate",
20
- font=[gr.themes.GoogleFont('Syne'), gr.themes.GoogleFont('poppins'), gr.themes.GoogleFont('poppins'), gr.themes.GoogleFont('poppins')],
21
- )
22
-
23
- with gr.Blocks(theme=theme, title="Binary Biology") as app:
24
- file = gr.File(file_count='single', label='Upload Research Paper PDF file', file_types=['.pdf'])
25
- summary = gr.TextArea(label='Summary', lines=5, interactive=False, show_copy_button=True)
26
- markdown_mindmap = gr.Textbox(label='Mindmap', lines=5, interactive=False, show_copy_button=True)
27
- graphical_mindmap = gr.Image(label='Graphical Mindmap', interactive=False, show_download_button=True, format='svg')
28
- submit = gr.Button(value='Submit')
29
-
30
- submit.click(generate,
31
- inputs=[file],
32
- outputs=[summary, markdown_mindmap, graphical_mindmap],
33
- scroll_to_output=True,
34
- show_progress=True,
35
- queue=True,
36
- )
37
-
38
- if __name__ == "__main__":
39
- try:
40
- env = os.environ.copy()
41
- env["CMAKE_ARGS"] = "-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"
42
- cmd = ["pip", "install", "llama-cpp-python"]
43
- subprocess.run(cmd, env=env)
44
- except:
45
- cmd = ["pip", "install", "llama-cpp-python"]
46
- subprocess.run(cmd)
47
- try:
48
- try:
49
- subprocess.run(['apt', 'install', '-y', 'graphviz'])
50
- print("Graphviz installed successfully")
51
- except:
52
- subprocess.run(['sudo', 'apt', 'install', '-y', 'graphviz'])
53
- print("Graphviz installed successfully using sudo")
54
- except:
55
- print("Graphviz installation failed")
56
- sys.exit(1)
57
- print("Graphviz loaded successfully")
58
- llm = load_llm_model()
59
- print("Model loaded successfully")
60
- app.queue(default_concurrency_limit=10).launch(show_error=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from generate_markdown import load_llm_model, generate_markdown, sanitize_markdown
4
+ from generate_mindmap import generate_mindmap
5
+ import gradio as gr
6
+ import subprocess
7
+
8
def generate(file):
    """Build a mindmap (markdown, SVG and PDF) from an uploaded research paper.

    The markdown returned by the LLM is accepted only when it starts with
    ``#`` and contains at least one ``-``. Otherwise it is sent back through
    ``sanitize_markdown`` and re-checked, a bounded number of times.

    Args:
        file: The uploaded file object handed over by the Gradio ``File``
            input; its ``name`` attribute is used for logging and the object
            itself is forwarded to ``generate_markdown``.

    Returns:
        A 6-tuple ``(summary, mindmap_markdown, svg, svg, pdf, pdf)`` matching
        the six output components wired to the Submit button.

    Raises:
        gr.Error: If no file was uploaded, or if the markdown is still
            malformed after the allowed number of sanitising passes.
    """
    # Submit can be clicked before anything is uploaded; fail with a readable
    # message instead of an AttributeError on ``file.name``.
    if file is None:
        raise gr.Error("Please upload a research paper PDF before submitting.")

    print(f"Generating mindmap for {file.name}")
    summary = "This is a summary of the research paper"
    mindmap_markdown = generate_markdown(llm, file)
    print('mindmap_markdown:', mindmap_markdown)

    # Cap the sanitising passes: an unbounded retry loop would block the
    # request forever if the model never produces the expected layout.
    max_sanitize_attempts = 5
    sanitize_attempts = 0
    while not (mindmap_markdown.startswith("#") and '-' in mindmap_markdown):
        if sanitize_attempts >= max_sanitize_attempts:
            raise gr.Error(
                "Could not produce a well-formed mindmap from this paper. "
                "Please try again."
            )
        mindmap_markdown = sanitize_markdown(llm, mindmap_markdown)
        sanitize_attempts += 1

    mindmap_svg, mindmap_pdf = generate_mindmap(mindmap_markdown)

    print("Mindmap generated successfully")
    # Each rendered file feeds two components (viewer + download), hence the
    # duplicated entries.
    return summary, mindmap_markdown, mindmap_svg, mindmap_svg, mindmap_pdf, mindmap_pdf
25
+
26
# Visual theme shared by the whole interface.
theme = gr.themes.Soft(
    primary_hue="purple",
    secondary_hue="cyan",
    neutral_hue="slate",
    font=[gr.themes.GoogleFont('Syne'), gr.themes.GoogleFont('Poppins')],
)

# NOTE(review): the layout nesting below was reconstructed from a flattened
# diff view (indentation was lost) -- confirm it against the real app.py.
with gr.Blocks(theme=theme, title="Binary Biology") as app:
    # Upload area with the action buttons next to it.
    with gr.Row():
        file = gr.File(file_count='single', label='Upload Research Paper PDF file', file_types=['.pdf'])
        with gr.Column():
            submit = gr.Button(value='Submit')
            clear = gr.ClearButton(value='Clear')

    # Textual results.
    with gr.Row():
        summary = gr.TextArea(label='Summary', lines=5, interactive=False, show_copy_button=True)
        markdown_mindmap = gr.Textbox(label='Mindmap', lines=5, interactive=False, show_copy_button=True)

    # Rendered mindmap files: one column for SVG, one for PDF.
    with gr.Row():
        with gr.Column():
            svg_mindmap = gr.File(label='Graphical SVG Mindmap', interactive=False)
            download_svg_mindmap = gr.File(label='Download SVG Mindmap', interactive=False)

        with gr.Column():
            pdf_mindmap = gr.File(label='Graphical PDF Mindmap', interactive=False)
            download_pdf_mindmap = gr.File(label='Download PDF Mindmap', interactive=False)

    submit.click(
        generate,
        inputs=[file],
        outputs=[summary, markdown_mindmap, svg_mindmap, download_svg_mindmap, pdf_mindmap, download_pdf_mindmap],
        scroll_to_output=True,
        show_progress='minimal',
        queue=True,
    )

    # The reset callback takes no arguments, so it must not declare any
    # inputs: Gradio passes one positional argument per input component and
    # a zero-argument lambda would raise TypeError when called with one.
    clear.click(
        lambda: (None, None, None, None, None, None),
        inputs=None,
        outputs=[summary, markdown_mindmap, svg_mindmap, download_svg_mindmap, pdf_mindmap, download_pdf_mindmap]
    )
67
+
68
if __name__ == "__main__":
    import shutil

    # Build llama-cpp-python against OpenBLAS. Installation stays best-effort:
    # a failure is reported but does not abort start-up.
    env = os.environ.copy()
    env["CMAKE_ARGS"] = "-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"
    try:
        # ``sys.executable -m pip`` targets the interpreter running this
        # script; ``check=True`` makes a non-zero exit status raise, which a
        # plain ``subprocess.run`` does not do.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "llama-cpp-python"],
            env=env,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"Failed to install llama-cpp-python: {e}")

    # Graphviz's ``dot`` executable is required for rendering. Skip the
    # install when it is already on PATH so that a host without apt or root
    # access still starts.
    if shutil.which("dot") is None:
        try:
            subprocess.run(['apt', 'install', '-y', 'graphviz'], check=True)
            print("Graphviz installed successfully")
        except (subprocess.CalledProcessError, OSError):
            # Unprivileged install failed (or apt is missing); retry via sudo.
            try:
                subprocess.run(['sudo', 'apt', 'install', '-y', 'graphviz'], check=True)
                print("Graphviz installed successfully using sudo")
            except (subprocess.CalledProcessError, OSError):
                print("Graphviz installation failed")
                sys.exit(1)

    print("Graphviz loaded successfully")
    # ``llm`` is deliberately a module-level name: ``generate`` reads it as a
    # global when handling requests.
    llm = load_llm_model()
    print("Model loaded successfully")
    app.queue(default_concurrency_limit=1).launch(show_error=True)
generate_markdown.py CHANGED
@@ -6,10 +6,9 @@ def load_llm_model():
6
  try:
7
  llm = Llama(
8
  model_path="Llama-3.2-1B-Instruct-Q8_0.gguf",
9
- # n_gpu_layers = 40,
10
- n_ctx=130000,
11
- n_batch=1024,
12
- # main_gpu=0
13
  )
14
  print("LLM model loaded successfully")
15
  return llm
@@ -34,7 +33,7 @@ def get_text_from_pdf(file):
34
  break
35
  else:
36
  research_paper = research_paper + text
37
- return research_paper[:100000]
38
 
39
  def generate_prompt(research_paper):
40
  prompt = f'''
@@ -61,4 +60,16 @@ def generate_markdown(llm, file):
61
  final_text = get_text_from_pdf(file)
62
  prompt = generate_prompt(final_text)
63
  mindmap_markdown = generate_mindmap_structure(llm, prompt)
 
 
 
 
 
64
  return mindmap_markdown
 
 
 
 
 
 
 
 
6
  try:
7
  llm = Llama(
8
  model_path="Llama-3.2-1B-Instruct-Q8_0.gguf",
9
+ n_gpu_layers = -1,
10
+ n_ctx=100000,
11
+ n_batch=4096,
 
12
  )
13
  print("LLM model loaded successfully")
14
  return llm
 
33
  break
34
  else:
35
  research_paper = research_paper + text
36
+ return research_paper[:10000]
37
 
38
  def generate_prompt(research_paper):
39
  prompt = f'''
 
60
  final_text = get_text_from_pdf(file)
61
  prompt = generate_prompt(final_text)
62
  mindmap_markdown = generate_mindmap_structure(llm, prompt)
63
+ if "**" in mindmap_markdown:
64
+ mindmap_markdown = mindmap_markdown.replace("- **", "### ")
65
+ mindmap_markdown = mindmap_markdown.replace("**", "")
66
+ else:
67
+ pass
68
  return mindmap_markdown
69
+
70
+ def sanitize_markdown(llm, mindmap_markdown):
71
+ prompt = f'''
72
+ As an experienced coder and programmer, help me convert the text \\"{mindmap_markdown}\\" into a well-formatted markdown. Your output should only and strictly use the following template:\\n# {{Title}}\\n## {{Subtitle01}}\\n- {{Emoji01}} Bulletpoint01\\n- {{Emoji02}} Bulletpoint02\\n## {{Subtitle02}}\\n- {{Emoji03}} Bulletpoint03\\n- {{Emoji04}} Bulletpoint04\\n\\nDo not include anything in the response, that is not the part of mindmap."
73
+ '''
74
+ sanitized_markdown = generate_mindmap_structure(llm, prompt)
75
+ return sanitized_markdown
generate_mindmap.py CHANGED
@@ -1,4 +1,5 @@
1
  from graphviz import Digraph
 
2
  import re
3
  import random
4
 
@@ -89,6 +90,11 @@ def add_nodes_to_graph(graph, node, parent_id=None, font_size=9, parent_color=No
89
  # Assign a random color to each child node (no inheritance from parent)
90
  add_nodes_to_graph(graph, child, node_id, font_size=max(8, font_size - 1), parent_color=parent_color)
91
 
 
 
 
 
 
92
  def generate_mindmap_svg(md_text):
93
  mindmap_dict = parse_markdown_to_dict(md_text)
94
  root_title = mindmap_dict.get('title', 'Mindmap')
@@ -102,63 +108,62 @@ def generate_mindmap_svg(md_text):
102
  # Replace %3 with the sanitized filename in the SVG content
103
  svg_content = svg_content.replace("%3", root_title)
104
  # Save the modified SVG content to a file
105
- with open(f'{output_filename}.svg', 'w') as f:
106
  f.write(svg_content)
107
- return f"{output_filename}"
108
 
 
 
 
 
109
 
110
  # md = '''
111
- # Here is a mind map summarizing the topic of combining machine learning (ML) and computational chemistry (CompChem) for predictive insights into chemical systems:
112
-
113
- # **I. Introduction**
114
-
115
- # * Machine learning (ML) poised to transform chemical sciences
116
- # * Combining ML and CompChem for predictive insights
117
-
118
- # **II. Computational Chemistry (CompChem)**
119
-
120
- # * Computational quantum chemistry (CQChem)
121
- # * Methods for generating data sets (e.g., wavefunction theory, correlated wavefunction methods, density functional theory)
122
- # * Representations of systems (e.g., simple, complex, ambiguous)
123
 
124
- # **III. Wavefunction Theory Methods**
125
 
126
- # * Nonrelativistic time-independent Schrödinger equation
127
- # * Electronic Schrödinger equation
128
- # * Hartree-Fock (HF) approach
129
- # * Correlated wavefunction methods (e.g., extended Hückel theory, neglect of diatomic differential overlap)
130
 
131
- # **IV. Density Functional Theory (DFT)**
132
 
133
- # * Kinetic energy (KE-) or orbital-free (OF-) DFT
134
- # * Exchange-correlation functional (EC)
135
- # * Kohn-Sham (KS-) DFT
136
- # * Semiempirical methods (e.g., extended Hückel theory, neglect of diatomic differential overlap)
 
 
137
 
138
- # **V. Semiempirical Methods**
139
 
140
- # * Extended Hückel theory
141
- # * Neglect of diatomic differential overlap
142
- # * Semiempirical bond-order potentials (BOPs)
143
- # * Semiempirical nuclear quantum effects (NQEs)
 
 
144
 
145
- # **VI. Response Properties**
146
 
147
- # * Nuclear forces (e.g., F = -Π)
148
- # * Hessian calculations (e.g., second derivative of energy with respect to nuclear positions)
149
- # * Energy conserving forces (e.g., dipole moments)
 
 
 
150
 
151
- # **VII. Applications of ML in CompChem**
152
 
153
- # * Predicting molecular and material properties
154
- # * Predicting chemical reactions and processes
155
- # * Predicting materials properties (e.g., conductivity, optical properties)
156
- # * Predicting drug design and development
157
 
158
- # **VIII. Future Directions**
159
 
160
- # * Developing more accurate ML models for CompChem
161
- # * Improving the transferability of ML models between different systems
162
- # * Using ML to accelerate and improve the discovery of new materials and compounds
163
- # '''
164
  # generate_mindmap_svg(md)
 
1
  from graphviz import Digraph
2
+ from cairosvg import svg2pdf
3
  import re
4
  import random
5
 
 
90
  # Assign a random color to each child node (no inheritance from parent)
91
  add_nodes_to_graph(graph, child, node_id, font_size=max(8, font_size - 1), parent_color=parent_color)
92
 
93
def generate_mindmap_pdf(svg_file):
    """Convert an SVG file on disk to a PDF file next to it.

    Args:
        svg_file: Path of the SVG file to convert.

    Returns:
        The path of the written PDF: ``svg_file`` with its extension replaced
        by ``.pdf``.
    """
    import os

    # Swap only the trailing extension. A plain ``str.replace`` would also
    # rewrite ".svg" occurring elsewhere in the path, and would leave the
    # path unchanged (overwriting the source) when there is no ".svg" at all.
    pdf_file = os.path.splitext(svg_file)[0] + ".pdf"
    # Close the input handle deterministically instead of leaking it.
    with open(svg_file, "rb") as svg_handle:
        svg2pdf(file_obj=svg_handle, write_to=pdf_file)
    return pdf_file
97
+
98
  def generate_mindmap_svg(md_text):
99
  mindmap_dict = parse_markdown_to_dict(md_text)
100
  root_title = mindmap_dict.get('title', 'Mindmap')
 
108
  # Replace %3 with the sanitized filename in the SVG content
109
  svg_content = svg_content.replace("%3", root_title)
110
  # Save the modified SVG content to a file
111
+ with open(output_filename, 'w') as f:
112
  f.write(svg_content)
113
+ return output_filename
114
 
115
def generate_mindmap(md_text):
    """Render mindmap markdown to an SVG file and derive a PDF from it.

    Args:
        md_text: Mindmap markdown passed on to ``generate_mindmap_svg``.

    Returns:
        A ``(svg_path, pdf_path)`` tuple: the value returned by
        ``generate_mindmap_svg`` and the result of feeding that value to
        ``generate_mindmap_pdf``.
    """
    svg_path = generate_mindmap_svg(md_text)
    # The PDF is produced from the SVG just written, so order matters here.
    return svg_path, generate_mindmap_pdf(svg_path)
119
 
120
  # md = '''
121
+ # # Nucleic Acids Research: Updates to SAbDab and SAbDab-nano
 
 
 
 
 
 
 
 
 
 
 
122
 
123
+ # ## Introduction
124
 
125
+ # - **Antibodies in the Age of Biotherapeutics**
126
+ # - Antibodies are fundamental components of the immune system
127
+ # - Represent the largest class of biotherapeutics
128
+ # - Ability to bind to antigen targets with high affinity and specificity makes them promising candidates for development of therapeutic antibodies against various targets, including cancer, virus, and other diseases.
129
 
130
+ # ## Updates to Data Annotation
131
 
132
+ # - **Search Interface**
133
+ # - SAbDab can now be searched via a Flask app served by a fast SQL backend
134
+ # - Structures can be searched based on experimental method, resolution, species, type of antigen, presence of affinity values, and presence of amino acid residues at specific sequence positions defined using the Chothia numbering scheme.
135
+ # - **Auxiliary Databases**
136
+ # - Thera-SAbDab and CoV-AbDab databases contain antibody sequence information linking to relevant entries in SAbDab
137
+ # - These databases contain antibody sequence information, linking to relevant entries in SAbDab where structures of the antibody in question (CoV-AbDab) or structures with at least 95% sequence identity (Thera-SAbDab) exist.
138
 
139
+ # ## Updates to Data Access
140
 
141
+ # - **Search Features**
142
+ # - New search features were added to improve the ability to create task-specific nanobody and antibody datasets
143
+ # - Free-text keyword query can be performed over certain annotation fields (antigen, species, publication, and structure title)
144
+ # - **Download Options**
145
+ # - Structures matching search queries can be downloaded in bulk as a zipped archive and a summary .csv file containing annotation data
146
+ # - Individual structures can be accessed via the structure viewer interface
147
 
148
+ # ## Updates to SAbDab-nano
149
 
150
+ # - **Development of SAbDab-nano**
151
+ # - SAbDab-nano is a subset of SAbDab containing nanobody structures
152
+ # - Entries for which at least one antibody has a heavy chain variable domain, but no light chain variable domain are added to SAbDab-nano
153
+ # - **Composition and Growth**
154
+ # - SAbDab-nano contains 823 structures (492 nanobodies) with non-redundant CDR sequences
155
+ # - Average 3.8 structures per week are added to SAbDab-nano over the first 36 weeks of 2021
156
 
157
+ # ## Comparison to Other Nanobody Resources
158
 
159
+ # - **Other Databases**
160
+ # - There are other databases compiling nanobody data, but no other resource provides nanobody structures in a continuously updated and comprehensively annotated format
161
+ # - Several databases compiling nanobody sequences (but not structures) from a variety of data sources
 
162
 
163
+ # ## Conclusion
164
 
165
+ # - **SAbDab and SAbDab-nano**
166
+ # - SAbDab continues to be updated weekly and represents the most thoroughly annotated antibody structure database from which researchers can quickly create custom datasets
167
+ # - Searching SAbDab is now more powerful and faster, with new connections to auxiliary databases that catalogue therapeutic and antigen-specific antibodies
168
+ # - SAbDab and SAbDab-nano can be accessed freely online under a CC-BY 4.0 license at opig.stats.ox.ac.uk/webapps/newsabdab/ or at opig.stats.ox.ac.uk/webapps/newsabdab/nano respectively'''
169
  # generate_mindmap_svg(md)