raannakasturi committed on
Commit
084b8b9
1 Parent(s): 048803a

Update generate_markdown.py

Browse files
Files changed (1) hide show
  1. generate_markdown.py +74 -74
generate_markdown.py CHANGED
@@ -1,75 +1,75 @@
1
- from langchain.text_splitter import RecursiveCharacterTextSplitter
2
- from langchain_community.document_loaders import PyPDFLoader
3
- from llama_cpp import Llama
4
-
5
def load_llm_model():
    """Create the llama.cpp model used for mind-map generation.

    Returns:
        The constructed ``Llama`` instance.

    Raises:
        Exception: whatever ``Llama(...)`` raised; the error is printed
        first and then re-raised unchanged.
    """
    # All constructor settings are kept together so they are easy to audit.
    # NOTE(review): n_gpu_layers=-1 presumably asks llama-cpp-python to
    # offload every layer to the GPU — confirm against the library docs.
    settings = {
        "model_path": "Llama-3.2-1B-Instruct-Q8_0.gguf",
        "n_gpu_layers": -1,
        "n_ctx": 100000,
        "n_batch": 4096,
    }
    try:
        model = Llama(**settings)
        print("LLM model loaded successfully")
        return model
    except Exception as err:
        # Report the failure, then let the caller decide how to handle it.
        print(f"Error loading LLM model: {err}")
        raise
18
-
19
def get_text_from_pdf(file):
    """Extract the body text of a PDF, stopping before the references.

    The PDF is loaded with ``PyPDFLoader`` and re-split with a
    ``RecursiveCharacterTextSplitter`` (chunk_size=250, chunk_overlap=50).
    Chunks are concatenated in order until one starts with ``"REFERENCES"``.

    Args:
        file: Path of the PDF, passed straight to ``PyPDFLoader``.

    Returns:
        The concatenated chunk text, truncated to the first 10000 characters.
    """
    loader = PyPDFLoader(file)
    pages = loader.load_and_split()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
    chunks = text_splitter.split_documents(pages)

    # NOTE(review): with chunk_overlap=50 neighbouring chunks may share text,
    # so the joined result can contain repeated passages — confirm intended.
    kept = []
    for chunk in chunks:
        if chunk.page_content.startswith("REFERENCES"):
            break
        kept.append(chunk.page_content)

    # The previous version followed this with a second loop over the joined
    # string. Iterating a str yields single characters, which can never
    # start with "REFERENCES", so that loop only re-copied the text one
    # character at a time. It is dropped here; the result is identical.
    return "".join(kept)[:10000]
37
-
38
def generate_prompt(research_paper):
    """Build the user prompt asking the model for a markdown mind map.

    Args:
        research_paper: Text of the paper; interpolated into the prompt as
            the topic.

    Returns:
        The prompt string, including the heading/bullet template the model
        is told to follow.
    """
    # NOTE(review): the doubled backslashes below produce a literal backslash
    # followed by a quote or the letter n in the prompt, not real quotes or
    # line breaks — confirm this is what the model is meant to see.
    return f'''
    As a text script expert, please help me to write a short text script with the topic \\"{research_paper}\\".Your output should only and strictly use the following template:\\n# {{Title}}\\n## {{Subtitle01}}\\n- {{Emoji01}} Bulletpoint01\\n- {{Emoji02}} Bulletpoint02\\n## {{Subtitle02}}\\n- {{Emoji03}} Bulletpoint03\\n- {{Emoji04}} Bulletpoint04\\n\\nSummarize the giving topic to generate a mind map (as many subtitles as possible, with a minimum of three subtitles) structure markdown.\\n Do not include anything in the response, that is not the part of mindmap.\\n Importantly your output must use language \\"English\\""
    '''
43
-
44
def generate_mindmap_structure(llm, prompt):
    """Send a prompt to the chat model and return the reply text.

    Args:
        llm: Object exposing ``create_chat_completion`` (the loaded model).
        prompt: User message content.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    system_message = {
        'role': 'system',
        'content': 'You are a helpful research assistant for generating well-formatted mindmaps in MarkDown format from scientific research papers.',
    }
    user_message = {'role': 'user', 'content': prompt}

    completion = llm.create_chat_completion(
        messages=[system_message, user_message],
        temperature=0.7,
        top_k=200,
        # NOTE(review): top_p is normally a probability in (0, 1]; 3.0 looks
        # out of range — confirm how the backend treats it.
        top_p=3.0,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']
58
-
59
def generate_markdown(llm, file):
    """Produce mind-map markdown for a PDF research paper.

    Args:
        llm: Model object forwarded to ``generate_mindmap_structure``.
        file: PDF path forwarded to ``get_text_from_pdf``.

    Returns:
        The model's markdown. When it contains ``**``, every ``"- **"`` is
        first turned into ``"### "`` and the remaining ``**`` are removed.
    """
    paper_text = get_text_from_pdf(file)
    mindmap_markdown = generate_mindmap_structure(llm, generate_prompt(paper_text))

    # Nothing to clean up when the model used no bold markers.
    if "**" not in mindmap_markdown:
        return mindmap_markdown

    # Bold bullets become third-level headings; leftover markers are dropped.
    return mindmap_markdown.replace("- **", "### ").replace("**", "")
69
-
70
def sanitize_markdown(llm, mindmap_markdown):
    """Ask the model to reformat mind-map text into the expected template.

    Args:
        llm: Model object forwarded to ``generate_mindmap_structure``.
        mindmap_markdown: Text to be reformatted; interpolated into the
            prompt.

    Returns:
        The model's reply to the reformatting prompt.
    """
    cleanup_request = f'''
    As an experienced coder and programmer, help me convert the text \\"{mindmap_markdown}\\" into a well-formatted markdown. Your output should only and strictly use the following template:\\n# {{Title}}\\n## {{Subtitle01}}\\n- {{Emoji01}} Bulletpoint01\\n- {{Emoji02}} Bulletpoint02\\n## {{Subtitle02}}\\n- {{Emoji03}} Bulletpoint03\\n- {{Emoji04}} Bulletpoint04\\n\\nDo not include anything in the response, that is not the part of mindmap."
    '''
    return generate_mindmap_structure(llm, cleanup_request)
 
1
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+ from langchain_community.document_loaders import PyPDFLoader
3
+ from llama_cpp import Llama
4
+
5
def load_llm_model():
    """Create the llama.cpp model used for mind-map generation.

    Returns:
        The constructed ``Llama`` instance.

    Raises:
        Exception: whatever ``Llama(...)`` raised; the error is printed
        first and then re-raised unchanged.
    """
    # All constructor settings are kept together so they are easy to audit.
    # NOTE(review): n_gpu_layers=-1 presumably asks llama-cpp-python to
    # offload every layer to the GPU — confirm against the library docs.
    settings = {
        "model_path": "/home/user/app/Llama-3.2-1B-Instruct-Q8_0.gguf",
        "n_gpu_layers": -1,
        "n_ctx": 100000,
        "n_batch": 4096,
    }
    try:
        model = Llama(**settings)
        print("LLM model loaded successfully")
        return model
    except Exception as err:
        # Report the failure, then let the caller decide how to handle it.
        print(f"Error loading LLM model: {err}")
        raise
18
+
19
def get_text_from_pdf(file):
    """Extract the body text of a PDF, stopping before the references.

    The PDF is loaded with ``PyPDFLoader`` and re-split with a
    ``RecursiveCharacterTextSplitter`` (chunk_size=250, chunk_overlap=50).
    Chunks are concatenated in order until one starts with ``"REFERENCES"``.

    Args:
        file: Path of the PDF, passed straight to ``PyPDFLoader``.

    Returns:
        The concatenated chunk text, truncated to the first 10000 characters.
    """
    loader = PyPDFLoader(file)
    pages = loader.load_and_split()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
    chunks = text_splitter.split_documents(pages)

    # NOTE(review): with chunk_overlap=50 neighbouring chunks may share text,
    # so the joined result can contain repeated passages — confirm intended.
    kept = []
    for chunk in chunks:
        if chunk.page_content.startswith("REFERENCES"):
            break
        kept.append(chunk.page_content)

    # The previous version followed this with a second loop over the joined
    # string. Iterating a str yields single characters, which can never
    # start with "REFERENCES", so that loop only re-copied the text one
    # character at a time. It is dropped here; the result is identical.
    return "".join(kept)[:10000]
37
+
38
def generate_prompt(research_paper):
    """Build the user prompt asking the model for a markdown mind map.

    Args:
        research_paper: Text of the paper; interpolated into the prompt as
            the topic.

    Returns:
        The prompt string, including the heading/bullet template the model
        is told to follow.
    """
    # NOTE(review): the doubled backslashes below produce a literal backslash
    # followed by a quote or the letter n in the prompt, not real quotes or
    # line breaks — confirm this is what the model is meant to see.
    return f'''
    As a text script expert, please help me to write a short text script with the topic \\"{research_paper}\\".Your output should only and strictly use the following template:\\n# {{Title}}\\n## {{Subtitle01}}\\n- {{Emoji01}} Bulletpoint01\\n- {{Emoji02}} Bulletpoint02\\n## {{Subtitle02}}\\n- {{Emoji03}} Bulletpoint03\\n- {{Emoji04}} Bulletpoint04\\n\\nSummarize the giving topic to generate a mind map (as many subtitles as possible, with a minimum of three subtitles) structure markdown.\\n Do not include anything in the response, that is not the part of mindmap.\\n Importantly your output must use language \\"English\\""
    '''
43
+
44
def generate_mindmap_structure(llm, prompt):
    """Send a prompt to the chat model and return the reply text.

    Args:
        llm: Object exposing ``create_chat_completion`` (the loaded model).
        prompt: User message content.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    system_message = {
        'role': 'system',
        'content': 'You are a helpful research assistant for generating well-formatted mindmaps in MarkDown format from scientific research papers.',
    }
    user_message = {'role': 'user', 'content': prompt}

    completion = llm.create_chat_completion(
        messages=[system_message, user_message],
        temperature=0.7,
        top_k=200,
        # NOTE(review): top_p is normally a probability in (0, 1]; 3.0 looks
        # out of range — confirm how the backend treats it.
        top_p=3.0,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']
58
+
59
def generate_markdown(llm, file):
    """Produce mind-map markdown for a PDF research paper.

    Args:
        llm: Model object forwarded to ``generate_mindmap_structure``.
        file: PDF path forwarded to ``get_text_from_pdf``.

    Returns:
        The model's markdown. When it contains ``**``, every ``"- **"`` is
        first turned into ``"### "`` and the remaining ``**`` are removed.
    """
    paper_text = get_text_from_pdf(file)
    mindmap_markdown = generate_mindmap_structure(llm, generate_prompt(paper_text))

    # Nothing to clean up when the model used no bold markers.
    if "**" not in mindmap_markdown:
        return mindmap_markdown

    # Bold bullets become third-level headings; leftover markers are dropped.
    return mindmap_markdown.replace("- **", "### ").replace("**", "")
69
+
70
def sanitize_markdown(llm, mindmap_markdown):
    """Ask the model to reformat mind-map text into the expected template.

    Args:
        llm: Model object forwarded to ``generate_mindmap_structure``.
        mindmap_markdown: Text to be reformatted; interpolated into the
            prompt.

    Returns:
        The model's reply to the reformatting prompt.
    """
    cleanup_request = f'''
    As an experienced coder and programmer, help me convert the text \\"{mindmap_markdown}\\" into a well-formatted markdown. Your output should only and strictly use the following template:\\n# {{Title}}\\n## {{Subtitle01}}\\n- {{Emoji01}} Bulletpoint01\\n- {{Emoji02}} Bulletpoint02\\n## {{Subtitle02}}\\n- {{Emoji03}} Bulletpoint03\\n- {{Emoji04}} Bulletpoint04\\n\\nDo not include anything in the response, that is not the part of mindmap."
    '''
    return generate_mindmap_structure(llm, cleanup_request)