raannakasturi committed on
Commit
af1f176
·
verified ·
1 Parent(s): 1641226

Update generate_markdown.py

Browse files
Files changed (1) hide show
  1. generate_markdown.py +95 -95
generate_markdown.py CHANGED
@@ -1,95 +1,95 @@
1
- from langchain.text_splitter import RecursiveCharacterTextSplitter
2
- from langchain_community.document_loaders import PyPDFLoader
3
- from llama_cpp import Llama
4
- import llama_cpp
5
-
6
def load_llm_model(
    model_path="Llama-3.2-3B-Instruct-Q8_0.gguf",
    *,
    n_ctx=200000,
    n_threads=16,
    n_batch=512,
    n_gpu_layers=0,
    main_gpu=0,
):
    """Load the GGUF model used for mindmap generation.

    Called with no arguments this uses the same model file, context size,
    thread count and batch size that were previously hard-coded.

    Args:
        model_path: Path to the GGUF model file.
        n_ctx: Context window size in tokens requested from llama.cpp.
        n_threads: Number of CPU threads used for generation.
        n_batch: Prompt-processing batch size.
        n_gpu_layers: Number of layers to offload to the GPU. The default 0
            keeps inference on the CPU; pass e.g. 20 to enable offloading.
        main_gpu: Index of the GPU to use when offloading is enabled.

    Returns:
        The constructed ``llama_cpp.Llama`` instance.
    """
    # NOTE(review): 200000 tokens is larger than the 128K context window
    # advertised for Llama 3.2 -- confirm the model and the available memory
    # can actually serve a window this large.
    # NOTE(review): LLAMA_POOLING_TYPE_RANK is normally associated with
    # reranking/embedding models -- confirm it is wanted for chat generation.
    return Llama(
        model_path=model_path,
        n_gpu_layers=n_gpu_layers,
        n_ctx=n_ctx,
        n_threads=n_threads,
        n_batch=n_batch,
        split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
        pooling_type=llama_cpp.LLAMA_POOLING_TYPE_RANK,
        rope_scaling_type=llama_cpp.LLAMA_ROPE_SCALING_TYPE_LINEAR,
        main_gpu=main_gpu,
    )
19
-
20
def get_text_from_pdf(file):
    """Extract a research paper's body text from a PDF, without its references.

    The PDF is read page by page, the page texts are joined with newlines,
    and everything from the first line that starts with ``REFERENCES``
    onward is discarded.

    Side effects:
        Writes ``final_text.txt`` (the full extracted text) and
        ``research_paper.txt`` (the text with references removed) to the
        current working directory, and prints the length of each.

    Args:
        file: Path to the PDF, passed to ``PyPDFLoader``.

    Returns:
        The text preceding the references section, with every original line
        followed by a single space (so the result is one long line).
    """
    # Pages are joined directly instead of being chunked first: splitting
    # with an overlap and then concatenating the chunks repeats the
    # overlapping text at every chunk boundary, and joining chunks without a
    # separator can glue "REFERENCES" onto the previous line so the
    # startswith() check below never fires.
    pages = PyPDFLoader(file).load()
    final_text = "\n".join(page.page_content for page in pages)
    print(f"Length of final text: {len(final_text)}")
    # Explicit UTF-8: PDF text routinely contains characters that the
    # platform default encoding (e.g. cp1252 on Windows) cannot represent.
    with open("final_text.txt", "w", encoding="utf-8") as f:
        f.write(final_text)

    body_lines = []
    for line in final_text.split("\n"):
        if line.startswith("REFERENCES"):
            break
        body_lines.append(line)
    research_paper = "".join(f"{line} " for line in body_lines)

    with open("research_paper.txt", "w", encoding="utf-8") as f:
        f.write(research_paper)
    print(f"Length of research paper: {len(research_paper)}")
    return research_paper
41
-
42
def generate_prompt(final_text):
    """Build the LLM prompt that asks for a markdown mindmap of a paper.

    Args:
        final_text: Plain text of the research paper to summarize.

    Returns:
        The prompt string: the task instructions, a nested markdown template
        showing the expected heading/bullet hierarchy, and the paper text
        embedded in double quotes.
    """
    # Doubled braces ({{...}}) are f-string escapes: they reach the model as
    # literal {Title}-style placeholders. Only {final_text} is interpolated.
    # Bullet indentation (two spaces per level) follows the placeholder
    # numbering, so 01 -> 01.1 -> 01.1.1 are visibly nested for the model.
    prompt = f'''
You have been provided with a research paper in text format. Your task is to generate a mindmap structure in markdown format that summarizes the research paper.
Your output should use the language "en" 0.3 times the length of the original research paper. Do not include anything in the response, that is not the part of mindmap and use the following template (any node in the mindmap should not exceed 10-12 words, also generate additional headings that aren't present in document if required for elaborative explanation):
# {{Title}} (should be the title of the research paper)
## {{Subtitle01}} (as required and as many as required in markdown format)
- {{Emoji01}} Bulletpoint01 (as required and as many as required in markdown format)
  - {{Emoji01.1}} Bulletpoint01.1 (as required and as many as sub levels required in markdown format)
    - {{Emoji01.1.1}} Bulletpoint01.1.1 (as required and as many as sub levels required in markdown format)
    - {{Emoji01.1.2}} Bulletpoint01.1.2 (as required and as many as sub levels required in markdown format)
  - {{Emoji01.2}} Bulletpoint01.2 (as required and as many as sub levels required in markdown format)
- {{Emoji02}} Bulletpoint02 (as required and as many as required in markdown format)
  - {{Emoji02.1}} Bulletpoint02.1 (as required and as many as sub levels required in markdown format)
  - {{Emoji02.2}} Bulletpoint02.2 (as required and as many as sub levels required in markdown format)
    - {{Emoji02.2.1}} Bulletpoint02.2.1 (as required and as many as sub levels required in markdown format)
    - {{Emoji02.2.2}} Bulletpoint02.2.2 (as required and as many as sub levels required in markdown format)
    - {{Emoji02.2.3}} Bulletpoint02.2.3 (as required and as many as sub levels required in markdown format)
    - {{Emoji02.2.4}} Bulletpoint02.2.4 (as required and as many as sub levels required in markdown format)
## {{Subtitle02}} (as required and as many as required in markdown format)
- {{Emoji03}} Bulletpoint03 (as required and as many as required in markdown format)
  - {{Emoji03.1}} Bulletpoint03.1 (as required and as many as sub levels required in markdown format)
  - {{Emoji03.2}} Bulletpoint03.2 (as required and as many as sub levels required in markdown format)
    - {{Emoji03.2.1}} Bulletpoint03.2.1 (as required and as many as sub levels required in markdown format)
    - {{Emoji03.2.2}} Bulletpoint03.2.2 (as required and as many as sub levels required in markdown format)
- {{Emoji04}} Bulletpoint04 (as required and as many as required in markdown format)
  - {{Emoji04.1}} Bulletpoint04.1 (as required and as many as sub levels required in markdown format)
    - {{Emoji04.1.1}} Bulletpoint04.1.1 (as required and as many as sub levels required in markdown format)
    - {{Emoji04.1.2}} Bulletpoint04.1.2 (as required and as many as sub levels required in markdown format)
  - {{Emoji04.2}} Bulletpoint04.2 (as required and as many as sub levels required in markdown format)
    - {{Emoji04.2.1}} Bulletpoint04.2.1 (as required and as many as sub levels required in markdown format)
    - {{Emoji04.2.2}} Bulletpoint04.2.2 (as required and as many as sub levels required in markdown format)
Summarize the text "{final_text}" to generate an elaborated hierarchical mindmap structure (any node in the mindmap should not exceed 10-12 words, also generate additional headings that aren't present in document if required for elaborative explanation) markdown using the "en" language 0.3 times the length of the original research paper. Do not include anything in the response, that is not the part of mindmap
'''
    return prompt
76
-
77
def generate_mindmap_structure(llm, prompt, *, temperature=0.7, top_k=200, top_p=1.0):
    """Ask the model for the mindmap markdown and return its reply text.

    Args:
        llm: Object exposing ``create_chat_completion`` (a ``llama_cpp.Llama``
            instance in this project).
        prompt: Full prompt text, sent as a single user message.
        temperature: Sampling temperature.
        top_k: Number of highest-probability tokens kept at each step.
        top_p: Nucleus-sampling probability mass; must lie in (0, 1].

    Returns:
        The ``content`` string of the first choice in the completion.

    Raises:
        ValueError: If ``top_p`` is outside (0, 1].
    """
    # The previous hard-coded top_p=3.0 is not a valid probability mass.
    # 1.0 is the valid setting that keeps every token, which is presumably
    # what an out-of-range value amounted to -- TODO confirm against the
    # llama.cpp sampler.
    if not 0.0 < top_p <= 1.0:
        raise ValueError(f"top_p must be in (0, 1], got {top_p!r}")

    response = llm.create_chat_completion(
        messages=[{'role': 'user', 'content': prompt}],
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )
    mindmap_data = response['choices'][0]['message']['content']
    # Echo the raw model output so a run can be inspected from the console.
    print(mindmap_data)
    return mindmap_data
90
-
91
def generate_markdown(llm, file):
    """Run the PDF-to-mindmap pipeline for one file.

    Extracts the paper text from ``file``, builds the mindmap prompt from
    it, and returns the markdown produced by ``llm`` for that prompt.
    """
    paper_text = get_text_from_pdf(file)
    return generate_mindmap_structure(llm, generate_prompt(paper_text))
 
1
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+ from langchain_community.document_loaders import PyPDFLoader
3
+ from llama_cpp import Llama
4
+ import llama_cpp
5
+
6
def load_llm_model(
    model_path="model.gguf",
    *,
    n_ctx=200000,
    n_threads=16,
    n_batch=512,
    n_gpu_layers=0,
    main_gpu=0,
):
    """Load the GGUF model used for mindmap generation.

    Called with no arguments this uses the same model file, context size,
    thread count and batch size that were previously hard-coded.

    Args:
        model_path: Path to the GGUF model file.
        n_ctx: Context window size in tokens requested from llama.cpp.
        n_threads: Number of CPU threads used for generation.
        n_batch: Prompt-processing batch size.
        n_gpu_layers: Number of layers to offload to the GPU. The default 0
            keeps inference on the CPU; pass e.g. 20 to enable offloading.
        main_gpu: Index of the GPU to use when offloading is enabled.

    Returns:
        The constructed ``llama_cpp.Llama`` instance.
    """
    # NOTE(review): a 200000-token window is very large -- confirm that the
    # model behind `model_path` supports it and that memory allows it.
    # NOTE(review): LLAMA_POOLING_TYPE_RANK is normally associated with
    # reranking/embedding models -- confirm it is wanted for chat generation.
    return Llama(
        model_path=model_path,
        n_gpu_layers=n_gpu_layers,
        n_ctx=n_ctx,
        n_threads=n_threads,
        n_batch=n_batch,
        split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
        pooling_type=llama_cpp.LLAMA_POOLING_TYPE_RANK,
        rope_scaling_type=llama_cpp.LLAMA_ROPE_SCALING_TYPE_LINEAR,
        main_gpu=main_gpu,
    )
19
+
20
def get_text_from_pdf(file):
    """Extract a research paper's body text from a PDF, without its references.

    The PDF is read page by page, the page texts are joined with newlines,
    and everything from the first line that starts with ``REFERENCES``
    onward is discarded.

    Side effects:
        Writes ``final_text.txt`` (the full extracted text) and
        ``research_paper.txt`` (the text with references removed) to the
        current working directory, and prints the length of each.

    Args:
        file: Path to the PDF, passed to ``PyPDFLoader``.

    Returns:
        The text preceding the references section, with every original line
        followed by a single space (so the result is one long line).
    """
    # Pages are joined directly instead of being chunked first: splitting
    # with an overlap and then concatenating the chunks repeats the
    # overlapping text at every chunk boundary, and joining chunks without a
    # separator can glue "REFERENCES" onto the previous line so the
    # startswith() check below never fires.
    pages = PyPDFLoader(file).load()
    final_text = "\n".join(page.page_content for page in pages)
    print(f"Length of final text: {len(final_text)}")
    # Explicit UTF-8: PDF text routinely contains characters that the
    # platform default encoding (e.g. cp1252 on Windows) cannot represent.
    with open("final_text.txt", "w", encoding="utf-8") as f:
        f.write(final_text)

    body_lines = []
    for line in final_text.split("\n"):
        if line.startswith("REFERENCES"):
            break
        body_lines.append(line)
    research_paper = "".join(f"{line} " for line in body_lines)

    with open("research_paper.txt", "w", encoding="utf-8") as f:
        f.write(research_paper)
    print(f"Length of research paper: {len(research_paper)}")
    return research_paper
41
+
42
def generate_prompt(final_text):
    """Build the LLM prompt that asks for a markdown mindmap of a paper.

    Args:
        final_text: Plain text of the research paper to summarize.

    Returns:
        The prompt string: the task instructions, a nested markdown template
        showing the expected heading/bullet hierarchy, and the paper text
        embedded in double quotes.
    """
    # Doubled braces ({{...}}) are f-string escapes: they reach the model as
    # literal {Title}-style placeholders. Only {final_text} is interpolated.
    # Bullet indentation (two spaces per level) follows the placeholder
    # numbering, so 01 -> 01.1 -> 01.1.1 are visibly nested for the model.
    prompt = f'''
You have been provided with a research paper in text format. Your task is to generate a mindmap structure in markdown format that summarizes the research paper.
Your output should use the language "en" 0.3 times the length of the original research paper. Do not include anything in the response, that is not the part of mindmap and use the following template (any node in the mindmap should not exceed 10-12 words, also generate additional headings that aren't present in document if required for elaborative explanation):
# {{Title}} (should be the title of the research paper)
## {{Subtitle01}} (as required and as many as required in markdown format)
- {{Emoji01}} Bulletpoint01 (as required and as many as required in markdown format)
  - {{Emoji01.1}} Bulletpoint01.1 (as required and as many as sub levels required in markdown format)
    - {{Emoji01.1.1}} Bulletpoint01.1.1 (as required and as many as sub levels required in markdown format)
    - {{Emoji01.1.2}} Bulletpoint01.1.2 (as required and as many as sub levels required in markdown format)
  - {{Emoji01.2}} Bulletpoint01.2 (as required and as many as sub levels required in markdown format)
- {{Emoji02}} Bulletpoint02 (as required and as many as required in markdown format)
  - {{Emoji02.1}} Bulletpoint02.1 (as required and as many as sub levels required in markdown format)
  - {{Emoji02.2}} Bulletpoint02.2 (as required and as many as sub levels required in markdown format)
    - {{Emoji02.2.1}} Bulletpoint02.2.1 (as required and as many as sub levels required in markdown format)
    - {{Emoji02.2.2}} Bulletpoint02.2.2 (as required and as many as sub levels required in markdown format)
    - {{Emoji02.2.3}} Bulletpoint02.2.3 (as required and as many as sub levels required in markdown format)
    - {{Emoji02.2.4}} Bulletpoint02.2.4 (as required and as many as sub levels required in markdown format)
## {{Subtitle02}} (as required and as many as required in markdown format)
- {{Emoji03}} Bulletpoint03 (as required and as many as required in markdown format)
  - {{Emoji03.1}} Bulletpoint03.1 (as required and as many as sub levels required in markdown format)
  - {{Emoji03.2}} Bulletpoint03.2 (as required and as many as sub levels required in markdown format)
    - {{Emoji03.2.1}} Bulletpoint03.2.1 (as required and as many as sub levels required in markdown format)
    - {{Emoji03.2.2}} Bulletpoint03.2.2 (as required and as many as sub levels required in markdown format)
- {{Emoji04}} Bulletpoint04 (as required and as many as required in markdown format)
  - {{Emoji04.1}} Bulletpoint04.1 (as required and as many as sub levels required in markdown format)
    - {{Emoji04.1.1}} Bulletpoint04.1.1 (as required and as many as sub levels required in markdown format)
    - {{Emoji04.1.2}} Bulletpoint04.1.2 (as required and as many as sub levels required in markdown format)
  - {{Emoji04.2}} Bulletpoint04.2 (as required and as many as sub levels required in markdown format)
    - {{Emoji04.2.1}} Bulletpoint04.2.1 (as required and as many as sub levels required in markdown format)
    - {{Emoji04.2.2}} Bulletpoint04.2.2 (as required and as many as sub levels required in markdown format)
Summarize the text "{final_text}" to generate an elaborated hierarchical mindmap structure (any node in the mindmap should not exceed 10-12 words, also generate additional headings that aren't present in document if required for elaborative explanation) markdown using the "en" language 0.3 times the length of the original research paper. Do not include anything in the response, that is not the part of mindmap
'''
    return prompt
76
+
77
def generate_mindmap_structure(llm, prompt, *, temperature=0.7, top_k=200, top_p=1.0):
    """Ask the model for the mindmap markdown and return its reply text.

    Args:
        llm: Object exposing ``create_chat_completion`` (a ``llama_cpp.Llama``
            instance in this project).
        prompt: Full prompt text, sent as a single user message.
        temperature: Sampling temperature.
        top_k: Number of highest-probability tokens kept at each step.
        top_p: Nucleus-sampling probability mass; must lie in (0, 1].

    Returns:
        The ``content`` string of the first choice in the completion.

    Raises:
        ValueError: If ``top_p`` is outside (0, 1].
    """
    # The previous hard-coded top_p=3.0 is not a valid probability mass.
    # 1.0 is the valid setting that keeps every token, which is presumably
    # what an out-of-range value amounted to -- TODO confirm against the
    # llama.cpp sampler.
    if not 0.0 < top_p <= 1.0:
        raise ValueError(f"top_p must be in (0, 1], got {top_p!r}")

    response = llm.create_chat_completion(
        messages=[{'role': 'user', 'content': prompt}],
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )
    mindmap_data = response['choices'][0]['message']['content']
    # Echo the raw model output so a run can be inspected from the console.
    print(mindmap_data)
    return mindmap_data
90
+
91
def generate_markdown(llm, file):
    """Run the PDF-to-mindmap pipeline for one file.

    Extracts the paper text from ``file``, builds the mindmap prompt from
    it, and returns the markdown produced by ``llm`` for that prompt.
    """
    paper_text = get_text_from_pdf(file)
    return generate_mindmap_structure(llm, generate_prompt(paper_text))