Eric Botti committed on
Commit
8aa24e3
·
1 Parent(s): ea15890

timestamps and markdown formatting

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. README.md +6 -2
  3. main.py +54 -22
  4. setup.py +2 -2
.gitignore CHANGED
@@ -1,4 +1,5 @@
1
  venv
2
  transcript.txt
3
  notes.txt
 
4
  config.ini
 
1
  venv
2
  transcript.txt
3
  notes.txt
4
+ notes.md
5
  config.ini
README.md CHANGED
@@ -1,8 +1,12 @@
1
- # Transcript Notetaker
2
 
3
  ## Description
4
 
5
- A python script designed to create relevant notes from a transcript of a meeting.
 
 
 
 
6
 
7
  You will need an OpenAI API key to use this project.
8
 
 
1
+ # Google Meet Transcript AI Notes
2
 
3
  ## Description
4
 
5
+ A Python script designed to create relevant notes from a transcript of a Google Meet meeting. Currently, with the proper
6
+ options configured, Google Meet will automatically create an AI transcript of your meetings which is saved to Google Drive.
7
+ Often it is more useful to see just the notes from a meeting rather than the full transcript. This script uses OpenAI
8
+ prompts to create a detailed summary of the meeting from the transcript, as if it were taking notes in real time during
9
+ the meeting.
10
 
11
  You will need an OpenAI API key to use this project.
12
 
main.py CHANGED
@@ -2,10 +2,10 @@
2
  import configparser
3
  import os
4
  import time
 
5
  # 3rd party
6
  from langchain.llms import OpenAI
7
  from langchain import LLMChain
8
- from langchain.document_loaders import UnstructuredFileLoader
9
  from langchain.text_splitter import RecursiveCharacterTextSplitter
10
  from langchain import PromptTemplate
11
 
@@ -13,19 +13,18 @@ from langchain import PromptTemplate
13
  config = configparser.ConfigParser()
14
  config.read('config.ini')
15
 
16
- def summarize_chunks(chunks):
17
- number_of_chunks = len(chunks)
18
- print(f"Summarizing: {number_of_chunks} chunks")
19
- chunk_summaries = []
20
- start_time = time.time()
21
- for i, chunk in enumerate(chunks, 1):
22
- chunk_summaries.append(chain.run(chunk))
23
- # info
24
- elapsed_time = time.time() - start_time
25
- minutes = elapsed_time // 60
26
- print(f"Completed Summary {i}/{number_of_chunks}, {minutes:.0f} minutes {elapsed_time - 60 * minutes:.2f} seconds elapsed")
27
 
28
- return chunk_summaries
 
 
 
 
 
 
 
 
 
 
29
 
30
 
31
  if __name__ == '__main__':
@@ -37,13 +36,16 @@ if __name__ == '__main__':
37
 
38
  llm = OpenAI(temperature=0)
39
 
40
- loader = UnstructuredFileLoader(transcript_filepath)
41
- transcript = loader.load()
42
 
43
- # Split the text into smaller chunks that can be processed by the AI
44
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
45
- chunks = text_splitter.split_documents(transcript)
46
 
 
 
 
 
47
  prompt = PromptTemplate(
48
  template="Write a concise summary of the following: {transcript}",
49
  input_variables=['transcript']
@@ -55,9 +57,39 @@ if __name__ == '__main__':
55
  verbose=False
56
  )
57
 
58
- summaries = summarize_chunks(chunks)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
- meeting_notes = ''.join([summary for summary in summaries])
 
 
 
 
 
 
 
 
 
 
61
 
62
- with open(notes_filepath, 'w') as f:
63
- f.write(meeting_notes)
 
2
  import configparser
3
  import os
4
  import time
5
+ import re
6
  # 3rd party
7
  from langchain.llms import OpenAI
8
  from langchain import LLMChain
 
9
  from langchain.text_splitter import RecursiveCharacterTextSplitter
10
  from langchain import PromptTemplate
11
 
 
13
  config = configparser.ConfigParser()
14
  config.read('config.ini')
15
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
+ def load_transcript(path: str):
18
+ # Google Meet Transcripts have a header which we don't want to be summarized
19
+ header_lines = 5
20
+
21
+ with open(path, 'r') as input_file:
22
+ file_text = input_file.readlines()
23
+
24
+ head = file_text[:header_lines]
25
+ transcript = "".join(file_text[header_lines:])
26
+
27
+ return head, transcript
28
 
29
 
30
  if __name__ == '__main__':
 
36
 
37
  llm = OpenAI(temperature=0)
38
 
39
+ head, transcript = load_transcript(transcript_filepath)
 
40
 
41
+ # split the transcript on the 5-min timestamps
42
+ regex_pattern = r"[0-9]{2}:[0-9]{2}:[0-9]{2}"
43
+ five_min_chunks = re.split(regex_pattern, transcript)
44
 
45
+ # create a textsplitter to subdivide those chunks into appropriately sized chunks.
46
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
47
+
48
+ # prompt
49
  prompt = PromptTemplate(
50
  template="Write a concise summary of the following: {transcript}",
51
  input_variables=['transcript']
 
57
  verbose=False
58
  )
59
 
60
+ # list the meeting time and the chunks associated with it
61
+ timestamped_summaries = []
62
+
63
+ print(f"Summarizing {len(five_min_chunks)*5} minute meeting")
64
+ start_time = time.time()
65
+ # summarize the transcript one five-minute chunk at a time
66
+ for i, five_minutes_chunk in enumerate(five_min_chunks):
67
+ timestamp = time.strftime('%H:%M:%S', time.gmtime(60 * 5 * i))
68
+ sub_chunks = text_splitter.split_text(five_minutes_chunk)
69
+
70
+ summaries = []
71
+ for j, chunk in enumerate(sub_chunks):
72
+ summaries.append(chain.run(chunk))
73
+ print(f"{timestamp}: Chunk {j}/{len(sub_chunks)}")
74
+
75
+ timestamped_summaries.append((timestamp, summaries))
76
+
77
+ elapsed_time = time.time() - start_time
78
+ minutes = elapsed_time // 60
79
+ print(f"Summarized first {5 * (i+1)} minutes of meeting, {minutes:.0f} minutes {elapsed_time - 60 * minutes:.2f} seconds elapsed")
80
+
81
+ first_line = re.split(r"[()]", head[0])
82
 
83
+ # Write summaries to file
84
+ with open(notes_filepath, 'w+') as f:
85
+ f.write(f"# {first_line[0]}\n")
86
+ f.write(f"{first_line[1]}\n")
87
+ f.write("## Attendees\n")
88
+ f.write(f"{head[2]}\n")
89
+ f.write('## Meeting Notes\n')
90
+ for timestamp, summaries in timestamped_summaries:
91
+ f.write(f"### {timestamp}\n")
92
+ for summary in summaries:
93
+ f.write(f"- {summary.strip()}\n")
94
 
95
+ print(f"Export to file {notes_filepath} completed")
 
setup.py CHANGED
@@ -19,8 +19,8 @@ config['REQUIRED'] = {
19
  # Optional
20
  config['OPTIONAL'] = {
21
  'transcript-filepath': 'transcript.txt',
22
- 'notes-filepath': 'notes.txt'
23
  }
24
 
25
  with open('config.ini', 'w') as configfile:
26
- config.write(configfile)
 
19
  # Optional
20
  config['OPTIONAL'] = {
21
  'transcript-filepath': 'transcript.txt',
22
+ 'notes-filepath': 'notes.md'
23
  }
24
 
25
  with open('config.ini', 'w') as configfile:
26
+ config.write(configfile)