Spaces:

ericbotti
/

transcript-notetaker

Runtime error

App Files Files Community

Eric Botti committed on Jul 5, 2023

Commit

8aa24e3

1 Parent(s): ea15890

timestamps and markdown formatting

Browse files

Files changed (4) hide show

.gitignore +1 -0
README.md +6 -2
main.py +54 -22
setup.py +2 -2

.gitignore CHANGED Viewed

@@ -1,4 +1,5 @@
 venv
 transcript.txt
 notes.txt
 config.ini

 venv
 transcript.txt
 notes.txt
+notes.md
 config.ini

README.md CHANGED Viewed

@@ -1,8 +1,12 @@
-# Transcript Notetaker
 ## Description
-A python script designed to create relevant notes from a transcript of a meeting.
 You will need an OpenAI API key to use this project.

+# Google Meet Transcript AI Notes
 ## Description
+A Python script designed to create relevant notes from a transcript of a Google Meet meeting. Currently, with the proper
+options configured, Google Meet will automatically create an AI transcript of your meetings, which is saved to Google Drive.
+Often it is more useful to see just the notes from a meeting rather than the full transcript. This script uses OpenAI
+prompts to create a detailed summary of the meeting from the transcript, as if it were taking notes in real time during
+the meeting.
 You will need an OpenAI API key to use this project.

main.py CHANGED Viewed

@@ -2,10 +2,10 @@
 import configparser
 import os
 import time
 # 3rd party
 from langchain.llms import OpenAI
 from langchain import LLMChain
-from langchain.document_loaders import UnstructuredFileLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain import PromptTemplate
@@ -13,19 +13,18 @@ from langchain import PromptTemplate
 config = configparser.ConfigParser()
 config.read('config.ini')
-def summarize_chunks(chunks):
-    number_of_chunks = len(chunks)
-    print(f"Summarizing: {number_of_chunks} chunks")
-    chunk_summaries = []
-    start_time = time.time()
-    for i, chunk in enumerate(chunks, 1):
-        chunk_summaries.append(chain.run(chunk))
-        # info
-        elapsed_time = time.time() - start_time
-        minutes = elapsed_time // 60
-        print(f"Completed Summary {i}/{number_of_chunks}, {minutes:.0f} minutes {elapsed_time - 60 * minutes:.2f} seconds elapsed")
-    return chunk_summaries
 if __name__ == '__main__':
@@ -37,13 +36,16 @@ if __name__ == '__main__':
     llm = OpenAI(temperature=0)
-    loader = UnstructuredFileLoader(transcript_filepath)
-    transcript = loader.load()
-    # Split the text into smaller chunks that can be processed by the AI
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
-    chunks = text_splitter.split_documents(transcript)
     prompt = PromptTemplate(
         template="Write a concise summary of the following: {transcript}",
         input_variables=['transcript']
@@ -55,9 +57,39 @@ if __name__ == '__main__':
         verbose=False
     )
-    summaries = summarize_chunks(chunks)
-    meeting_notes = ''.join([summary for summary in summaries])
-    with open(notes_filepath, 'w') as f:
-        f.write(meeting_notes)

 import configparser
 import os
 import time
+import re
 # 3rd party
 from langchain.llms import OpenAI
 from langchain import LLMChain
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain import PromptTemplate
 config = configparser.ConfigParser()
 config.read('config.ini')
+def load_transcript(path: str):
+    # Google Meet Transcripts have a header which we don't want to be summarized
+    header_lines = 5
+    with open(path, 'r') as input_file:
+        file_text = input_file.readlines()
+    head = file_text[:header_lines]
+    transcript = "".join(file_text[header_lines:])
+    return head, transcript
 if __name__ == '__main__':
     llm = OpenAI(temperature=0)
+    head, transcript = load_transcript(transcript_filepath)
+    # split the transcript on the 5-min timestamps
+    regex_pattern = r"[0-9]{2}:[0-9]{2}:[0-9]{2}"
+    five_min_chunks = re.split(regex_pattern, transcript)
+    # create a textsplitter to subdivide those chunks into appropriately sized chunks.
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
+    # prompt
     prompt = PromptTemplate(
         template="Write a concise summary of the following: {transcript}",
         input_variables=['transcript']
         verbose=False
     )
+    # list the meeting time and the chunks associated with it
+    timestamped_summaries = []
+    print(f"Summarizing {len(five_min_chunks)*5} minute meeting")
+    start_time = time.time()
+    # summarize the transcript one five-minute chunk at a time
+    for i, five_minutes_chunk in enumerate(five_min_chunks):
+        timestamp = time.strftime('%H:%M:%S', time.gmtime(60 * 5 * i))
+        sub_chunks = text_splitter.split_text(five_minutes_chunk)
+        summaries = []
+        for j, chunk in enumerate(sub_chunks):
+            summaries.append(chain.run(chunk))
+            print(f"{timestamp}: Chunk {j}/{len(sub_chunks)}")
+        timestamped_summaries.append((timestamp, summaries))
+        elapsed_time = time.time() - start_time
+        minutes = elapsed_time // 60
+        print(f"Summarized first {5 * (i+1)} minutes of meeting, {minutes:.0f} minutes {elapsed_time - 60 * minutes:.2f} seconds elapsed")
+    first_line = re.split(r"[()]", head[0])
+    # Write summaries to file
+    with open(notes_filepath, 'w+') as f:
+        f.write(f"# {first_line[0]}\n")
+        f.write(f"{first_line[1]}\n")
+        f.write("## Attendees\n")
+        f.write(f"{head[2]}\n")
+        f.write('## Meeting Notes\n')
+        for timestamp, summaries in timestamped_summaries:
+            f.write(f"### {timestamp}\n")
+            for summary in summaries:
+                f.write(f"- {summary.strip()}\n")
+    print(f"Export to file {notes_filepath} completed")

setup.py CHANGED Viewed

@@ -19,8 +19,8 @@ config['REQUIRED'] = {
 # Optional
 config['OPTIONAL'] = {
     'transcript-filepath': 'transcript.txt',
-    'notes-filepath': 'notes.txt'
 }
 with open('config.ini', 'w') as configfile:
-    config.write(configfile)

 # Optional
 config['OPTIONAL'] = {
     'transcript-filepath': 'transcript.txt',
+    'notes-filepath': 'notes.md'
 }
 with open('config.ini', 'w') as configfile:
+    config.write(configfile)