# built in
from io import StringIO
import re
import time
# 3rd party - located in requirements.txt
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai

HEADER_SIZE = 5 # number of lines in the transcript header
CHUNK_SIZE = 2000 # approximate length in characters for each chunk being summarized
TEMPERATURE = 0


def load_transcript(input_file):
    """Load the text from the transcript uploaded using the file uploader widget"""
    # transform file from bytes to string
    input_string = StringIO(input_file.getvalue().decode('UTF-8'))

    # Google Meet Transcripts have a header with info like the meeting title, date, and attendees
    # We'll want to extract this information separately, instead of having it passed to a summarizer

    file_text = input_string.readlines()

    header = file_text[:HEADER_SIZE]
    transcript = "".join(file_text[HEADER_SIZE:])

    return header, transcript


def chunk_transcript(transcript: str):
    """Split the transcript into 5-minute sections, then subdivide each section into chunks small enough to summarize"""
    # Google Meet transcripts show the timestamp every 5 minutes
    # split the transcript on the 5-min timestamps
    timestamp_regex_pattern = r"[0-9]{2}:[0-9]{2}:0{2}"
    five_minute_chunks = re.split(timestamp_regex_pattern, transcript)

    # create a text splitter to subdivide those sections into appropriately sized chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE)

    # for each 5 minute chunk divide further into sub-chunks of appropriate length
    chunks = [text_splitter.split_text(five_minute_chunk) for five_minute_chunk in five_minute_chunks]

    # chunks is a list of lists:
    # the outer list represents the 5-minute sections of the meeting,
    # the inner lists represent the subdivisions of each section that are small enough to be summarized thoroughly
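    # Illustrative shape of the return value for a hypothetical ~15 minute meeting
    # (the text shown is made up; only the nesting mirrors what the code produces):
    # [
    #     ["minutes 0-5, sub-chunk 1", "minutes 0-5, sub-chunk 2"],
    #     ["minutes 5-10, sub-chunk 1"],
    #     ["minutes 10-15, sub-chunk 1", "minutes 10-15, sub-chunk 2"],
    # ]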

    return chunks


def summarize_chunks(five_minute_chunks, user_api_key, debug=False):
    """Create summaries of each chunk of the transcript"""

    system_prompt = '''As a professional summarizer, create a concise and comprehensive summary of the provided conversation, while adhering to these guidelines:
    1. Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness.
    2. Incorporate main ideas and essential information, eliminating extraneous language and focusing on critical aspects.
    3. Rely strictly on the provided text, without including external information.
    4. Format the summary in paragraph form for easy understanding.
    5. Do not start the response with "In this conversation", "During this conversation", "During the conversation" or a similar phrase
    '''

    total_chunks = sum([len(five_minute_chunk) for five_minute_chunk in five_minute_chunks])
    number_of_summarized_chunks = 0

    progress_bar = st.progress(number_of_summarized_chunks, f"Summarized {number_of_summarized_chunks}/{total_chunks} Chunks...")

    five_minute_summaries = []
    for sub_chunks in five_minute_chunks:
        summaries = []
        for chunk in sub_chunks:
            if not debug:
                messages = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": chunk}
                ]

                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=messages,
                    temperature=TEMPERATURE,
                    api_key=user_api_key
                )

                summary = response['choices'][0]['message']['content']
            else:
                summary = "I would be a meeting note :D"

            # update progress bar
            number_of_summarized_chunks += 1
            progress_bar.progress(number_of_summarized_chunks / total_chunks,
                                  f"Summarized {number_of_summarized_chunks}/{total_chunks} Chunks...")

            summaries.append(summary)

        five_minute_summaries.append(summaries)

    return five_minute_summaries


def format_notes(big_summaries, header):
    """Create a string containing the meeting notes in Markdown format"""
    # The header of a Google Meet transcript always has the same structure, so we can extract info from it manually
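    # Assumed header layout, inferred only from the parsing below (illustrative, not an official format):
    #   header[0]: "<meeting title> (<meeting date>)"
    #   header[2]: comma-separated list of attendees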
    first_line = re.split(r"[()]", header[0]) # the first line contains both the title and the date
    meeting_name = first_line[0]
    meeting_date = first_line[1]
    attendees = header[2]

    meeting_notes = f"# {meeting_name}\n{meeting_date}\n## Attendees\n{attendees}\n## Meeting Notes\n"

    for i, summaries in enumerate(big_summaries):
        timestamp = time.strftime('%H:%M:%S', time.gmtime(60 * 5 * i))

        meeting_notes += f"### {timestamp}\n"
        for summary in summaries:
            meeting_notes += f"- {summary.strip()}\n"

    return meeting_notes
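

# Minimal usage sketch: one possible way to wire these functions together in a
# Streamlit page. The widget labels, page flow, and output file name below are
# assumptions for illustration, not a confirmed part of this app's UI.
if __name__ == "__main__":
    st.title("Google Meet Transcript Summarizer")

    user_api_key = st.text_input("OpenAI API key", type="password")
    uploaded_file = st.file_uploader("Upload a Google Meet transcript (.txt)", type="txt")

    if uploaded_file is not None and user_api_key:
        header, transcript = load_transcript(uploaded_file)
        chunks = chunk_transcript(transcript)
        summaries = summarize_chunks(chunks, user_api_key)
        notes = format_notes(summaries, header)

        st.markdown(notes)
        st.download_button("Download notes", notes, file_name="meeting_notes.md")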