Spaces:
Paused
Paused
gamingflexer
committed on
Commit
·
3dfde99
1
Parent(s):
d8ba40f
files added
Browse files- app.py +116 -0
- requirements.txt +7 -0
- summarizer.py +101 -0
app.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import gradio as gr
|
3 |
+
import os
|
4 |
+
import zipfile
|
5 |
+
import pydub
|
6 |
+
import datetime
|
7 |
+
|
8 |
+
import openai
|
9 |
+
import jwt
|
10 |
+
|
11 |
+
from summarizer import count_tokens,main_summarizer_action_items,main_summarizer_meet
|
12 |
+
from decouple import config
|
13 |
+
|
14 |
+
# Runtime switches and credentials are read through python-decouple so they
# can be changed from the environment / .env file without editing the code.
# DEBUG makes the transcription and summarization helpers return canned
# strings instead of calling the OpenAI API; the default stays True so the
# behaviour is unchanged unless DEBUG is explicitly set.
DEBUG = config('DEBUG', default=True, cast=bool)
API_KEY = config('API_KEY')
# Model name passed to openai.Audio.transcribe.
model_id = 'whisper-1'
# NOTE(review): the fallback keeps the previously hard-coded value for
# backward compatibility; set SECRET_KEY in the environment for real use.
SECRET_KEY = config('SECRET_KEY', default="$§%§$secret")

# Summarization parameters: maximum chunk size, tokens per chunk and the
# sampling settings.
# NOTE(review): nothing visible in this file reads these values - confirm
# whether they are still needed.
max_chunk_size = 2000
max_tokens_per_chunk = 500
temperature = 0.7
top_p = 0.5
frequency_penalty = 0.5
# Working directory for generated transcript / summary / zip files.
temp_dir = os.path.join(os.path.dirname(__file__), 'temp')

title = description = article = "Meeting Summariser ⚡️ "

# Console logger with a "timestamp;level;message" line format.
logger = logging.getLogger("Summariser")
logger.setLevel(logging.INFO)
if not logger.handlers:
    # Guard so that re-importing the module does not register a second
    # handler and duplicate every log line.
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter("%(asctime)s;%(levelname)s;%(message)s", "%Y-%m-%d %H:%M:%S")
    ch.setFormatter(formatter)
    logger.addHandler(ch)
|
37 |
+
|
38 |
+
def authentication(username, password):
    """Check a username/password pair against the built-in credentials.

    Args:
        username: Login name supplied by the user.
        password: Password supplied by the user.

    Returns:
        True when both values match the expected pair, False otherwise.
        (The result is always a real bool; a failed check no longer falls
        through and returns None.)
    """
    # NOTE(review): credentials are hard-coded as admin/admin - move them to
    # configuration before exposing this publicly.
    return bool(username == "admin" and password == "admin")
|
41 |
+
|
42 |
+
|
43 |
+
def transcribe_audio(audio_file_path, temp_folder_path):
    """Transcribe an audio file with the OpenAI audio transcription API.

    Files up to ``max_size_bytes`` are uploaded in one request. Larger files
    are split with pydub into fixed-duration chunks, each chunk is exported
    to ``temp_folder_path`` as MP3, transcribed, and removed again.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        temp_folder_path: Directory used for the temporary chunk files.

    Returns:
        The transcription text; for chunked files the per-chunk texts joined
        with single spaces. When DEBUG is set a fixed test string is returned
        and no API call is made.
    """
    if DEBUG:
        return "This is a test transcription"

    max_size_bytes = 20 * 1024 * 1024  # 20 MB
    # pydub.utils.make_chunks() takes the chunk length in milliseconds, not
    # bytes, so the split is expressed as a duration.
    # NOTE(review): 10 minutes is assumed to export well below the size
    # limit at pydub's default MP3 bitrate - confirm for your encoder setup.
    chunk_length_ms = 10 * 60 * 1000

    def _transcribe_file(path):
        # The context manager closes the handle even if the API call raises.
        with open(path, 'rb') as media_file:
            response = openai.Audio.transcribe(
                api_key=API_KEY,
                model=model_id,
                file=media_file,
            )
        return response['text']

    if os.path.getsize(audio_file_path) <= max_size_bytes:
        return _transcribe_file(audio_file_path)

    os.makedirs(temp_folder_path, exist_ok=True)
    # NOTE(review): the input format is forced to MP3; other upload types
    # (e.g. WAV) may fail to decode here - confirm what the UI can send.
    sound = pydub.AudioSegment.from_file(audio_file_path, format="mp3")
    chunks = pydub.utils.make_chunks(sound, chunk_length_ms)
    transcriptions = []
    for i, chunk in enumerate(chunks):
        print("chunk ", i)
        chunk_path = os.path.join(temp_folder_path, f"audio_chunk_{i}.mp3")
        chunk.export(chunk_path, format="mp3")
        try:
            transcriptions.append(_transcribe_file(chunk_path))
        finally:
            # Do not let chunk files accumulate in the temp folder.
            if os.path.exists(chunk_path):
                os.remove(chunk_path)

    return ' '.join(transcriptions)
|
69 |
+
|
70 |
+
def download_files(transcription: str, summary: str) -> str:
    """Write the transcription and summary to disk and bundle them in a zip.

    Two timestamped text files are created in ``temp_dir`` and added to a
    zip archive under the fixed member names ``transcription.txt`` and
    ``summary.txt``.

    Args:
        transcription: Full transcript text.
        summary: Summary text (the caller may append action items to it).

    Returns:
        Path of the created zip archive.
    """
    # The temp directory is not created anywhere else in this file.
    os.makedirs(temp_dir, exist_ok=True)
    time_now = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Create transcription file (explicit encoding: transcripts routinely
    # contain non-ASCII text and the platform default may not be UTF-8).
    transcript_file_path = os.path.join(temp_dir, f'transcription_{time_now}.txt')
    with open(transcript_file_path, 'w', encoding='utf-8') as f:
        f.write(transcription)

    # Create summary file
    summary_file_path = os.path.join(temp_dir, f'summary_{time_now}.txt')
    with open(summary_file_path, 'w', encoding='utf-8') as f:
        f.write(summary)

    # Create zip file; the timestamp keeps one request from overwriting the
    # archive another request is still serving.
    zip_file_path = os.path.join(temp_dir, f'download_{time_now}.zip')
    with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        zip_file.write(transcript_file_path, 'transcription.txt')
        zip_file.write(summary_file_path, 'summary.txt')
    return zip_file_path
|
88 |
+
|
89 |
+
def clean_trancript(text):
    """Placeholder cleaning step: hand the transcript back unchanged."""
    cleaned = text
    return cleaned
|
91 |
+
|
92 |
+
def main_meet_summarizer(audio_file):
    """Run the full pipeline for one uploaded audio file.

    Transcribes the audio, summarizes the transcript, extracts action items
    and packages transcript plus summary/action items as a zip download.

    Args:
        audio_file: Path of the uploaded audio file.

    Returns:
        A tuple ``(summary, transcript, zip_path)`` in the order of the
        interface's output components.
    """
    print("Starting Transcription")
    transcript = transcribe_audio(audio_file, temp_dir)
    print(f"Starting Summarization | {count_tokens(transcript)}")
    cleaned_transcript = clean_trancript(transcript)
    summary = main_summarizer_meet(cleaned_transcript, debug=DEBUG)
    action_items = main_summarizer_action_items(cleaned_transcript, debug=DEBUG)
    print("Finished Summarization")

    # Separate the two sections in the downloadable file; previously the
    # action items were glued directly onto the last word of the summary.
    summary_with_actions = f"{summary}\n\n{action_items}"
    zip_path = download_files(transcription=transcript, summary=summary_with_actions)
    return summary, transcript, zip_path
|
106 |
+
|
107 |
+
|
108 |
+
# Input: a single uploaded audio file, handed to the callback as a file path.
audio_input = gr.inputs.Audio(source='upload', type='filepath', label='Audio File')

# Outputs, in the order main_meet_summarizer returns them.
summary_output = gr.outputs.Textbox(label='Summary')
transcription_output = gr.outputs.Textbox(label='Transcription')
download_output = gr.outputs.File(label="Download files here")

summarizer_interface = gr.Interface(
    fn=main_meet_summarizer,
    inputs=[audio_input],
    outputs=[summary_output, transcription_output, download_output],
    title='Summarizer',
    description='Transcribe speech in an audio file & summarize it.',
)

# Start the web UI as soon as the module is executed.
summarizer_interface.launch(debug=True)
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio==3.28.3
|
2 |
+
openai
|
3 |
+
openai-async
|
4 |
+
asyncio
|
5 |
+
nest-asyncio
|
6 |
+
transformers==4.26.1
|
7 |
+
torch
|
summarizer.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import openai_async
|
2 |
+
import asyncio
|
3 |
+
import nest_asyncio
|
4 |
+
|
5 |
+
import torch
|
6 |
+
from transformers import AutoTokenizer
|
7 |
+
|
8 |
+
# GPT-2 tokenizer, loaded once at import time and shared by the token-counting,
# chunking and summarization helpers in this file.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
9 |
+
|
10 |
+
def count_tokens(text):
    """Return how many tokens the module-level tokenizer produces for ``text``.

    Args:
        text: String to tokenize.

    Returns:
        The number of token ids in the encoding (0 for an empty encoding).
    """
    # The length of the encoded id list is the token count; wrapping it in a
    # tensor first only to read back its shape was unnecessary work.
    return len(tokenizer.encode(text))
|
13 |
+
|
14 |
+
def break_up_file_to_chunks(text, chunk_size=2000, overlap=100):
    """Split ``text`` into overlapping chunks of token ids.

    Args:
        text: Text to tokenize and split.
        chunk_size: Maximum number of tokens per chunk.
        overlap: Number of tokens shared between consecutive chunks.

    Returns:
        A list of token-id lists. Each chunk starts ``chunk_size - overlap``
        tokens after the previous one; the list is empty when the text
        encodes to no tokens.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``, since
            the window could then never advance.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    tokens = tokenizer.encode(text)
    num_tokens = len(tokens)
    chunks = []
    for start in range(0, num_tokens, step):
        chunks.append(tokens[start:start + chunk_size])
        # Once a chunk reaches the end of the text, any further chunk would
        # contain only tokens already covered by this one; stop here.
        if start + chunk_size >= num_tokens:
            break

    return chunks
|
23 |
+
|
24 |
+
async def summarize_meeting(prompt, timeout, max_tokens):
    """Send one completion request to the OpenAI API and return the response.

    Args:
        prompt: Full prompt text to complete.
        timeout: Request timeout passed to ``openai_async.complete``.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The response object from ``openai_async.complete``; callers read the
        generated text from ``response.json()["choices"][0]["text"]``.
    """
    # API_KEY was referenced here but never defined in this module, which
    # raised NameError on the first real call. Read it the same way app.py
    # does; imported locally so the module's import block stays untouched.
    from decouple import config

    temperature = 0.5
    top_p = 1
    frequency_penalty = 0
    presence_penalty = 0

    # Call the OpenAI completions API.
    # "gpt-3.5-turbo" is a chat-only model and is rejected by the completions
    # endpoint; the "-instruct" variant is its completions counterpart and
    # keeps the choices[0]["text"] response shape the callers parse.
    response = await openai_async.complete(
        api_key=config('API_KEY'),
        timeout=timeout,
        payload={
            "model": "gpt-3.5-turbo-instruct",
            "prompt": prompt,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "top_p": top_p,
            "frequency_penalty": frequency_penalty,
            "presence_penalty": presence_penalty,
        },
    )

    # Return the raw response; callers extract the generated text.
    return response
|
50 |
+
|
51 |
+
def main_summarizer_meet(text, debug=False):
    """Summarize a meeting transcript in two passes.

    The transcript is split into token chunks, each chunk is summarized on
    its own, and the partial summaries are then consolidated by one final
    request.

    Args:
        text: Transcript text to summarize.
        debug: When true, skip all API calls and return a fixed test string.

    Returns:
        The consolidated summary text, or an empty string when the
        transcript produces no chunks.
    """
    if debug:
        return "This is a test summary function"

    chunks = break_up_file_to_chunks(text)
    if not chunks:
        # Nothing to summarize; avoid a pointless consolidation request.
        return ""

    partial_summaries = []
    for chunk in chunks:
        prompt_request = (
            f"Summarize this meeting transcript: {tokenizer.decode(chunk)}"
        )
        # asyncio.run creates a fresh event loop and closes it afterwards,
        # so loops no longer pile up unclosed, one per request.
        response = asyncio.run(
            summarize_meeting(prompt=prompt_request, timeout=30, max_tokens=1000)
        )
        partial_summaries.append(response.json()["choices"][0]["text"].strip())

    # Send the summaries as plain text rather than a Python list repr
    # (brackets, quotes and escaped newlines) inside the prompt.
    joined_summaries = "\n".join(partial_summaries)
    prompt_request = f"Consolidate these meeting summaries: {joined_summaries}"

    response = asyncio.run(
        summarize_meeting(prompt=prompt_request, timeout=45, max_tokens=1000)
    )
    return response.json()["choices"][0]["text"].strip()
|
78 |
+
|
79 |
+
# -----------------------------
|
80 |
+
|
81 |
+
def main_summarizer_action_items(text, debug=False):
    """Extract action items from a meeting transcript.

    The transcript is split into token chunks and each chunk is sent to the
    model with a request for action items and due dates.

    Args:
        text: Transcript text to analyse.
        debug: When true, skip all API calls and return a fixed test string.

    Returns:
        The per-chunk answers joined with newlines (an empty string when the
        transcript produces no chunks).
    """
    if debug:
        return "This is a test action items function"

    action_response = []
    for chunk in break_up_file_to_chunks(text):
        prompt_request = f"Provide a list of action items with a due date from the provided meeting transcript text: {tokenizer.decode(chunk)}"
        # asyncio.run creates a fresh event loop and closes it afterwards,
        # so loops no longer pile up unclosed, one per request.
        response = asyncio.run(
            summarize_meeting(prompt=prompt_request, timeout=30, max_tokens=1000)
        )
        action_response.append(response.json()["choices"][0]["text"].strip())

    return '\n'.join(action_response)
|