Mongar28 committed
Commit bd942b6
1 Parent(s): ae1388e

Integrate open-source Whisper

.gitignore ADDED
@@ -0,0 +1,4 @@
+ .env
+ .venv
+ openai_models/__pycache__
+ streamlit_tools/__pycache__
app.py CHANGED
@@ -1,4 +1,20 @@
  import streamlit as st
-
- archivo_audio = st.file_uploader('Arrastra o ingresa tu archivo .mp3, .ma4, .ogg, .aac', type=[
-     '.mp3', '.m4a', '.ogg', '.aac'])
+ from streamlit_tools.tools import load_audio_file
+ from openai_models.whisper import whisper_os
+ import time
+
+
+ def main():
+     audio_full_path: str = load_audio_file()
+     if audio_full_path:
+         transcription = whisper_os(audio_full_path)
+
+         def transcription_generator():
+             for word in transcription.split(' '):
+                 time.sleep(0.2)
+                 yield word + ' '
+         st.write_stream(transcription_generator(), )
+
+
+ if __name__ == "__main__":
+     main()
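Note on `main()`: `whisper_os` returns the raw output of the transformers ASR pipeline, which is a dict (a `"text"` field, plus `"chunks"` when `return_timestamps=True`), not a plain string, so `transcription.split(' ')` as written would fail on a dict. A minimal sketch of the intended word-by-word streaming, assuming the transcript is read from the `"text"` key:

```python
# Sketch only: assumes whisper_os returns the pipeline's dict output.
result = whisper_os(audio_full_path)
transcription = result["text"]  # plain transcript string

def transcription_generator():
    # Yield one word at a time so st.write_stream renders it progressively.
    for word in transcription.split(' '):
        time.sleep(0.2)
        yield word + ' '

st.write_stream(transcription_generator())
```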
openai_models/__init__.py ADDED
File without changes
openai_models/whisper.py ADDED
@@ -0,0 +1,37 @@
+ from transformers import pipeline, AutoModelForCausalLM, AutoModelForSpeechSeq2Seq, AutoProcessor
+ import torch
+
+
+ def whisper_os(audio_full_path: str):
+     device = "cuda:0" if torch.cuda.is_available() else "cpu"
+     torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+     assistant_model_id = "distil-whisper/distil-large-v3"
+
+     assistant_model = AutoModelForCausalLM.from_pretrained(
+         assistant_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+     )
+     assistant_model.to(device)
+
+     model_id = "openai/whisper-large-v3"
+
+     model = AutoModelForSpeechSeq2Seq.from_pretrained(
+         model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True,
+         use_safetensors=True, attn_implementation="sdpa")
+
+     processor = AutoProcessor.from_pretrained(model_id)
+
+     pipe = pipeline(
+         "automatic-speech-recognition",
+         model=model,
+         tokenizer=processor.tokenizer,
+         feature_extractor=processor.feature_extractor,
+         max_new_tokens=128,
+         generate_kwargs={"assistant_model": assistant_model},
+         torch_dtype=torch_dtype,
+         device=device,
+     )
+
+     result = pipe(audio_full_path, return_timestamps=True,
+                   generate_kwargs={"language": "spanish"})
+     return result
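For reference, a minimal sketch of calling `whisper_os` on its own (the file path is illustrative); with `return_timestamps=True` the pipeline result carries the full transcript under `"text"` and per-segment timestamps under `"chunks"`:

```python
# Illustrative usage; assumes an audio file already exists at this path.
result = whisper_os("documents/audios/ejemplo.mp3")

print(result["text"])  # full transcript as one string
for chunk in result.get("chunks", []):
    start, end = chunk["timestamp"]  # (start_seconds, end_seconds) for the segment
    print(f"[{start} - {end}] {chunk['text']}")
```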
requirements.txt ADDED
@@ -0,0 +1,81 @@
+ accelerate==0.30.1
+ altair==5.3.0
+ annotated-types==0.7.0
+ anyio==4.4.0
+ attrs==23.2.0
+ blinker==1.8.2
+ cachetools==5.3.3
+ certifi==2024.2.2
+ charset-normalizer==3.3.2
+ click==8.1.7
+ distro==1.9.0
+ filelock==3.14.0
+ fsspec==2024.5.0
+ gitdb==4.0.11
+ GitPython==3.1.43
+ h11==0.14.0
+ httpcore==1.0.5
+ httpx==0.27.0
+ huggingface-hub==0.23.1
+ idna==3.7
+ Jinja2==3.1.4
+ jsonschema==4.22.0
+ jsonschema-specifications==2023.12.1
+ lxml==5.2.2
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ mdurl==0.1.2
+ mpmath==1.3.0
+ networkx==3.3
+ numpy==1.26.4
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==8.9.2.26
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.20.5
+ nvidia-nvjitlink-cu12==12.5.40
+ nvidia-nvtx-cu12==12.1.105
+ openai==1.30.3
+ packaging==24.0
+ pandas==2.2.2
+ pillow==10.3.0
+ protobuf==4.25.3
+ psutil==5.9.8
+ pyarrow==16.1.0
+ pydantic==2.7.1
+ pydantic_core==2.18.2
+ pydeck==0.9.1
+ Pygments==2.18.0
+ python-dateutil==2.9.0.post0
+ python-docx==1.1.2
+ python-dotenv==1.0.1
+ pytz==2024.1
+ PyYAML==6.0.1
+ referencing==0.35.1
+ regex==2024.5.15
+ requests==2.32.2
+ rich==13.7.1
+ rpds-py==0.18.1
+ safetensors==0.4.3
+ six==1.16.0
+ smmap==5.0.1
+ sniffio==1.3.1
+ streamlit==1.35.0
+ sympy==1.12
+ tenacity==8.3.0
+ tokenizers==0.19.1
+ toml==0.10.2
+ toolz==0.12.1
+ torch==2.3.0
+ tornado==6.4
+ tqdm==4.66.4
+ transformers==4.41.1
+ typing_extensions==4.12.0
+ tzdata==2024.1
+ urllib3==2.2.1
+ watchdog==4.0.1
streamlit_tools/__init__.py ADDED
File without changes
streamlit_tools/tools.py ADDED
@@ -0,0 +1,41 @@
+ import streamlit as st
+ import os
+
+
+ def load_audio_file() -> str:
+     """
+     Uploads an audio file provided by the user and saves it in the
+     specified directory.
+
+     Parameters:
+         None
+
+     Returns:
+         str
+     """
+     st.markdown('# **Pastora**')
+     st.markdown('### *Transcripción de audio a texto*')
+     audio_file = st.file_uploader("Drag your audio file", type=[
+         '.mp3', '.m4a', '.ogg', '.aac'])
+
+     path_audio: str = os.path.join("documents", "audios")
+
+     if audio_file is not None:
+         if "audio_file_name" not in st.session_state.keys():
+             st.session_state.audio_file_name = audio_file.name
+
+         # Ensure the directory exists
+         os.makedirs(path_audio, exist_ok=True)
+
+         # Construct the full path to the new file
+         audio_full_path = os.path.join(
+             path_audio, st.session_state.audio_file_name)
+
+         # if "audio_full_path" not in st.session_state.key():
+         #     st.session_state.audio_full_path = os.path.join(
+         #         path_audio, st.session_state.audio_file_name)
+
+         with open(audio_full_path, 'wb') as new_file:
+             new_file.write(audio_file.read())
+
+         return audio_full_path
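A small usage sketch for `load_audio_file` (names are illustrative): the function only returns a path after the user uploads a file and falls through to an implicit `None` otherwise, so callers should guard on the return value as `app.py` does:

```python
# Illustrative only: mirrors how app.py is expected to consume load_audio_file.
import streamlit as st
from streamlit_tools.tools import load_audio_file

audio_full_path = load_audio_file()  # None until a file has been uploaded
if audio_full_path:
    st.audio(audio_full_path)  # play back the saved copy as a quick check
```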