Mongar28 committed
Commit bd942b6
1 Parent(s): ae1388e

Integrate open-source Whisper

.gitignore ADDED
@@ -0,0 +1,4 @@
+ .env
+ .venv
+ openai_models/__pycache__
+ streamlit_tools/__pycache__
app.py CHANGED
@@ -1,4 +1,20 @@
  import streamlit as st
-
- archivo_audio = st.file_uploader('Arrastra o ingresa tu archivo .mp3, .ma4, .ogg, .aac', type=[
-     '.mp3', '.m4a', '.ogg', '.aac'])
+ from streamlit_tools.tools import load_audio_file
+ from openai_models.whisper import whisper_os
+ import time
+
+
+ def main():
+     audio_full_path: str = load_audio_file()
+     if audio_full_path:
+         transcription = whisper_os(audio_full_path)
+
+         def transcription_generator():
+             for word in transcription.split(' '):
+                 time.sleep(0.2)
+                 yield word + ' '
+         st.write_stream(transcription_generator(), )
+
+
+ if __name__ == "__main__":
+     main()
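Note on `main()`: `whisper_os` returns the raw output of the transformers ASR pipeline, which is a dict (a `"text"` field, plus `"chunks"` when `return_timestamps=True`), not a plain string, so `transcription.split(' ')` as written would fail on a dict. A minimal sketch of the intended word-by-word streaming, assuming the transcript is read from the `"text"` key:

```python
# Sketch only: assumes whisper_os returns the pipeline's dict output.
result = whisper_os(audio_full_path)
transcription = result["text"]  # plain transcript string

def transcription_generator():
    # Yield one word at a time so st.write_stream renders it progressively.
    for word in transcription.split(' '):
        time.sleep(0.2)
        yield word + ' '

st.write_stream(transcription_generator())
```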
openai_models/__init__.py ADDED
File without changes
openai_models/whisper.py ADDED
@@ -0,0 +1,37 @@
+ from transformers import pipeline, AutoModelForCausalLM, AutoModelForSpeechSeq2Seq, AutoProcessor
+ import torch
+
+
+ def whisper_os(audio_full_path: str):
+     device = "cuda:0" if torch.cuda.is_available() else "cpu"
+     torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+     assistant_model_id = "distil-whisper/distil-large-v3"
+
+     assistant_model = AutoModelForCausalLM.from_pretrained(
+         assistant_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+     )
+     assistant_model.to(device)
+
+     model_id = "openai/whisper-large-v3"
+
+     model = AutoModelForSpeechSeq2Seq.from_pretrained(
+         model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True,
+         use_safetensors=True, attn_implementation="sdpa")
+
+     processor = AutoProcessor.from_pretrained(model_id)
+
+     pipe = pipeline(
+         "automatic-speech-recognition",
+         model=model,
+         tokenizer=processor.tokenizer,
+         feature_extractor=processor.feature_extractor,
+         max_new_tokens=128,
+         generate_kwargs={"assistant_model": assistant_model},
+         torch_dtype=torch_dtype,
+         device=device,
+     )
+
+     result = pipe(audio_full_path, return_timestamps=True,
+                   generate_kwargs={"language": "spanish"})
+     return result
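For reference, a minimal sketch of calling `whisper_os` on its own (the file path is illustrative); with `return_timestamps=True` the pipeline result carries the full transcript under `"text"` and per-segment timestamps under `"chunks"`:

```python
# Illustrative usage; assumes an audio file already exists at this path.
result = whisper_os("documents/audios/ejemplo.mp3")

print(result["text"])  # full transcript as one string
for chunk in result.get("chunks", []):
    start, end = chunk["timestamp"]  # (start_seconds, end_seconds) for the segment
    print(f"[{start} - {end}] {chunk['text']}")
```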
requirements.txt ADDED
@@ -0,0 +1,81 @@
+ accelerate==0.30.1
+ altair==5.3.0
+ annotated-types==0.7.0
+ anyio==4.4.0
+ attrs==23.2.0
+ blinker==1.8.2
+ cachetools==5.3.3
+ certifi==2024.2.2
+ charset-normalizer==3.3.2
+ click==8.1.7
+ distro==1.9.0
+ filelock==3.14.0
+ fsspec==2024.5.0
+ gitdb==4.0.11
+ GitPython==3.1.43
+ h11==0.14.0
+ httpcore==1.0.5
+ httpx==0.27.0
+ huggingface-hub==0.23.1
+ idna==3.7
+ Jinja2==3.1.4
+ jsonschema==4.22.0
+ jsonschema-specifications==2023.12.1
+ lxml==5.2.2
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ mdurl==0.1.2
+ mpmath==1.3.0
+ networkx==3.3
+ numpy==1.26.4
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==8.9.2.26
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.20.5
+ nvidia-nvjitlink-cu12==12.5.40
+ nvidia-nvtx-cu12==12.1.105
+ openai==1.30.3
+ packaging==24.0
+ pandas==2.2.2
+ pillow==10.3.0
+ protobuf==4.25.3
+ psutil==5.9.8
+ pyarrow==16.1.0
+ pydantic==2.7.1
+ pydantic_core==2.18.2
+ pydeck==0.9.1
+ Pygments==2.18.0
+ python-dateutil==2.9.0.post0
+ python-docx==1.1.2
+ python-dotenv==1.0.1
+ pytz==2024.1
+ PyYAML==6.0.1
+ referencing==0.35.1
+ regex==2024.5.15
+ requests==2.32.2
+ rich==13.7.1
+ rpds-py==0.18.1
+ safetensors==0.4.3
+ six==1.16.0
+ smmap==5.0.1
+ sniffio==1.3.1
+ streamlit==1.35.0
+ sympy==1.12
+ tenacity==8.3.0
+ tokenizers==0.19.1
+ toml==0.10.2
+ toolz==0.12.1
+ torch==2.3.0
+ tornado==6.4
+ tqdm==4.66.4
+ transformers==4.41.1
+ typing_extensions==4.12.0
+ tzdata==2024.1
+ urllib3==2.2.1
+ watchdog==4.0.1
streamlit_tools/__init__.py ADDED
File without changes
streamlit_tools/tools.py ADDED
@@ -0,0 +1,41 @@
+ import streamlit as st
+ import os
+
+
+ def load_audio_file() -> str:
+     """
+     Uploads an audio file provided by the user and saves it in the
+     specified directory.
+
+     Parameters:
+         None
+
+     Returns:
+         str
+     """
+     st.markdown('# **Pastora**')
+     st.markdown('### *Transcripción de audio a texto*')
+     audio_file = st.file_uploader("Drag your audio file", type=[
+         '.mp3', '.m4a', '.ogg', '.aac'])
+
+     path_audio: str = os.path.join("documents", "audios")
+
+     if audio_file is not None:
+         if "audio_file_name" not in st.session_state.keys():
+             st.session_state.audio_file_name = audio_file.name
+
+         # Ensure the directory exists
+         os.makedirs(path_audio, exist_ok=True)
+
+         # Construct the full path to the new file
+         audio_full_path = os.path.join(
+             path_audio, st.session_state.audio_file_name)
+
+         # if "audio_full_path" not in st.session_state.key():
+         #     st.session_state.audio_full_path = os.path.join(
+         #         path_audio, st.session_state.audio_file_name)
+
+         with open(audio_full_path, 'wb') as new_file:
+             new_file.write(audio_file.read())
+
+         return audio_full_path
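A small usage sketch for `load_audio_file` (names are illustrative): the function only returns a path after the user uploads a file and falls through to an implicit `None` otherwise, so callers should guard on the return value as `app.py` does:

```python
# Illustrative only: mirrors how app.py is expected to consume load_audio_file.
import streamlit as st
from streamlit_tools.tools import load_audio_file

audio_full_path = load_audio_file()  # None until a file has been uploaded
if audio_full_path:
    st.audio(audio_full_path)  # play back the saved copy as a quick check
```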