# version - ArcticMonkeys:30.07.24
"""Streamlit entry point for the voice assistant.

Flow visible in this file: record audio with ``mic_recorder``, turn it into
text with ``A2T``, route the text either to the image pipeline
(``ImageCaption`` + ``VLM``) or to the chat model (``Conversation``), then
play the answer with ``T2A`` and render it on the page.
"""

# python core libraries
import re
import psutil
import time
import random
from typing import Optional

# streamlit
import streamlit as st

# components from other authors
from streamlit_mic_recorder import mic_recorder

# core modules
from audio_processing.A2T import A2T
from audio_processing.T2A import T2A
from llm.utils.chat import Conversation
from vlm.vlm import VLM

# utils modules
from utils.keywords import keywords
from utils.prompt_toggle import select_prompt, load_prompts
from utils.image_caption import ImageCaption

# Module-level singletons shared by every call below.
prompts = load_prompts()
chat = Conversation()
t2a = T2A()
vlm = VLM()
ic = ImageCaption()

# Voice phrase that switches the request to the image pipeline.
_IMAGE_COMMAND = "show me your image"

# Speaker labels at the start of a line; compiled once instead of per call.
_LABEL_PATTERN = re.compile(r'^(Human:|AI:|Chelsea:)\s*', flags=re.MULTILINE)


def remove_labels_with_regex(text: Optional[str]) -> str:
    """Strip leading ``Human:``, ``AI:`` or ``Chelsea:`` labels from each line.

    Args:
        text: Model output to clean. ``None`` or an empty string is accepted
            because ``switching`` returns ``None`` when it has no answer.

    Returns:
        The text with the labels (and the whitespace right after them)
        removed, or ``""`` when there was nothing to clean.
    """
    if not text:
        return ""
    return _LABEL_PATTERN.sub('', text)


def exctrator(sentence: str, phrase: str = _IMAGE_COMMAND) -> str:
    """Return the stripped text that follows the first occurrence of ``phrase``.

    Args:
        sentence: Text to search.
        phrase: Marker to look for; matching is case-sensitive.

    Returns:
        The text between the first and the second occurrence of ``phrase``
        (or up to the end when it occurs once), stripped; ``""`` when
        ``phrase`` is absent.
    """
    if phrase not in sentence:
        return ""
    return sentence.split(phrase)[1].strip()


def switching(text: Optional[str]):
    """Route recognised text to the image pipeline or to the chat model.

    Args:
        text: Recognised user speech; may be ``None`` or blank when nothing
            was recognised.

    Returns:
        Whatever ``ic.send2ai`` or ``chat.chatting`` returns, or ``None`` when
        there is no input text or no image has been uploaded yet.
    """
    # Without text there is nothing to route; previously a None input
    # crashed on ``text.lower()``.
    if text is None or not text.strip():
        return None

    lowered = text.lower()
    result = None

    if _IMAGE_COMMAND in lowered:
        # Everything said after the command phrase becomes the image prompt.
        prompt = exctrator(lowered)

        # Load the image
        uploaded_image = ic.load_image()

        if uploaded_image is not None:
            # The image is loaded, so run the processing
            result = ic.send2ai(model=vlm, prompt=prompt)
        else:
            # The image is not loaded yet, so show a warning
            st.warning("No image uploaded yet. Please upload an image to continue.")
    else:
        prompt = select_prompt(input_text=text, prompts=prompts, keywords=keywords)
        # Fall back to the raw text when no prepared prompt was selected.
        result = chat.chatting(prompt=prompt if prompt is not None else text)

    print(f"Prompt:\n{prompt}")
    return result


def main():
    """Record audio, get text from A2T, answer it, play and show the answer.

    Any exception is caught and printed here because this is the top-level
    boundary of the app.
    """
    try:
        mic = mic_recorder(
            start_prompt="Record",
            stop_prompt="Stop",
            just_once=True,
            use_container_width=True,
        )

        # Nothing has been recorded on this run.
        if mic is None:
            return

        start_time = time.perf_counter()
        a2t = A2T(mic["bytes"])
        text = a2t.predict()
        print(f"Text from A2T:\n{text}")
        execution_time = time.perf_counter() - start_time
        print(f"App.py -> main() -> time of execution A2T -> {execution_time}s")

        output = switching(text)
        response = remove_labels_with_regex(text=output)

        # No answer (no recognised text, or no image uploaded): skip the
        # playback and the rendering instead of failing inside them.
        if not response:
            print("App.py -> main() -> no response to play or display")
            return

        start_time_t2a = time.perf_counter()
        t2a.autoplay(response)
        execution_time_t2a = time.perf_counter() - start_time_t2a
        print(f"App.py -> main() -> time of execution T2A -> {execution_time_t2a}s")

        print(ic.pil_image)

        st.markdown(f"Your input: {text}")
        st.markdown(f"Chelsea response: {response}")
    except Exception as e:
        print(f"An error occurred in main finction, reasone is: {e}")


if __name__ == "__main__":
    main()

footer = """ """

# st.markdown(footer,unsafe_allow_html=True)