Spaces:

CineAI
/

Chelsea

Sleeping

File size: 3,542 Bytes

e294914
cab263c
d022aab
7c29ee2
1581bbf
7378fc8
 
e294914
 
d5436e0
 
 
f8dfb0f
d63135e
e294914
7378fc8
e294914
 
 
7378fc8
e294914
 
 
d022aab
7378fc8
 
66a2c12
d022aab
7c29ee2
 
7378fc8
7c29ee2
 
7378fc8
 
 
 
 
80a60df
7378fc8
 
 
 
 
 
91b59ba
7378fc8
 
 
 
 
 
 
 
 
91b59ba
7378fc8
 
91b59ba
7c29ee2
5a785fa
e294914
 
3e2a726
5a785fa
3e2a726
 
 
cf08317
e294914
5a785fa
 
 
 
 
 
 
3e2a726
 
7378fc8
 
 
e294914
7378fc8
cf08317
7378fc8
3e2a726
e294914
0f596d3
e294914
5a785fa
e294914
cf08317
5a785fa
 
 
cf08317
66a2c12
 
5a785fa
 
4affef3
 
5a785fa

# version - ArcticMonkeys:30.07.24

# python core libraries
import re
import psutil
import time
import random
# streamlit
import streamlit as st
# components from other authors
from streamlit_mic_recorder import mic_recorder
# core modules
from audio_processing.A2T import A2T
from audio_processing.T2A import T2A
from llm.utils.chat import Conversation
from vlm.vlm import VLM
# utils modules
from utils.keywords import keywords
from utils.prompt_toggle import select_prompt, load_prompts
from utils.image_caption import ImageCaption

# Module-level singletons, created once when the script is loaded and shared
# by the functions below.
# Prompt templates handed to select_prompt() in switching().
prompts = load_prompts()
# Chat model wrapper; switching() calls chat.chatting() for non-image requests.
chat = Conversation()
# Text-to-audio engine; speaking() calls t2a.autoplay() on the cleaned reply.
t2a = T2A()
# Vision-language model passed as `model=` to ic.send2ai() in switching().
vlm = VLM()
# Image helper used by switching() (load_image / send2ai) and read in
# speaking() (pil_image).
ic = ImageCaption()
# NOTE(review): not read anywhere in the visible code; main() assigns a
# *local* variable of the same name, so this global stays "".
layer_text = ""

def remove_labels_with_regex(text: str):
    """Strip speaker labels from the beginning of every line of *text*.

    A label is one of ``Human:``, ``AI:`` or ``Chelsea:`` located at the start
    of a line; the label and any whitespace that follows it are removed.
    Labels appearing in the middle of a line are left untouched.

    Args:
        text: The raw model reply.

    Returns:
        The reply with leading speaker labels removed.
    """
    # MULTILINE makes ``^`` anchor at the start of each line, not just the
    # start of the whole string.
    label_prefix = re.compile(r'^(Human:|AI:|Chelsea:)\s*', re.MULTILINE)
    return label_prefix.sub('', text)

def exctrator(sentence, phrase="show me your image"):
    """Return the part of *sentence* that follows the first *phrase*.

    The match is case-sensitive, so callers that want a case-insensitive
    lookup must normalise *sentence* first.

    Args:
        sentence: Text to search.
        phrase: Marker phrase whose trailing text is wanted.

    Returns:
        Everything after the first occurrence of *phrase*, stripped of
        surrounding whitespace, or ``""`` when *phrase* does not occur.
    """
    # partition() splits on the first occurrence only, so a repeated phrase
    # no longer truncates the result the way ``split(phrase)[1]`` did.
    _, found, tail = sentence.partition(phrase)
    return tail.strip() if found else ""

def switching(text):
    """Route a transcribed request to the vision model or to the chat model.

    When *text* contains the phrase "show me your image" (case-insensitive),
    the words following that phrase become the prompt that is sent, together
    with the loaded image, through ``ic.send2ai``. Otherwise a prompt is
    chosen with ``select_prompt`` and passed to ``chat.chatting``; the raw
    text is used when no prompt is selected.

    Args:
        text: Transcribed user request, or ``None`` when nothing was
            recognised.

    Returns:
        Whatever ``ic.send2ai`` / ``chat.chatting`` returns, or ``None`` when
        *text* is ``None`` or no image has been uploaded yet.
    """
    if text is None:
        # The original used this message as the (truthy) "command" value,
        # which pushed None into the image branch and crashed on
        # ``text.lower()``. Report it and bail out instead.
        print("Error because your voice requst is None")
        return None

    lowered = text.lower()
    command = re.search("show me your image", lowered, re.IGNORECASE)
    result = None

    if command:
        prompt = exctrator(lowered)
        # Load the image
        uploaded_image = ic.load_image()

        if uploaded_image is not None:
            # An image is available, so run the processing
            result = ic.send2ai(model=vlm, prompt=prompt)
        else:
            # No image has been uploaded yet, so show a warning
            st.warning("No image uploaded yet. Please upload an image to continue.")
    else:
        prompt = select_prompt(input_text=text, prompts=prompts, keywords=keywords)
        result = chat.chatting(prompt=prompt if prompt is not None else text)

    print(f"Prompt:\n{prompt}")
    return result


def get_text():
    """Render the microphone widget and transcribe the latest recording.

    Returns:
        The text produced by ``A2T.predict()`` for the recorded audio, or
        ``None`` when there is no recording to process or an error occurred
        (the error is printed, not raised).
    """
    try:
        mic = mic_recorder(start_prompt="Record", stop_prompt="Stop", just_once=True, use_container_width=True)
        if not mic:
            # The widget yields nothing until a recording is finished. That
            # is the normal idle state, so return quietly instead of tripping
            # the error handler on ``mic["bytes"]``.
            return None

        # Time only the speech-to-text step.
        start_time = time.perf_counter()
        a2t = A2T(mic["bytes"])
        text = a2t.predict()
        print(f"Text from A2T:\n{text}")
        execution_time = time.perf_counter() - start_time
        print(f"App.py -> get_text() -> time of execution A2T -> {execution_time}s")

        return text
    except Exception as e:
        # Best-effort: a failed transcription must not take the app down.
        print(f"An error occurred in get_text function, reasone is: {e}")
        return None

def speaking(text):
    """Answer *text*: get a reply, speak it, and show the exchange.

    Blank or missing input is ignored. Otherwise the reply from
    ``switching`` is stripped of speaker labels, played through
    ``t2a.autoplay`` and, when non-empty, rendered with ``st.markdown``.
    Any exception is printed rather than raised.

    Args:
        text: Transcribed user request; may be ``None`` or blank.

    Returns:
        None.
    """
    try:
        if not text or not text.strip():
            return

        print(f"Checking for execution this part {random.randint(0, 5)}")
        output = switching(text)
        if output is None:
            # switching() returns None when it has nothing to say (e.g. an
            # image request before any image was uploaded). Feeding None to
            # the regex cleaner would raise TypeError, so stop here.
            return

        response = remove_labels_with_regex(text=output)
        # Time only the text-to-speech step.
        start_time_t2a = time.perf_counter()
        t2a.autoplay(response)
        execution_time_t2a = time.perf_counter() - start_time_t2a
        print(f"App.py -> speaking() -> time of execution T2A -> {execution_time_t2a}s")
        print(ic.pil_image)

        if response:
            st.markdown(f"Your input: {text}")
            st.markdown(f"Chelsea response: {response}")

    except Exception as e:
        # Best-effort: log and carry on so the UI stays responsive.
        print(f"An error occurred in speaking function, reasone is: {e}")

def main():
    """Run one interaction: capture speech, log it, then answer it.

    The transcription from ``get_text`` is printed twice for debugging and
    then handed to ``speaking``.
    """
    recognized = get_text()
    # NOTE: this binds a function-local name; without a ``global``
    # declaration the module-level ``layer_text`` is left unchanged.
    layer_text = recognized
    for shown in (recognized, layer_text):
        print(f"Print text: s{shown}s")
    speaking(recognized)
    print(f"Checking for execution main func {random.randint(0, 10)}")


if __name__ == "__main__":
    main()