import logging
from typing import List, Optional

from fastapi import Body, File, Form, HTTPException, UploadFile
from fastapi.responses import StreamingResponse
from numpy import clip
from pydantic import BaseModel, Field

from modules.api.impl.handler.TTSHandler import TTSHandler
from modules.api.impl.model.audio_model import AdjustConfig, AudioFormat
from modules.api.impl.model.chattts_model import ChatTTSConfig, InferConfig
from modules.api.impl.model.enhancer_model import EnhancerConfig
from modules.api import utils as api_utils
from modules.api.Api import APIManager
from modules.speaker import Speaker, speaker_mgr
from modules.data import styles_mgr

logger = logging.getLogger(__name__)


class AudioSpeechRequest(BaseModel):
    """JSON request body accepted by the ``/v1/audio/speech`` endpoint."""

    # Text to synthesize.
    input: str

    model: str = "chattts-4w"
    voice: str = "female2"
    response_format: AudioFormat = "mp3"

    speed: float = Field(1, ge=0.1, le=10, description="Speed of the audio")
    seed: int = 42

    temperature: float = 0.3
    top_k: int = 20
    top_p: float = 0.7

    style: str = ""
    batch_size: int = Field(1, ge=1, le=20, description="Batch size")
    spliter_threshold: float = Field(
        100, ge=10, le=1024, description="Threshold for sentence spliter"
    )
    # End-of-sentence marker.
    eos: str = "[uv_break]"

    enhance: bool = False
    denoise: bool = False


async def openai_speech_api(
    request: AudioSpeechRequest = Body(
        ..., description="JSON body with model, input text, and voice"
    )
):
    """Synthesize speech for ``request.input`` and stream the audio back.

    Args:
        request: Synthesis options; see ``AudioSpeechRequest``.

    Returns:
        A ``StreamingResponse`` over the buffer produced by
        ``TTSHandler.enqueue_to_buffer``. The media type is ``audio/mpeg``
        for mp3 and ``audio/<format value>`` for every other format.

    Raises:
        HTTPException: 400 when the input text is empty, the voice is
            unknown, or the style lookup fails; 500 (with the error text as
            detail) when synthesis raises any other exception.
    """
    input_text = request.input
    voice = request.voice
    style = request.style

    # The field default is the plain string "mp3", so the value may still be
    # a str here; coerce it to the enum before `.value` is read below.
    response_format = request.response_format
    if isinstance(response_format, str) and not isinstance(
        response_format, AudioFormat
    ):
        response_format = AudioFormat(response_format)

    # Same bounds as the Field constraint on `speed`; kept as a defensive clamp.
    speed = clip(request.speed, 0.1, 10)

    if not input_text:
        raise HTTPException(status_code=400, detail="Input text is required.")
    if speaker_mgr.get_speaker(voice) is None:
        raise HTTPException(status_code=400, detail="Invalid voice.")

    if style:
        try:
            styles_mgr.find_item_by_name(style)
        except Exception as e:
            # Any lookup failure is reported to the client as an invalid style.
            raise HTTPException(status_code=400, detail="Invalid style.") from e

    ctx_params = api_utils.calc_spk_style(spk=voice, style=style)
    speaker = ctx_params.get("spk")
    if not isinstance(speaker, Speaker):
        raise HTTPException(status_code=400, detail="Invalid voice.")

    tts_config = ChatTTSConfig(
        style=style,
        temperature=request.temperature,
        top_k=request.top_k,
        top_p=request.top_p,
    )
    infer_config = InferConfig(
        batch_size=request.batch_size,
        spliter_threshold=request.spliter_threshold,
        eos=request.eos,
        seed=request.seed,
    )
    adjust_config = AdjustConfig(speaking_rate=speed)
    # Either flag enables the enhancer; `denoise` selects the larger lambd.
    enhancer_config = EnhancerConfig(
        enabled=request.enhance or request.denoise,
        lambd=0.9 if request.denoise else 0.1,
    )

    try:
        handler = TTSHandler(
            text_content=input_text,
            spk=speaker,
            tts_config=tts_config,
            infer_config=infer_config,
            adjust_config=adjust_config,
            enhancer_config=enhancer_config,
        )
        buffer = handler.enqueue_to_buffer(response_format)

        if response_format == AudioFormat.mp3:
            mime_type = "audio/mpeg"
        else:
            mime_type = f"audio/{response_format.value}"
        return StreamingResponse(buffer, media_type=mime_type)
    except Exception as e:
        logger.exception("openai_speech_api failed: %s", e)
        if isinstance(e, HTTPException):
            # Already an HTTP error: propagate it unchanged.
            raise
        raise HTTPException(status_code=500, detail=str(e)) from e


class TranscribeSegment(BaseModel):
    """One segment entry of a verbose transcription response."""

    id: int
    seek: float
    start: float
    end: float
    text: str
    tokens: list[int]
    temperature: float
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float


class TranscriptionsVerboseResponse(BaseModel):
    """Response model declared for the ``/v1/audio/transcriptions`` endpoint."""

    task: str
    language: str
    duration: float
    text: str
    segments: list[TranscribeSegment]


def setup(app: APIManager):
    """Register the speech and transcription routes on ``app``.

    Args:
        app: API manager whose ``post`` decorator is used for registration.
    """
    # The description is user-facing markdown. Its Chinese text states that
    # batch_size, spliter_threshold and style are custom attributes of this
    # system that are not part of the OpenAI documentation.
    app.post(
        "/v1/audio/speech",
        description="""
openai api document:
[https://platform.openai.com/docs/guides/text-to-speech](https://platform.openai.com/docs/guides/text-to-speech)

以下属性为本系统自定义属性,不在openai文档中:
- batch_size: 是否开启batch合成,小于等于1表示不使用batch (不推荐)
- spliter_threshold: 开启batch合成时,句子分割的阈值
- style: 风格

> model 可填任意值
""",
    )(openai_speech_api)

    @app.post(
        "/v1/audio/transcriptions",
        response_model=TranscriptionsVerboseResponse,
        description="Transcribes audio into the input language.",
    )
    async def transcribe(
        file: UploadFile = File(...),
        model: str = Form(...),
        language: Optional[str] = Form(None),
        prompt: Optional[str] = Form(None),
        response_format: str = Form("json"),
        temperature: float = Form(0),
        timestamp_granularities: List[str] = Form(["segment"]),
    ):
        """Placeholder transcription endpoint; every form field is ignored."""
        # TODO: Implement transcribe
        # NOTE(review): the placeholder payload presumably does not match
        # TranscriptionsVerboseResponse, so response validation may reject
        # it - confirm, and consider returning HTTP 501 until implemented.
        return api_utils.success_response("not implemented yet")