ChatTTS-Forge

Sleeping

ChatTTS-Forge / modules /api /impl /google_api.py

zhzluke96

update

d2b7e94 6 months ago

5.6 kB

	from typing import Union

	from fastapi import HTTPException
	from pydantic import BaseModel

	from modules.api import utils as api_utils
	from modules.api.Api import APIManager
	from modules.api.impl.handler.SSMLHandler import SSMLHandler
	from modules.api.impl.handler.TTSHandler import TTSHandler
	from modules.api.impl.model.audio_model import AdjustConfig, AudioFormat
	from modules.api.impl.model.chattts_model import ChatTTSConfig, InferConfig
	from modules.api.impl.model.enhancer_model import EnhancerConfig
	from modules.speaker import Speaker, speaker_mgr


	class SynthesisInput(BaseModel):
	    # Content to synthesize. Both fields are optional at the schema level;
	    # the endpoint in this module uses ``text`` when it is non-empty,
	    # otherwise ``ssml``, and rejects the request when neither is set.

	    # Plain text to be spoken.
	    text: Union[str, None] = None

	    # SSML document to be spoken (only consulted when ``text`` is empty).
	    ssml: Union[str, None] = None


	class VoiceSelectionParams(BaseModel):
	    # Voice selection plus sampling parameters for one synthesis request.

	    # Present for Google API compatibility; the endpoint in this module
	    # reads it but does not act on it yet.
	    languageCode: str = "ZH-CN"

	    # Speaker name and optional style, resolved through calc_spk_style.
	    name: str = "female2"
	    style: str = ""

	    # Sampling parameters forwarded to ChatTTSConfig.
	    temperature: float = 0.3
	    topP: float = 0.7
	    topK: int = 20

	    # Inference seed forwarded to InferConfig.
	    seed: int = 42

	    # End-of-sentence token forwarded to InferConfig.
	    eos: str = "[uv_break]"


	class AudioConfig(BaseModel):
	    # Output encoding, post-processing and batching options.

	    # Encoding of the returned audio; also determines the MIME type used in
	    # the data URI of the response.
	    audioEncoding: AudioFormat = AudioFormat.mp3

	    # Post-processing adjustments forwarded to AdjustConfig.
	    speakingRate: float = 1
	    pitch: float = 0
	    volumeGainDb: float = 0

	    # Accepted for Google API compatibility; the endpoint in this module
	    # does not apply it yet.
	    sampleRateHertz: int = 24000

	    # Inference batching and text-splitting settings forwarded to
	    # InferConfig.
	    batchSize: int = 4
	    spliterThreshold: int = 100


	class GoogleTextSynthesizeRequest(BaseModel):
	    # Request body of the Google-compatible ``text:synthesize`` endpoint.

	    # What to speak (plain text or SSML).
	    input: SynthesisInput

	    # Which speaker to use and how to sample.
	    voice: VoiceSelectionParams

	    # Output encoding and audio post-processing options.
	    audioConfig: AudioConfig

	    # Optional enhancer settings. Declared as an explicit optional so that
	    # both an omitted field and an explicit JSON ``null`` are accepted; a
	    # bare ``EnhancerConfig = None`` annotation rejects ``null`` under
	    # pydantic v2 even though ``None`` is the declared default.
	    enhancerConfig: Union[EnhancerConfig, None] = None


	class GoogleTextSynthesizeResponse(BaseModel):
	    # Response body of the ``text:synthesize`` endpoint.

	    # The endpoint in this module fills this with a data URI of the form
	    # ``data:<mime type>;base64,<payload>``.
	    audioContent: str


	async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
	    # Synthesize speech for a Google-style ``text:synthesize`` request.
	    #
	    # Returns a dict ``{"audioContent": "data:<mime>;base64,<payload>"}``.
	    #
	    # Raises HTTPException:
	    #   422 - the voice name is not a known speaker, or neither ``text``
	    #         nor ``ssml`` was provided.
	    #   500 - any other failure while building or running the handler.
	    import logging

	    synthesis_input = request.input
	    voice = request.voice
	    audio_config = request.audioConfig

	    # TODO: voice.languageCode is not used yet; it should perhaps be passed
	    # to the normalizer.
	    # TODO: audioConfig.sampleRateHertz is accepted but not applied yet.

	    audio_format = audio_config.audioEncoding
	    if not isinstance(audio_format, AudioFormat) and isinstance(audio_format, str):
	        audio_format = AudioFormat(audio_format)

	    params = api_utils.calc_spk_style(spk=voice.name, style=voice.style)

	    # Although calc_spk_style can also parse the seed form, this endpoint
	    # only intends to support speakers that exist in the speakers list.
	    if speaker_mgr.get_speaker(voice.name) is None:
	        raise HTTPException(
	            status_code=422, detail="The specified voice name is not supported."
	        )

	    speaker = params.get("spk")
	    if not isinstance(speaker, Speaker):
	        raise HTTPException(
	            status_code=422, detail="The specified voice name is not supported."
	        )

	    tts_config = ChatTTSConfig(
	        style=params.get("style", ""),
	        temperature=voice.temperature,
	        top_k=voice.topK,
	        top_p=voice.topP,
	    )
	    # NOTE: the ``or`` fallbacks replace any falsy value (0, "") with the
	    # default, not only a missing one.
	    infer_config = InferConfig(
	        batch_size=audio_config.batchSize or 1,
	        spliter_threshold=audio_config.spliterThreshold or 100,
	        eos=voice.eos or "[uv_break]",
	        seed=voice.seed or 42,
	    )
	    adjust_config = AdjustConfig(
	        speaking_rate=audio_config.speakingRate or 1,
	        pitch=audio_config.pitch or 0,
	        volume_gain_db=audio_config.volumeGainDb or 0,
	    )
	    enhancer_config = request.enhancerConfig

	    # mp3 is served under its registered MIME type; every other format
	    # uses ``audio/<format value>``.
	    if audio_format == AudioFormat.mp3:
	        mime_type = "audio/mpeg"
	    else:
	        mime_type = f"audio/{audio_format.value}"

	    try:
	        # Plain text takes precedence over SSML when both are supplied.
	        if synthesis_input.text:
	            handler = TTSHandler(
	                text_content=synthesis_input.text,
	                spk=speaker,
	                tts_config=tts_config,
	                infer_config=infer_config,
	                adjust_config=adjust_config,
	                enhancer_config=enhancer_config,
	            )
	        elif synthesis_input.ssml:
	            handler = SSMLHandler(
	                ssml_content=synthesis_input.ssml,
	                infer_config=infer_config,
	                adjust_config=adjust_config,
	                enhancer_config=enhancer_config,
	            )
	        else:
	            raise HTTPException(
	                status_code=422, detail="Invalid input text or ssml specified."
	            )

	        base64_string = handler.enqueue_to_base64(format=audio_format)
	        return {"audioContent": f"data:{mime_type};base64,{base64_string}"}
	    except HTTPException as e:
	        # Deliberate HTTP errors are logged and passed through unchanged.
	        logging.exception(e)
	        raise
	    except Exception as e:
	        # Anything unexpected becomes a 500 that keeps the original cause.
	        logging.exception(e)
	        raise HTTPException(status_code=500, detail=str(e)) from e


	def setup(app: APIManager):
	    """Register the Google-compatible synthesize endpoint on *app*.

	    Binds ``google_text_synthesize`` to ``POST /v1/text:synthesize`` with
	    ``GoogleTextSynthesizeResponse`` as the response model.
	    """
	    # app.post(...) returns the registering callable; applying it to the
	    # handler is the same as using it as a decorator.
	    register_route = app.post(
	        "/v1/text:synthesize",
	        response_model=GoogleTextSynthesizeResponse,
	        description="""
	google api document: <br/>
	[https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize](https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize)

	- 多个属性在本系统中无用仅仅是为了兼容google api
	- voice 中的 topP, topK, temperature 为本系统中的参数
	- voice.name 即 speaker name （或者speaker seed）
	- voice.seed 为 infer seed （可在webui中测试具体作用）

	- 编码格式影响的是 audioContent 的二进制格式，所以所有format都是返回带有base64数据的json
	""",
	    )
	    register_route(google_text_synthesize)