import json
import logging
import re
from typing import Any, TypedDict

import httpx

from tts_service.utils import env_str
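

# Typed shape of the editorial workflow metadata attached to each document.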
class WorkflowData(TypedDict):
    status: str
    writers: list[str]
    editors: list[str]
    proofers: list[str]
    reviewers: list[str]
    proofingDeadline: str
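

# Full document record as returned by the docs service; the _id/_rev pair
# suggests a CouchDB-style store.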
class Document(TypedDict):
    _id: str
    _rev: str
    type: str
    mimetype: str
    title: str
    language: str
    workflowData: WorkflowData
    path: str
    name: str
    created: int
    creator: str
    lastPublished: int
    firstPublished: int
    modified: int
    modifier: str
    published: int
    authors: list[str]
    content: str
    contentAssets: list[str]
    featuredImages: list[str]
    keywords: list[str]
    topics: list[str]
    relatedAssets: list[str]
    comments: bool
    campaignConfigs: list[Any]
    order: int
    overline: str
    translatedFrom: str
    socialTitles: list[Any]
    socialDescriptions: list[Any]
    socialFeaturedImages: list[Any]
    underline: str
    template: str
    description: str
    suggestedImages: list[str]
    publisher: str
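

# Thin async client for the docs HTTP API.  Connection details come from the
# DOCS_URL, DOCS_AUTH and DOCS_PATH_VIEW environment variables.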
class DocumentManager:
    def __init__(self) -> None:
        self.client = self.make_client()
        self.path_view = env_str("DOCS_PATH_VIEW")

    def make_client(self) -> httpx.AsyncClient:
        base_url = env_str("DOCS_URL")
        auth = env_str("DOCS_AUTH")
        headers = {"Authorization": f"Basic {auth}"}
        client = httpx.AsyncClient(base_url=base_url, headers=headers)
        return client
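
    # Fetch a single document by its ID; returns None on a 404 or any error.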
    async def get_doc_by_id(self, doc_id: str) -> Document | None:
        try:
            response = await self.client.get(doc_id)
            if response.status_code == 404:
                return None
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logging.error("Error fetching document by ID", exc_info=e)
            return None
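
    # Look up a document by its site path via the configured view
    # (CouchDB-style: the path is the view key, include_docs inlines the doc).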
    async def get_doc_by_path(self, path: str) -> Document | None:
        try:
            params = {
                "limit": "1",
                "key": json.dumps(path),
                "include_docs": "true",
            }
            response = await self.client.get(self.path_view, params=params)
            response.raise_for_status()
            data = response.json()
            rows = data["rows"]
            if not rows:
                return None
            return rows[0]["doc"]
        except Exception as e:
            logging.error("Error fetching document by path", exc_info=e)
            return None
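
    # Resolve an ID, URL, or path to a document: try any embedded UUIDs first,
    # then fall back to a path lookup.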
    async def get_doc(self, id_or_path: str) -> Document | None:
        uuids = extract_doc_ids(id_or_path)
        for uuid in uuids:
            doc = await self.get_doc_by_id(uuid)
            if doc:
                return doc
        path = extract_doc_path(id_or_path)
        if path:
            return await self.get_doc_by_path(path)
        return None
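

# Matches lowercase, hyphenated RFC 4122 UUIDs (versions 1-5).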
UUID_PATTERN = re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}")


def extract_doc_ids(s: str) -> list[str]:
    return UUID_PATTERN.findall(s)
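

# Extract a site-relative path ("/....html") from a bare path or a full URL,
# e.g. "https://example.com/en/articles/a.html" -> "/en/articles/a.html".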
def extract_doc_path(s: str) -> str | None:
    if not s.endswith(".html"):
        return None
    if s.startswith("/"):
        return s
    if "://" in s:
        s = s.split("://", 1)[1]
    if "/" in s:
        return "/" + s.split("/", 1)[1]
    return None
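

# Shared module-level instance; its DOCS_* env lookups happen at import time.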
document_manager = DocumentManager()
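

# Ad-hoc manual test.  Note that the .env values loaded below are not visible
# to the module-level document_manager constructed above at import time.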
if __name__ == "__main__":
    import asyncio

    from dotenv import load_dotenv

    load_dotenv()

    async def main() -> None:
        db = DocumentManager()
        # result = await db.get_doc_by_id("b7fdc644-5b24-40ae-b489-37b3fc0c5541")
        # result = await db.get_doc_by_path("/en/articles/2024/11/28/slci-n28.html")
        # result = await db.get_doc("https://www.cnn.com/en/articles/2024/11/28/slci-n28.html")
        result = await db.get_doc("https://bbc.com/news/the-2024-us-elections-efb37bf1-16bb-4bbb-88ce-4273cf657c11")
        print(json.dumps(result, indent=2))

    asyncio.run(main())