Spaces:
Runtime error
Runtime error
File size: 4,017 Bytes
b3385db c6fd5b2 b3385db |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
import json
import logging
import re
from typing import Any, TypedDict
import httpx
from .utils import env_str
class WorkflowData(TypedDict):
    """Typed shape of the ``workflowData`` entry of a ``Document``.

    All keys are required. The four people lists hold strings; whether those
    are user IDs or display names is not visible here — TODO confirm.
    """

    # Current workflow state, stored as a plain string.
    status: str

    # People attached to each workflow role.
    writers: list[str]
    editors: list[str]
    proofers: list[str]
    reviewers: list[str]

    # Stored as a string; the date format is not shown here — TODO confirm.
    proofingDeadline: str
class Document(TypedDict):
    """Typed shape of a document record handled by this module.

    All keys are required. The integer time fields (``created``,
    ``modified``, ``published``, ``firstPublished``, ``lastPublished``) are
    presumably epoch timestamps; the unit is not visible here — TODO confirm.
    """

    # Store-level identity and revision.
    _id: str
    _rev: str

    # Classification.
    type: str
    mimetype: str
    title: str
    language: str
    workflowData: WorkflowData

    # Location of the document.
    path: str
    name: str

    # Audit trail.
    created: int
    creator: str
    lastPublished: int
    firstPublished: int
    modified: int
    modifier: str
    published: int

    # Editorial content and linked assets.
    authors: list[str]
    content: str
    contentAssets: list[str]
    featuredImages: list[str]
    keywords: list[str]
    topics: list[str]
    relatedAssets: list[str]
    comments: bool
    campaignConfigs: list[Any]
    order: int
    overline: str
    translatedFrom: str

    # Social-sharing overrides; element shape is not specified here.
    socialTitles: list[Any]
    socialDescriptions: list[Any]
    socialFeaturedImages: list[Any]

    # Presentation metadata.
    underline: str
    template: str
    description: str
    suggestedImages: list[str]
    publisher: str
class DocumentManager:
    """Async accessor for documents served by the ``DOCS_URL`` HTTP API.

    A document can be fetched by ID, by path (through the view named by
    ``DOCS_PATH_VIEW``), or from a free-form string that contains either.
    Request and decoding failures are logged and reported as ``None``
    instead of being raised.

    The instance owns an ``httpx.AsyncClient``. Release it with ``aclose()``
    or by using the manager in an ``async with`` block.

    NOTE(review): the view parameters (``key``/``include_docs``/``limit``)
    and the ``rows``/``doc`` response layout look CouchDB-style — confirm.
    """

    def __init__(self) -> None:
        self.client = self.make_client()
        self.path_view = env_str("DOCS_PATH_VIEW")

    async def __aenter__(self) -> "DocumentManager":
        return self

    async def __aexit__(self, *exc_info: object) -> None:
        await self.aclose()

    async def aclose(self) -> None:
        """Close the underlying HTTP client and its pooled connections."""
        await self.client.aclose()

    def make_client(self) -> httpx.AsyncClient:
        """Build the HTTP client from ``DOCS_URL`` and ``DOCS_AUTH``.

        ``DOCS_AUTH`` is sent verbatim after ``Basic `` in the
        ``Authorization`` header, so it must already be the encoded
        credential string.
        """
        base_url = env_str("DOCS_URL")
        auth = env_str("DOCS_AUTH")
        headers = {"Authorization": f"Basic {auth}"}
        return httpx.AsyncClient(base_url=base_url, headers=headers)

    async def get_doc_by_id(self, doc_id: str) -> Document | None:
        """Fetch the document whose ID is *doc_id*.

        Returns ``None`` when *doc_id* is empty, when the server answers 404,
        or when the request or JSON decoding fails (the failure is logged).
        """
        if not doc_id:
            # An empty ID would request the base URL itself rather than a
            # document, and that payload would be returned as a "document".
            return None
        try:
            response = await self.client.get(doc_id)
            if response.status_code == 404:
                return None
            response.raise_for_status()
            return response.json()
        except Exception:
            # Best-effort lookup: callers treat any failure as "not found".
            logging.exception("Error fetching document by ID")
            return None

    async def get_doc_by_path(self, path: str) -> Document | None:
        """Fetch the first document the path view returns for *path*.

        Returns ``None`` when the view has no row for *path* or when the
        request or response handling fails (the failure is logged).
        """
        params = {
            "limit": "1",
            # The view key is sent JSON-encoded, i.e. as a quoted string.
            "key": json.dumps(path),
            "include_docs": "true",
        }
        try:
            response = await self.client.get(self.path_view, params=params)
            response.raise_for_status()
            rows = response.json()["rows"]
            if not rows:
                return None
            return rows[0]["doc"]
        except Exception:
            # Best-effort lookup: callers treat any failure as "not found".
            logging.exception("Error fetching document by path")
            return None

    async def get_doc(self, id_or_path: str) -> Document | None:
        """Resolve *id_or_path* to a document.

        Every UUID found in the string is tried as a document ID, in order of
        appearance. If none yields a document, the string is interpreted as a
        ``.html`` path or URL and looked up through the path view.
        """
        for candidate_id in extract_doc_ids(id_or_path):
            doc = await self.get_doc_by_id(candidate_id)
            if doc:
                return doc
        path = extract_doc_path(id_or_path)
        if path:
            return await self.get_doc_by_path(path)
        return None
# Matches lowercase-hex UUIDs in 8-4-4-4-12 layout whose third group starts
# with 1-5 and whose fourth group starts with 8, 9, a or b. The pattern is
# unanchored, so it finds UUIDs embedded anywhere in a longer string.
UUID_PATTERN = re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}")


def extract_doc_ids(s: str) -> list[str]:
    """Return every UUID-shaped substring of *s*, in order of appearance.

    The result is an empty list when *s* contains no match.
    """
    return [match.group(0) for match in UUID_PATTERN.finditer(s)]
def extract_doc_path(s: str) -> str | None:
if not s.endswith(".html"):
return None
if s.startswith("/"):
return s
if "://" in s:
s = s.split("://", 1)[1]
if "/" in s:
return "/" + s.split("/", 1)[1]
return None
# Shared module-level instance. Creating it at import time runs
# DocumentManager.__init__, which reads DOCS_URL, DOCS_AUTH and
# DOCS_PATH_VIEW through env_str.
# NOTE(review): when this file is run as a script, this line executes before
# the load_dotenv() call in the __main__ block — confirm the variables are
# already present in the environment at that point.
document_manager = DocumentManager()
if __name__ == "__main__":
    # Script-only dependencies are imported here so that importing this
    # module as a library does not require them.
    import asyncio

    from dotenv import load_dotenv

    async def main() -> None:
        """Fetch one sample document and print it as indented JSON."""
        db = DocumentManager()
        try:
            # Alternative lookups for manual experimentation:
            # result = await db.get_doc_by_id("b7fdc644-5b24-40ae-b489-37b3fc0c5541")
            # result = await db.get_doc_by_path("/en/articles/2024/11/28/slci-n28.html")
            # result = await db.get_doc("https://www.cnn.com/en/articles/2024/11/28/slci-n28.html")
            result = await db.get_doc("https://bbc.com/news/the-2024-us-elections-efb37bf1-16bb-4bbb-88ce-4273cf657c11")
            print(json.dumps(result, indent=2))
        finally:
            # Nothing else closes the client created for this manager, so
            # release its connections before the event loop shuts down.
            await db.client.aclose()

    load_dotenv()
    asyncio.run(main())
|