File size: 4,017 Bytes
b3385db
 
 
 
 
 
 
c6fd5b2
b3385db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import json
import logging
import re
from typing import Any, TypedDict

import httpx

from .utils import env_str


class WorkflowData(TypedDict):
    """Typed shape of the ``workflowData`` entry of a :class:`Document`.

    This module only declares the keys; it never reads them. The notes on
    individual fields below are inferred from the names alone and should be
    confirmed against the document store's schema.
    """

    status: str
    # Presumably user identifiers grouped by editorial role -- TODO confirm.
    writers: list[str]
    editors: list[str]
    proofers: list[str]
    reviewers: list[str]
    # String-typed deadline; its format is not shown here -- TODO confirm.
    proofingDeadline: str


class Document(TypedDict):
    """Typed shape of a document returned by ``DocumentManager`` lookups.

    The lookup methods return the decoded JSON as-is, so this type is a
    static-typing aid only: nothing in this module validates that a response
    actually contains these keys.
    """

    # Presumably the store's document id and revision -- TODO confirm.
    _id: str
    _rev: str
    type: str
    mimetype: str
    title: str
    language: str
    workflowData: WorkflowData
    # Presumably the value matched by the path view used in
    # DocumentManager.get_doc_by_path -- verify against the view definition.
    path: str
    name: str
    # Integer timestamps; unit (seconds vs. milliseconds) is not shown here
    # -- TODO confirm.
    created: int
    creator: str
    lastPublished: int
    firstPublished: int
    modified: int
    modifier: str
    published: int
    authors: list[str]
    content: str
    contentAssets: list[str]
    featuredImages: list[str]
    keywords: list[str]
    topics: list[str]
    relatedAssets: list[str]
    comments: bool
    # Element types of the ``list[Any]`` fields are not specified here.
    campaignConfigs: list[Any]
    order: int
    overline: str
    translatedFrom: str
    socialTitles: list[Any]
    socialDescriptions: list[Any]
    socialFeaturedImages: list[Any]
    underline: str
    template: str
    description: str
    suggestedImages: list[str]
    publisher: str


class DocumentManager:
    """Async, read-only accessor for documents in an HTTP document store.

    Configuration is read from the environment through ``env_str``:
    ``DOCS_URL`` (client base URL), ``DOCS_AUTH`` (sent verbatim after
    ``Basic`` in the ``Authorization`` header) and ``DOCS_PATH_VIEW``
    (relative URL of the view queried by :meth:`get_doc_by_path`).

    The instance owns an ``httpx.AsyncClient``. Release it with
    :meth:`aclose`, or use the manager as an async context manager::

        async with DocumentManager() as db:
            doc = await db.get_doc(url)

    Any ``Exception`` raised while fetching is logged and reported to the
    caller as ``None``.
    """

    def __init__(self) -> None:
        self.client = self.make_client()
        self.path_view = env_str("DOCS_PATH_VIEW")

    def make_client(self) -> httpx.AsyncClient:
        """Build the HTTP client from ``DOCS_URL`` and ``DOCS_AUTH``."""
        base_url = env_str("DOCS_URL")
        auth = env_str("DOCS_AUTH")
        headers = {"Authorization": f"Basic {auth}"}
        return httpx.AsyncClient(base_url=base_url, headers=headers)

    async def aclose(self) -> None:
        """Close the underlying HTTP client and release its connections."""
        await self.client.aclose()

    async def __aenter__(self) -> "DocumentManager":
        return self

    async def __aexit__(self, *exc_info: object) -> None:
        await self.aclose()

    async def get_doc_by_id(self, doc_id: str) -> Document | None:
        """Fetch the document stored under *doc_id*.

        Args:
            doc_id: Document id, used as the URL relative to the base URL.

        Returns:
            The decoded JSON body, or ``None`` when *doc_id* is empty, the
            server answers 404, or the request fails (failures are logged).
        """
        if not doc_id:
            # An empty relative URL would request the base URL itself and
            # hand back whatever it serves as if it were a document.
            return None
        try:
            response = await self.client.get(doc_id)
            if response.status_code == 404:
                return None
            response.raise_for_status()
            return response.json()
        except Exception as e:
            # Best-effort lookup: report failure as "not found" to callers.
            logging.error("Error fetching document by ID", exc_info=e)
            return None

    async def get_doc_by_path(self, path: str) -> Document | None:
        """Fetch the document whose view key equals *path*.

        Queries ``self.path_view`` with the JSON-encoded path as ``key``,
        ``limit=1`` and ``include_docs=true``.

        Returns:
            The ``doc`` of the first returned row, or ``None`` when no row
            matches or the request fails (failures are logged).
        """
        try:
            params = {
                "limit": "1",
                "key": json.dumps(path),
                "include_docs": "true",
            }
            response = await self.client.get(self.path_view, params=params)
            response.raise_for_status()
            data = response.json()
            rows = data["rows"]
            if not rows:
                return None
            return rows[0]["doc"]
        except Exception as e:
            # Best-effort lookup: report failure as "not found" to callers.
            logging.error("Error fetching document by path", exc_info=e)
            return None

    async def get_doc(self, id_or_path: str) -> Document | None:
        """Resolve a document from a string containing a UUID or an HTML path.

        Every UUID found in *id_or_path* is tried in order of appearance; the
        first one that resolves wins. If none does, the string is interpreted
        as a ``.html`` path or URL and looked up through the path view.

        Returns:
            The matching document, or ``None`` if nothing could be resolved.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order, so a
        # string that repeats the same id does not trigger repeated requests.
        for doc_id in dict.fromkeys(extract_doc_ids(id_or_path)):
            doc = await self.get_doc_by_id(doc_id)
            if doc:
                return doc

        path = extract_doc_path(id_or_path)
        if path:
            return await self.get_doc_by_path(path)

        return None


# Lowercase-hex UUID: third group starts with a digit 1-5, fourth group starts
# with one of 8, 9, a, b. The pattern is unanchored, so it matches anywhere
# inside a longer string such as a URL slug.
UUID_PATTERN = re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}")


def extract_doc_ids(s: str) -> list[str]:
    """Return every substring of *s* that matches ``UUID_PATTERN``.

    Matches are non-overlapping and listed in order of appearance; the list
    is empty when *s* contains no match.
    """
    return [found.group(0) for found in UUID_PATTERN.finditer(s)]


def extract_doc_path(s: str) -> str | None:
    if not s.endswith(".html"):
        return None
    if s.startswith("/"):
        return s
    if "://" in s:
        s = s.split("://", 1)[1]
        if "/" in s:
            return "/" + s.split("/", 1)[1]
    return None


# Shared module-level instance. It is constructed at import time, so
# DocumentManager.__init__ calls env_str for DOCS_URL, DOCS_AUTH and
# DOCS_PATH_VIEW as soon as this module is imported.
# NOTE(review): when this file runs as a script, this line executes before the
# load_dotenv() call at the bottom -- confirm the variables are already set.
document_manager = DocumentManager()


if __name__ == "__main__":
    # Manual smoke test: resolve one document and print it as indented JSON.
    import asyncio

    from dotenv import load_dotenv

    async def main() -> None:
        """Look up a sample document and print the result."""
        db = DocumentManager()
        try:
            # Alternative lookups to try by hand:
            # result = await db.get_doc_by_id("b7fdc644-5b24-40ae-b489-37b3fc0c5541")
            # result = await db.get_doc_by_path("/en/articles/2024/11/28/slci-n28.html")
            # result = await db.get_doc("https://www.cnn.com/en/articles/2024/11/28/slci-n28.html")
            result = await db.get_doc("https://bbc.com/news/the-2024-us-elections-efb37bf1-16bb-4bbb-88ce-4273cf657c11")
            print(json.dumps(result, indent=2))
        finally:
            # Close the HTTP client so its connections are released before
            # the event loop shuts down.
            await db.client.aclose()

    load_dotenv()
    asyncio.run(main())