#####################################################
### DOCUMENT PROCESSOR [PARSERS]
#####################################################
# Jonathan Wang
# ABOUT:
# This project creates an app to chat with PDFs.
# This is the PARSERS.
# It chunks Raw Text into LlamaIndex nodes
# E.g., by embedding meaning, by sentence, ...
#####################################################
# TODO Board:
# Add more stuff
#####################################################
## IMPORTS
from __future__ import annotations
from typing import TYPE_CHECKING, Callable, List, Optional
from llama_index.core import Settings
from llama_index.core.node_parser import (
SemanticSplitterNodeParser,
SentenceWindowNodeParser,
)
if TYPE_CHECKING:
from llama_index.core.base.embeddings.base import BaseEmbedding
from llama_index.core.callbacks import CallbackManager
from llama_index.core.node_parser.interface import NodeParser
# from wtpsplit import SaT
# Lazy Loading
#####################################################
## CODE
# def sentence_splitter_from_SaT(model: Optional[SaT]) -> Callable[[str], List[str]]:
# """Convert a SaT model into a sentence splitter function.
# Args:
# model (SaT): The Segment Anything model.
# Returns:
# Callable[[str], List[str]]: The sentence splitting function using the SaT model.
# """
# model = model or ss.model
# if model is None:
# raise ValueError("Sentence splitting model is not set.")
# def sentence_splitter(text: str) -> List[str]:
# segments = model.split(text_or_texts=text)
# if isinstance(segments, list):
# return segments
# else:
# return list(segments) # type: ignore (generator is the other option?)
# return (sentence_splitter)
# @st.cache_resource # can't cache because embed_model is not hashable.
def get_parser(
    embed_model: BaseEmbedding,
    sentence_splitter: Optional[Callable[[str], List[str]]] = None,
    callback_manager: Optional[CallbackManager] = None
) -> NodeParser:
    """Build the main semantic node parser for RAG document processing.

    Args:
        embed_model (BaseEmbedding): Embedding model used to locate semantic
            breakpoints between chunks.
        sentence_splitter (Optional[Callable[[str], List[str]]]): Custom
            sentence-splitting function; the parser's own default is used
            when this is None.
        callback_manager (Optional[CallbackManager]): Callback manager for
            instrumentation; falls back to the global
            ``Settings.callback_manager`` when not provided.

    Returns:
        NodeParser: A ``SemanticSplitterNodeParser`` that breaks text at the
        95th-percentile embedding-distance threshold with a 3-sentence
        buffer, keeping metadata and prev/next relationships on each node.
    """
    # Fall back to the library-wide callback manager when none is supplied.
    manager = callback_manager or Settings.callback_manager
    parser_kwargs = {
        "embed_model": embed_model,
        "breakpoint_percentile_threshold": 95,
        "buffer_size": 3,
        "sentence_splitter": sentence_splitter,
        "callback_manager": manager,
        "include_metadata": True,
        "include_prev_next_rel": True,
    }
    return SemanticSplitterNodeParser.from_defaults(**parser_kwargs)
# @st.cache_resource
# def get_sentence_parser(splitter_model: Optional[SaT] = None) -> SentenceWindowNodeParser:
# """Special sentence-level parser to get the document requested info section."""
# if (splitter_model is not None):
# sentence_splitter = sentence_splitter_from_SaT(splitter_model)
# sentence_parser = SentenceWindowNodeParser.from_defaults(
# sentence_splitter=sentence_splitter,
# window_size=0,
# window_metadata_key="window",
# original_text_metadata_key="original_text",
# )
# return (sentence_parser)
def get_sentence_parser() -> SentenceWindowNodeParser:
    """Build a sentence-level parser for the document requested-info section.

    Returns:
        SentenceWindowNodeParser: Splits documents into single-sentence nodes
        with no surrounding context (``window_size=0``), storing the window
        text under the "window" metadata key and the source sentence under
        "original_text".
    """
    sentence_parser = SentenceWindowNodeParser.from_defaults(
        window_size=0,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    return sentence_parser