#####################################################
### DOCUMENT PROCESSOR [PARSERS]
#####################################################
# Jonathan Wang
# ABOUT:
# This project creates an app to chat with PDFs.
# This file contains the PARSERS.
# They chunk raw text into LlamaIndex nodes,
# e.g., by semantic (embedding) similarity or by sentence.
#####################################################
# TODO Board:
# Add more stuff
#####################################################
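## USAGE (sketch)
# A minimal flow, assuming the caller already has LlamaIndex Documents and an
# embedding model configured; the names below are illustrative:
#   parser = get_parser(embed_model=Settings.embed_model)
#   nodes = parser.get_nodes_from_documents(documents)
# The resulting nodes keep metadata and prev/next relationships for RAG.
#####################################################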
## IMPORTS
from __future__ import annotations
from typing import TYPE_CHECKING, Callable, List, Optional
from llama_index.core import Settings
from llama_index.core.node_parser import (
SemanticSplitterNodeParser,
SentenceWindowNodeParser,
)
if TYPE_CHECKING:
from llama_index.core.base.embeddings.base import BaseEmbedding
from llama_index.core.callbacks import CallbackManager
from llama_index.core.node_parser.interface import NodeParser
# from wtpsplit import SaT  # lazy-loaded: only imported if SaT-based sentence splitting is enabled
#####################################################
## CODE
# def sentence_splitter_from_SaT(model: Optional[SaT]) -> Callable[[str], List[str]]:
#     """Convert a SaT (Segment any Text) model into a sentence-splitter function.
#
#     Args:
#         model (SaT): The SaT sentence-segmentation model.
#
#     Returns:
#         Callable[[str], List[str]]: A sentence-splitting function backed by the SaT model.
#     """
#     model = model or ss.model
#     if model is None:
#         raise ValueError("Sentence splitting model is not set.")
#
#     def sentence_splitter(text: str) -> List[str]:
#         segments = model.split(text_or_texts=text)
#         if isinstance(segments, list):
#             return segments
#         return list(segments)  # type: ignore  # split() may also return a generator
#
#     return sentence_splitter
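# If SaT splitting is re-enabled, a minimal wiring sketch (the model name is
# illustrative; wtpsplit ships several "sat-*" checkpoints):
#   from wtpsplit import SaT
#   sat_model = SaT("sat-3l-sm")  # lazy-load only when actually needed
#   splitter = sentence_splitter_from_SaT(sat_model)
#   parser = get_parser(embed_model=Settings.embed_model, sentence_splitter=splitter)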
# @st.cache_resource # can't cache because embed_model is not hashable.
def get_parser(
embed_model: BaseEmbedding,
# sentence_model: Optional[SaT] = None,
sentence_splitter: Optional[Callable[[str], List[str]]] = None,
callback_manager: Optional[CallbackManager] = None
) -> NodeParser:
"""Parse RAG document processing (main one)."""
# if (sentence_model is not None) and (sentence_splitter is not None):
# sentence_splitter = sentence_splitter_from_SaT(sentence_model)
return SemanticSplitterNodeParser.from_defaults(
embed_model=embed_model,
breakpoint_percentile_threshold=95,
buffer_size=3,
sentence_splitter=sentence_splitter,
callback_manager=callback_manager or Settings.callback_manager,
include_metadata=True,
include_prev_next_rel=True,
)
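# Example (sketch): any Callable[[str], List[str]] can serve as the sentence
# splitter; a naive regex-based splitter could be wired in like so
# (the helper name is illustrative):
#   import re
#   def naive_sentence_splitter(text: str) -> List[str]:
#       return re.split(r"(?<=[.!?])\s+", text)
#   parser = get_parser(embed_model=Settings.embed_model,
#                       sentence_splitter=naive_sentence_splitter)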
# @st.cache_resource
# def get_sentence_parser(splitter_model: Optional[SaT] = None) -> SentenceWindowNodeParser:
#     """Special sentence-level parser to get the document's requested-info section."""
#     sentence_splitter = None
#     if splitter_model is not None:
#         sentence_splitter = sentence_splitter_from_SaT(splitter_model)
#     sentence_parser = SentenceWindowNodeParser.from_defaults(
#         sentence_splitter=sentence_splitter,
#         window_size=0,
#         window_metadata_key="window",
#         original_text_metadata_key="original_text",
#     )
#     return sentence_parser
def get_sentence_parser() -> SentenceWindowNodeParser:
"""Parse sentences to get the document requested info section."""
# if (splitter_model is not None):
# sentence_splitter = sentence_splitter_from_SaT(splitter_model)
return SentenceWindowNodeParser.from_defaults(
# sentence_splitter=sentence_splitter,
window_size=0,
window_metadata_key="window",
original_text_metadata_key="original_text",
)
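

# Minimal, optional smoke test (sketch): exercises only the sentence-level
# parser, since it needs no embedding model; the sample text is illustrative.
if __name__ == "__main__":
    from llama_index.core import Document

    demo_parser = get_sentence_parser()
    demo_nodes = demo_parser.get_nodes_from_documents(
        [Document(text="First sentence. Second sentence. Third sentence.")]
    )
    for demo_node in demo_nodes:
        # With window_size=0, each node is a single sentence and its raw text
        # is stored under the "original_text" metadata key.
        print(demo_node.metadata.get("original_text"))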