#####################################################
### DOCUMENT PROCESSOR [PARSERS]
#####################################################
# Jonathan Wang
# ABOUT:
# This project creates an app to chat with PDFs.
# This is the PARSERS module.
# It chunks raw text into LlamaIndex nodes,
# e.g., by embedding meaning, by sentence, ...
#####################################################
# TODO Board:
# Add more stuff
#####################################################
## IMPORTS
from __future__ import annotations
from typing import TYPE_CHECKING, Callable, List, Optional
from llama_index.core import Settings
from llama_index.core.node_parser import (
    SemanticSplitterNodeParser,
    SentenceWindowNodeParser,
)
if TYPE_CHECKING:
    from llama_index.core.base.embeddings.base import BaseEmbedding
    from llama_index.core.callbacks import CallbackManager
    from llama_index.core.node_parser.interface import NodeParser
# from wtpsplit import SaT
# Lazy Loading
#####################################################
## CODE
# def sentence_splitter_from_SaT(model: Optional[SaT]) -> Callable[[str], List[str]]:
#     """Convert a SaT model into a sentence splitter function.
#
#     Args:
#         model (SaT): The SaT (Segment any Text) sentence segmentation model.
#
#     Returns:
#         Callable[[str], List[str]]: The sentence splitting function using the SaT model.
#     """
#     model = model or ss.model
#     if model is None:
#         raise ValueError("Sentence splitting model is not set.")
#     def sentence_splitter(text: str) -> List[str]:
#         segments = model.split(text_or_texts=text)
#         if isinstance(segments, list):
#             return segments
#         return list(segments)  # type: ignore  # split() may yield a generator
#     return sentence_splitter
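# Example (a minimal sketch, assuming the commented-out sentence_splitter_from_SaT
# above is re-enabled; "sat-3l-sm" is an illustrative wtpsplit checkpoint name and
# my_embed_model is a placeholder for any BaseEmbedding):
#
# from wtpsplit import SaT
#
# sat_model = SaT("sat-3l-sm")
# splitter = sentence_splitter_from_SaT(sat_model)
# parser = get_parser(embed_model=my_embed_model, sentence_splitter=splitter)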
# @st.cache_resource  # can't cache because embed_model is not hashable.
def get_parser(
    embed_model: BaseEmbedding,
    # sentence_model: Optional[SaT] = None,
    sentence_splitter: Optional[Callable[[str], List[str]]] = None,
    callback_manager: Optional[CallbackManager] = None,
) -> NodeParser:
    """Build the main node parser for RAG document processing (semantic chunking)."""
    # if (sentence_model is not None) and (sentence_splitter is None):
    #     sentence_splitter = sentence_splitter_from_SaT(sentence_model)
    return SemanticSplitterNodeParser.from_defaults(
        embed_model=embed_model,
        breakpoint_percentile_threshold=95,
        buffer_size=3,
        sentence_splitter=sentence_splitter,
        callback_manager=callback_manager or Settings.callback_manager,
        include_metadata=True,
        include_prev_next_rel=True,
    )
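# Example usage (a minimal sketch; HuggingFaceEmbedding and the model name are
# illustrative assumptions, any LlamaIndex BaseEmbedding works):
#
# from llama_index.core import Document
# from llama_index.embeddings.huggingface import HuggingFaceEmbedding
#
# embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# parser = get_parser(embed_model=embed_model)
# nodes = parser.get_nodes_from_documents([Document(text="Some raw PDF text ...")])
# # Each node is a semantically coherent chunk with prev/next relationships attached.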
# @st.cache_resource
# def get_sentence_parser(splitter_model: Optional[SaT] = None) -> SentenceWindowNodeParser:
#     """Special sentence-level parser to get the document requested info section."""
#     if (splitter_model is not None):
#         sentence_splitter = sentence_splitter_from_SaT(splitter_model)
#     sentence_parser = SentenceWindowNodeParser.from_defaults(
#         sentence_splitter=sentence_splitter,
#         window_size=0,
#         window_metadata_key="window",
#         original_text_metadata_key="original_text",
#     )
#     return sentence_parser
def get_sentence_parser() -> SentenceWindowNodeParser:
    """Build a sentence-level parser used to extract the document's requested-info section."""
    # if (splitter_model is not None):
    #     sentence_splitter = sentence_splitter_from_SaT(splitter_model)
    return SentenceWindowNodeParser.from_defaults(
        # sentence_splitter=sentence_splitter,
        window_size=0,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
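# Example usage (a minimal sketch; the Document text is a made-up placeholder):
#
# from llama_index.core import Document
#
# sentence_parser = get_sentence_parser()
# nodes = sentence_parser.get_nodes_from_documents(
#     [Document(text="First sentence. Second sentence. Third sentence.")]
# )
# # With window_size=0, each node holds a single sentence; the sentence text is also
# # stored under the "original_text" metadata key.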