#####################################################
### DOCUMENT PROCESSOR [PARSERS]
#####################################################
# Jonathan Wang
# ABOUT:
# This project creates an app to chat with PDFs.
# This is the PARSERS.
# It chunks Raw Text into LlamaIndex nodes
# E.g., by embedding meaning, by sentence, ...
#####################################################
# TODO Board:
# Add more stuff
#####################################################
## IMPORTS
from __future__ import annotations
from typing import TYPE_CHECKING, Callable, List, Optional
from llama_index.core import Settings
from llama_index.core.node_parser import (
SemanticSplitterNodeParser,
SentenceWindowNodeParser,
)
if TYPE_CHECKING:
from llama_index.core.base.embeddings.base import BaseEmbedding
from llama_index.core.callbacks import CallbackManager
from llama_index.core.node_parser.interface import NodeParser
# from wtpsplit import SaT
# Lazy Loading
#####################################################
## CODE
# def sentence_splitter_from_SaT(model: Optional[SaT]) -> Callable[[str], List[str]]:
# """Convert a SaT model into a sentence splitter function.
# Args:
# model (SaT): The Segment Anything model.
# Returns:
# Callable[[str], List[str]]: The sentence splitting function using the SaT model.
# """
# model = model or ss.model
# if model is None:
# raise ValueError("Sentence splitting model is not set.")
# def sentence_splitter(text: str) -> List[str]:
# segments = model.split(text_or_texts=text)
# if isinstance(segments, list):
# return segments
# else:
# return list(segments) # type: ignore (generator is the other option?)
# return (sentence_splitter)
# @st.cache_resource # can't cache because embed_model is not hashable.
def get_parser(
    embed_model: BaseEmbedding,
    sentence_splitter: Optional[Callable[[str], List[str]]] = None,
    callback_manager: Optional[CallbackManager] = None
) -> NodeParser:
    """Build the main semantic node parser for RAG document processing.

    Args:
        embed_model (BaseEmbedding): Embedding model used to locate semantic
            breakpoints between chunks.
        sentence_splitter (Optional[Callable[[str], List[str]]]): Custom
            sentence-splitting function; the parser's own default is used
            when this is None.
        callback_manager (Optional[CallbackManager]): Callback manager for
            instrumentation; falls back to the global
            ``Settings.callback_manager`` when not provided.

    Returns:
        NodeParser: A ``SemanticSplitterNodeParser`` that breaks text at the
        95th-percentile embedding-distance threshold with a 3-sentence
        buffer, keeping metadata and prev/next relationships on each node.
    """
    # Fall back to the library-wide callback manager when none is supplied.
    manager = callback_manager or Settings.callback_manager
    parser_kwargs = {
        "embed_model": embed_model,
        "breakpoint_percentile_threshold": 95,
        "buffer_size": 3,
        "sentence_splitter": sentence_splitter,
        "callback_manager": manager,
        "include_metadata": True,
        "include_prev_next_rel": True,
    }
    return SemanticSplitterNodeParser.from_defaults(**parser_kwargs)
# @st.cache_resource
# def get_sentence_parser(splitter_model: Optional[SaT] = None) -> SentenceWindowNodeParser:
# """Special sentence-level parser to get the document requested info section."""
# if (splitter_model is not None):
# sentence_splitter = sentence_splitter_from_SaT(splitter_model)
# sentence_parser = SentenceWindowNodeParser.from_defaults(
# sentence_splitter=sentence_splitter,
# window_size=0,
# window_metadata_key="window",
# original_text_metadata_key="original_text",
# )
# return (sentence_parser)
def get_sentence_parser() -> SentenceWindowNodeParser:
    """Build a sentence-level parser for the document requested-info section.

    Returns:
        SentenceWindowNodeParser: Splits documents into single-sentence nodes
        with no surrounding context (``window_size=0``), storing the window
        text under the "window" metadata key and the source sentence under
        "original_text".
    """
    sentence_parser = SentenceWindowNodeParser.from_defaults(
        window_size=0,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    return sentence_parser