#####################################################
### DOCUMENT PROCESSOR [PARSERS]
#####################################################
# Jonathan Wang
# ABOUT:
# This project creates an app to chat with PDFs.
# This file contains the PARSERS.
# They chunk raw text into LlamaIndex nodes,
# e.g., by semantic (embedding) similarity or by sentence.
#####################################################
# TODO Board:
# Add more stuff
#####################################################
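## USAGE (sketch)
# A minimal flow, assuming the caller already has LlamaIndex Documents and an
# embedding model configured; the names below are illustrative:
#   parser = get_parser(embed_model=Settings.embed_model)
#   nodes = parser.get_nodes_from_documents(documents)
# The resulting nodes keep metadata and prev/next relationships for RAG.
#####################################################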
## IMPORTS
from __future__ import annotations
from typing import TYPE_CHECKING, Callable, List, Optional
from llama_index.core import Settings
from llama_index.core.node_parser import (
SemanticSplitterNodeParser,
SentenceWindowNodeParser,
)
if TYPE_CHECKING:
from llama_index.core.base.embeddings.base import BaseEmbedding
from llama_index.core.callbacks import CallbackManager
from llama_index.core.node_parser.interface import NodeParser
# from wtpsplit import SaT  # lazy-loaded: only imported if SaT-based sentence splitting is enabled
#####################################################
## CODE
# def sentence_splitter_from_SaT(model: Optional[SaT]) -> Callable[[str], List[str]]:
#     """Convert a SaT (Segment any Text) model into a sentence-splitter function.
#
#     Args:
#         model (SaT): The SaT sentence-segmentation model.
#
#     Returns:
#         Callable[[str], List[str]]: A sentence-splitting function backed by the SaT model.
#     """
#     model = model or ss.model
#     if model is None:
#         raise ValueError("Sentence splitting model is not set.")
#
#     def sentence_splitter(text: str) -> List[str]:
#         segments = model.split(text_or_texts=text)
#         if isinstance(segments, list):
#             return segments
#         return list(segments)  # type: ignore  # split() may also return a generator
#
#     return sentence_splitter
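# If SaT splitting is re-enabled, a minimal wiring sketch (the model name is
# illustrative; wtpsplit ships several "sat-*" checkpoints):
#   from wtpsplit import SaT
#   sat_model = SaT("sat-3l-sm")  # lazy-load only when actually needed
#   splitter = sentence_splitter_from_SaT(sat_model)
#   parser = get_parser(embed_model=Settings.embed_model, sentence_splitter=splitter)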
# @st.cache_resource # can't cache because embed_model is not hashable.
def get_parser(
embed_model: BaseEmbedding,
# sentence_model: Optional[SaT] = None,
sentence_splitter: Optional[Callable[[str], List[str]]] = None,
callback_manager: Optional[CallbackManager] = None
) -> NodeParser:
"""Parse RAG document processing (main one)."""
# if (sentence_model is not None) and (sentence_splitter is not None):
# sentence_splitter = sentence_splitter_from_SaT(sentence_model)
return SemanticSplitterNodeParser.from_defaults(
embed_model=embed_model,
breakpoint_percentile_threshold=95,
buffer_size=3,
sentence_splitter=sentence_splitter,
callback_manager=callback_manager or Settings.callback_manager,
include_metadata=True,
include_prev_next_rel=True,
)
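# Example (sketch): any Callable[[str], List[str]] can serve as the sentence
# splitter; a naive regex-based splitter could be wired in like so
# (the helper name is illustrative):
#   import re
#   def naive_sentence_splitter(text: str) -> List[str]:
#       return re.split(r"(?<=[.!?])\s+", text)
#   parser = get_parser(embed_model=Settings.embed_model,
#                       sentence_splitter=naive_sentence_splitter)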
# @st.cache_resource
# def get_sentence_parser(splitter_model: Optional[SaT] = None) -> SentenceWindowNodeParser:
#     """Special sentence-level parser to get the document's requested-info section."""
#     sentence_splitter = None
#     if splitter_model is not None:
#         sentence_splitter = sentence_splitter_from_SaT(splitter_model)
#     sentence_parser = SentenceWindowNodeParser.from_defaults(
#         sentence_splitter=sentence_splitter,
#         window_size=0,
#         window_metadata_key="window",
#         original_text_metadata_key="original_text",
#     )
#     return sentence_parser
def get_sentence_parser() -> SentenceWindowNodeParser:
"""Parse sentences to get the document requested info section."""
# if (splitter_model is not None):
# sentence_splitter = sentence_splitter_from_SaT(splitter_model)
return SentenceWindowNodeParser.from_defaults(
# sentence_splitter=sentence_splitter,
window_size=0,
window_metadata_key="window",
original_text_metadata_key="original_text",
)
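

# Minimal, optional smoke test (sketch): exercises only the sentence-level
# parser, since it needs no embedding model; the sample text is illustrative.
if __name__ == "__main__":
    from llama_index.core import Document

    demo_parser = get_sentence_parser()
    demo_nodes = demo_parser.get_nodes_from_documents(
        [Document(text="First sentence. Second sentence. Third sentence.")]
    )
    for demo_node in demo_nodes:
        # With window_size=0, each node is a single sentence and its raw text
        # is stored under the "original_text" metadata key.
        print(demo_node.metadata.get("original_text"))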