#####################################################
### DOCUMENT PROCESSOR [Metadata Adders]
#####################################################
### Jonathan Wang
# ABOUT:
# This is part of an app for chatting with PDFs.
# This module contains the Metadata Adders:
# classes that add metadata fields to documents,
# typically used for summaries or keywords.
#####################################################
### TODO Board:
# Seems like this overlaps well with the `metadata extractors` interface from llama_index.
# These are TransformComponents which take a Sequence of Nodes as input and return a list of Dicts as output (with the dicts storing metadata for each node).
# We should add a wrapper which merges this metadata into the nodes themselves; a rough sketch follows the links below.
# https://github.com/run-llama/llama_index/blob/be3bd619ec114d26cf328d12117c033762695b3f/llama-index-core/llama_index/core/extractors/interface.py#L21
# https://github.com/run-llama/llama_index/blob/be3bd619ec114d26cf328d12117c033762695b3f/llama-index-core/llama_index/core/extractors/metadata_extractors.py#L332
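# A rough, untested sketch of such a wrapper (assumes the BaseExtractor interface linked above,
# i.e. that `extractor.extract(nodes)` returns one metadata dict per node, in order):
#
# from llama_index.core.extractors import BaseExtractor
#
# class ExtractorMetadataAdder(TransformComponent):
#     """Merges the output dicts of a llama_index metadata extractor into node.metadata."""
#     _extractor: BaseExtractor = PrivateAttr()
#
#     def __init__(self, extractor: BaseExtractor, **kwargs: Any) -> None:
#         super().__init__(**kwargs)
#         self._extractor = extractor
#
#     def __call__(self, nodes: Sequence[BaseNode], **kwargs: Any) -> Sequence[BaseNode]:
#         for node, metadata_dict in zip(nodes, self._extractor.extract(nodes)):
#             node.metadata.update(metadata_dict)
#         return nodes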
#####################################################
### PROGRAM SETTINGS
#####################################################
### PROGRAM IMPORTS
from __future__ import annotations
import logging
import re
from abc import abstractmethod
from typing import Any, Sequence, TypeVar
from llama_index.core.bridge.pydantic import Field, PrivateAttr
from llama_index.core.schema import BaseNode, TransformComponent
# Own modules
#####################################################
### CONSTANTS
# ah how beautiful the regex
# handy visualizer and checker: https://www.debuggex.com/, https://www.regexpr.com/
logger = logging.getLogger(__name__)
GenericNode = TypeVar("GenericNode", bound=BaseNode)
DATE_REGEX = re.compile(r"(?:(?<!\:)(?<!\:\d)[0-3]?\d(?:st|nd|rd|th)?\s+(?:of\s+)?(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)|(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)\s+(?<!\:)(?<!\:\d)[0-3]?\d(?:st|nd|rd|th)?)(?:\,)?\s*(?:\d{4})?|[0-3]?\d[-\./][0-3]?\d[-\./]\d{2,4}", re.IGNORECASE)
TIME_REGEX = re.compile(r"\d{1,2}:\d{2} ?(?:[ap]\.?m\.?)?|\d[ap]\.?m\.?", re.IGNORECASE)
EMAIL_REGEX = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)")
PHONE_REGEX = re.compile(r"((?:(?<![\d-])(?:\+?\d{1,3}[-.\s*]?)?(?:\(?\d{3}\)?[-.\s*]?)?\d{3}[-.\s*]?\d{4}(?![\d-]))|(?:(?<![\d-])(?:(?:\(\+?\d{2}\))|(?:\+?\d{2}))\s*\d{2}\s*\d{3}\s*\d{4}(?![\d-])))")
MAIL_ADDR_REGEX = re.compile(r"\d{1,4}.{1,10}[\w\s]{1,20}[\s]+(?:street|st|avenue|ave|road|rd|highway|hwy|square|sq|trail|trl|drive|dr|court|ct|parkway|pkwy|circle|cir|boulevard|blvd)\W?(?=\s|$)", re.IGNORECASE)
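# Illustrative strings these patterns are intended to match (untested, for documentation only):
#   DATE_REGEX:      "March 3rd, 2021", "3rd of March", "03/03/2021"
#   TIME_REGEX:      "3:45 pm", "9am"
#   EMAIL_REGEX:     "user@example.com"
#   PHONE_REGEX:     "(555) 123-4567", "+1 555-123-4567"
#   MAIL_ADDR_REGEX: "123 Main Street"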
# DEFAULT_NUM_WORKERS = os.cpu_count() - 1 if os.cpu_count() else 1 # type: ignore
#####################################################
### SCRIPT
class MetadataAdder(TransformComponent):
"""Adds metadata to a node.
Args:
metadata_name: The name of the metadata to add to the node. Defaults to 'metadata'.
# num_workers: The number of workers to use for parallel processing. By default, use all available cores minus one. currently WIP.
"""
metadata_name: str = Field(
default="metadata",
description="The name of the metadata field to add to the document. Defaults to 'metadata'.",
)
# num_workers: int = Field(
# default=DEFAULT_NUM_WORKERS,
# description="The number of workers to use for parallel processing. By default, use all available cores minus one.",
# )
def __init__(
self, metadata_name: str = "metadata", **kwargs: Any
) -> None:
super().__init__(**kwargs)
self.metadata_name = metadata_name
# self.num_workers = num_workers
@classmethod
def class_name(cls) -> str:
return "MetadataAdder"
@abstractmethod
def get_node_metadata(self, node: BaseNode) -> str | None:
"""Given a node, get the metadata for the node."""
def add_node_metadata(self, node: GenericNode, metadata_value: Any | None) -> GenericNode:
"""Given a node and the metadata, add the metadata to the node's `metadata_name` field."""
if (metadata_value is None):
return node
else:
node.metadata[self.metadata_name] = metadata_value
return node
def process_nodes(self, nodes: list[GenericNode]) -> list[GenericNode]:
"""Process the list of nodes. This gets called by __call__.
Args:
nodes (List[GenericNode]): The nodes to process.
Returns:
List[GenericNode]: The processed nodes, with metadata field metadata_name added.
"""
output_nodes = []
for node in nodes:
node_metadata = self.get_node_metadata(node)
node_with_metadata = self.add_node_metadata(node, node_metadata)
output_nodes.append(node_with_metadata)
        return output_nodes
    def __call__(self, nodes: Sequence[BaseNode], **kwargs: Any) -> list[BaseNode]:
        """Add the metadata field to each node in the sequence."""
        return self.process_nodes(list(nodes))
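# Illustrative (untested) example of the intended subclassing pattern: a hypothetical adder
# that records the character length of each node's text.
#
# class TextLengthMetadataAdder(MetadataAdder):
#     def __init__(self, metadata_name: str = "text_length", **kwargs: Any) -> None:
#         super().__init__(metadata_name=metadata_name, **kwargs)
#
#     def get_node_metadata(self, node: BaseNode) -> str | None:
#         text = getattr(node, "text", None)
#         return None if text is None else str(len(text))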
class RegexMetadataAdder(MetadataAdder):
"""Adds regex metadata to a document.
Args:
regex_pattern: The regex pattern to search for.
metadata_name: The name of the metadata to add to the document. Defaults to 'regex_metadata'.
# num_workers: The number of workers to use for parallel processing. By default, use all available cores minus one.
"""
_regex_pattern: re.Pattern = PrivateAttr()
_boolean_mode: bool = PrivateAttr()
# num_workers: int = Field(
# default=DEFAULT_NUM_WORKERS,
# description="The number of workers to use for parallel processing. By default, use all available cores minus one.",
# )
def __init__(
self,
regex_pattern: re.Pattern | str = DATE_REGEX,
metadata_name: str = "regex_metadata",
boolean_mode: bool = False,
# num_workers: int = DEFAULT_NUM_WORKERS,
**kwargs: Any,
) -> None:
"""Init params."""
if (isinstance(regex_pattern, str)):
regex_pattern = re.compile(regex_pattern)
# self.num_workers = num_workers
super().__init__(metadata_name=metadata_name, **kwargs) # ah yes i love oop :)
        self._regex_pattern = regex_pattern
        self._boolean_mode = boolean_mode
@classmethod
def class_name(cls) -> str:
return "RegexMetadataAdder"
def get_node_metadata(self, node: BaseNode) -> str | None:
"""Given a node with text, return the regex match if it exists.
Args:
node (BaseNode): The base node to extract from.
Returns:
Optional[str]: The regex match if it exists. If not, return None.
"""
if (getattr(node, "text", None) is None):
return None
        if self._boolean_mode:
            # NOTE: search() rather than match(), so the pattern can occur anywhere in the text.
            return str(self._regex_pattern.search(node.text) is not None)
        else:
            return str(self._regex_pattern.findall(node.text))  # NOTE: we are saving these as a string'd list since this is easier
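# Example usage (illustrative, not run at import time):
# date_adder = RegexMetadataAdder(regex_pattern=DATE_REGEX, metadata_name="dates_in_text")
# nodes = date_adder(nodes)  # each node now carries node.metadata["dates_in_text"]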
class ModelMetadataAdder(MetadataAdder):
"""Adds metadata to nodes based on a language model."""
    prompt_template: str = Field(
        description="The prompt template used to generate the metadata.",
    )
def __init__(
self,
metadata_name: str,
prompt_template: str | None = None,
**kwargs: Any
) -> None:
"""Init params."""
super().__init__(metadata_name=metadata_name, prompt_template=prompt_template, **kwargs)
@classmethod
def class_name(cls) -> str:
return "ModelMetadataAdder"
@abstractmethod
def get_node_metadata(self, node: BaseNode) -> str | None:
"""Given a node, get the metadata for the node.
Args:
node (BaseNode): The node to add metadata to.
Returns:
Optional[str]: The metadata if it exists. If not, return None.
"""
class UnstructuredPDFPostProcessor(TransformComponent):
"""Handles postprocessing of PDF which was read in using UnstructuredIO."""
### NOTE: okay technically we could have done this in the IngestionPipeline abstraction. Maybe we integrate in the future?
# This component doesn't play nice with multi-processing due to having non-async LLMs.
# _embed_model: Optional[BaseEmbedding] = PrivateAttr()
_metadata_adders: list[MetadataAdder] = PrivateAttr()
def __init__(
self,
# embed_model: Optional[BaseEmbedding] = None,
metadata_adders: list[MetadataAdder] | None = None,
**kwargs: Any,
) -> None:
super().__init__(**kwargs)
# self._embed_model = embed_model or Settings.embed_model
self._metadata_adders = metadata_adders or []
@classmethod
def class_name(cls) -> str:
return "UnstructuredPDFPostProcessor"
# def _apply_embed_model(self, nodes: List[BaseNode]) -> List[BaseNode]:
# if (self._embed_model is not None):
# nodes = self._embed_model(nodes)
# return nodes
def _apply_metadata_adders(self, nodes: list[GenericNode]) -> list[GenericNode]:
for metadata_adder in self._metadata_adders:
nodes = metadata_adder(nodes)
return nodes
    def __call__(self, nodes: list[GenericNode], **kwargs: Any) -> Sequence[BaseNode]:
        nodes = self._apply_metadata_adders(nodes)
        # nodes = self._apply_embed_model(nodes)  # this goes second in case we want to embed the metadata.
        return nodes
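# Example usage (illustrative): chain several adders over nodes produced by an UnstructuredIO reader.
# postprocessor = UnstructuredPDFPostProcessor(metadata_adders=[
#     RegexMetadataAdder(EMAIL_REGEX, "emails"),
#     RegexMetadataAdder(PHONE_REGEX, "phone_numbers"),
# ])
# nodes = postprocessor(nodes)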
# def has_email(input_text: str) -> bool:
# """
# Given a chunk of text, determine whether it has an email address or not.
# We're using the long complex email regex from https://emailregex.com/index.html
# """
# return (EMAIL_REGEX.search(input_text) is not None)
# def has_phone(input_text: str) -> bool:
# """
# Given a chunk of text, determine whether it has a phone number or not.
# """
# has_phone = PHONE_REGEX.search(input_text)
# return (has_phone is not None)
# def has_mail_addr(input_text: str) -> bool:
# """
# Given a chunk of text, determine whether it has a mailing address or not.
# NOTE: This is difficult to do with regex.
# ... We could use spacy's English language NER model instead / as well:
# Assume that addresses will have a GSP (geospatial political) or GPE (geopolitical entity).
# DOCS SEE: https://www.nltk.org/book/ch07.html | https://spacy.io/usage/linguistic-features
# """
# has_addr = MAIL_ADDR_REGEX.search(input_text)
# return (has_addr is not None)
# def has_date(input_text: str) -> bool:
# """
# Given a chunk of text, determine whether it has a date or not.
# NOTE: relative dates are stuff like "within 30 days"
# """
# has_date = DATE_REGEX.search(input_text)
# return (has_date is not None)