"""Unstructured file reader. A parser for unstructured text files using Unstructured.io. Supports .txt, .docx, .pptx, .jpg, .png, .eml, .html, and .pdf documents. """ from datetime import datetime import mimetypes import os from pathlib import Path import re from typing import Any, Dict, List, Optional from llama_index.core.readers.base import BaseReader from llama_index.core import Document class UnstructuredReader(BaseReader): """General unstructured text reader for a variety of files.""" def __init__(self, *args: Any, **kwargs: Any) -> None: """Init params.""" super().__init__(*args, **kwargs) # Prerequisite for Unstructured.io to work import nltk nltk.download("punkt") nltk.download("averaged_perceptron_tagger") def load_data( self, file: Path, extra_info: Optional[Dict] = None, split_documents: Optional[bool] = True, ) -> List[Document]: """Parse file.""" from unstructured.partition.auto import partition elements = partition(str(file)) text_chunks = [" ".join(str(el).split()) for el in elements] if split_documents: return [ Document(text=chunk, extra_info=extra_info or {}) for chunk in text_chunks ] else: return [ Document(text="\n\n".join(text_chunks), extra_info=extra_info or {}) ] class MarkdownReader(BaseReader): """General unstructured text reader for a variety of files.""" def __init__(self, *args: Any, **kwargs: Any) -> None: """Init params.""" super().__init__(*args, **kwargs) def load_data( self, file: Path, extra_info: Optional[Dict] = None, split_documents: Optional[bool] = True, ) -> List[Document]: """Parse file.""" from unstructured.partition.auto import partition elements = parse_knowledge_units(str(file)) if split_documents: return [ Document(text=ele, extra_info=extra_info or {}) for ele in elements ] def parse_knowledge_units(file_path): with open(file_path, 'r', encoding='utf-8') as file: lines = file.readlines() knowledge_units = [] current_unit = "" unit_start_pattern = re.compile(r'^\d+\.\s') for line in lines: stripped_line = line.strip() if unit_start_pattern.match(stripped_line): if current_unit: knowledge_units.append(current_unit.strip()) current_unit = "" current_unit += line else: current_unit += line if current_unit: knowledge_units.append(current_unit.strip()) # for line in lines: # if line.strip() and line[0].isdigit() and '.' in line: # if current_unit: # knowledge_units.append(current_unit.strip()) # current_unit = "" # current_unit += line # else: # current_unit += line # if current_unit: # knowledge_units.append(current_unit.strip()) return knowledge_units def default_file_metadata_func(file_path: str) -> Dict: """Get some handy metadate from filesystem. Args: file_path: str: file path in str """ return { "file_path": file_path, "file_name": os.path.basename(file_path), "file_type": mimetypes.guess_type(file_path)[0], "file_size": os.path.getsize(file_path), "creation_date": datetime.fromtimestamp( Path(file_path).stat().st_ctime ).strftime("%Y-%m-%d"), "last_modified_date": datetime.fromtimestamp( Path(file_path).stat().st_mtime ).strftime("%Y-%m-%d"), "last_accessed_date": datetime.fromtimestamp( Path(file_path).stat().st_atime ).strftime("%Y-%m-%d"), }