Spaces:
Running
Running
"""Unstructured file reader. | |
A parser for unstructured text files using Unstructured.io. | |
Supports .txt, .docx, .pptx, .jpg, .png, .eml, .html, and .pdf documents. | |
""" | |
from datetime import datetime | |
import mimetypes | |
import os | |
from pathlib import Path | |
from typing import Any, Dict, List, Optional | |
from llama_index.readers.base import BaseReader | |
from llama_index.readers.schema.base import Document | |
class UnstructuredReader(BaseReader):
    """General unstructured text reader for a variety of files.

    Parsing is delegated to ``unstructured.partition.auto.partition``; the
    text of each returned element is wrapped in ``Document`` objects.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Initialise the reader and fetch the NLTK data it depends on.

        Args:
            *args: Positional arguments forwarded to ``BaseReader.__init__``.
            **kwargs: Keyword arguments forwarded to ``BaseReader.__init__``.
        """
        super().__init__(*args, **kwargs)

        # Prerequisite for Unstructured.io to work. Imported lazily so that
        # importing this module does not require nltk to be installed.
        import nltk

        nltk.download("punkt")
        nltk.download("averaged_perceptron_tagger")

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        split_documents: Optional[bool] = True,
    ) -> List[Document]:
        """Parse ``file`` and return its text as one or more documents.

        Args:
            file: Path of the file to parse; it is passed to ``partition`` as
                a string.
            extra_info: Optional metadata attached to every returned document.
                Each document receives its own shallow copy, so mutating one
                document's metadata affects neither the others nor the
                caller's dict.
            split_documents: When truthy, return one document per parsed
                element. Otherwise return a single document whose text is all
                elements joined by a blank line.

        Returns:
            The list of documents built from the parsed elements.
        """
        # Imported lazily: unstructured is an optional, heavy dependency.
        from unstructured.partition.auto import partition

        elements = partition(str(file))

        # Collapse every run of whitespace (including newlines) inside an
        # element into a single space.
        text_chunks = [" ".join(str(el).split()) for el in elements]

        # Snapshot the caller's metadata once so later changes to the
        # caller's dict cannot leak into the documents.
        base_info = dict(extra_info or {})

        if split_documents:
            # A separate copy per document avoids aliasing one shared dict.
            return [
                Document(text=chunk, extra_info=dict(base_info))
                for chunk in text_chunks
            ]

        return [Document(text="\n\n".join(text_chunks), extra_info=base_info)]
def default_file_metadata_func(file_path: str) -> Dict:
    """Get some handy metadata from the filesystem.

    Args:
        file_path: Path of the file to describe, as a string.

    Returns:
        A dict with the keys ``file_path``, ``file_name``, ``file_type``
        (MIME type guessed from the file name, or ``None`` when unknown),
        ``file_size`` (bytes) and the three dates ``creation_date``,
        ``last_modified_date`` and ``last_accessed_date`` formatted as
        ``YYYY-MM-DD`` in local time.

    Raises:
        OSError: If the file cannot be stat'ed (e.g. it does not exist).

    Note:
        ``creation_date`` is derived from ``st_ctime``, which is the creation
        time on Windows but the last metadata-change time on most Unix
        systems.
    """
    # Stat the file once so that size and all timestamps come from the same
    # snapshot instead of four separate filesystem calls.
    stat_result = os.stat(file_path)

    def _as_date(timestamp: float) -> str:
        """Format a POSIX timestamp as a local-time ``YYYY-MM-DD`` string."""
        return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")

    return {
        "file_path": file_path,
        "file_name": os.path.basename(file_path),
        "file_type": mimetypes.guess_type(file_path)[0],
        "file_size": stat_result.st_size,
        "creation_date": _as_date(stat_result.st_ctime),
        "last_modified_date": _as_date(stat_result.st_mtime),
        "last_accessed_date": _as_date(stat_result.st_atime),
    }