AWEsumCare-Demo / custom_io.py
ray
initial commit
dfc6dc5
raw
history blame
2.22 kB
"""Unstructured file reader.
A parser for unstructured text files using Unstructured.io.
Supports .txt, .docx, .pptx, .jpg, .png, .eml, .html, and .pdf documents.
"""
from datetime import datetime
import mimetypes
import os
from pathlib import Path
from typing import Any, Dict, List, Optional
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
class UnstructuredReader(BaseReader):
    """General unstructured text reader for a variety of files.

    Parsing is delegated to Unstructured.io's automatic ``partition`` function
    and the extracted text is wrapped in ``Document`` objects.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Initialise the reader and ensure the required NLTK data is present.

        Args:
            *args: Positional arguments forwarded to ``BaseReader``.
            **kwargs: Keyword arguments forwarded to ``BaseReader``.
        """
        super().__init__(*args, **kwargs)

        # Prerequisite for Unstructured.io to work
        import nltk

        # Only reach for the network when a resource is missing locally, so
        # constructing a reader does not trigger a download check every time.
        required_resources = (
            ("tokenizers/punkt", "punkt"),
            ("taggers/averaged_perceptron_tagger", "averaged_perceptron_tagger"),
        )
        for resource_path, package_name in required_resources:
            try:
                nltk.data.find(resource_path)
            except LookupError:
                nltk.download(package_name)

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        split_documents: Optional[bool] = True,
    ) -> List[Document]:
        """Parse a file into one or more documents.

        Args:
            file: Path of the file to parse.
            extra_info: Optional metadata attached to every returned document.
                Each document receives its own shallow copy, so the documents
                do not share one mutable dict with each other or the caller.
            split_documents: When truthy, return one document per extracted
                element; otherwise return a single document whose text is all
                elements joined by blank lines.

        Returns:
            The documents built from the text extracted from ``file``.
        """
        from unstructured.partition.auto import partition

        elements = partition(str(file))
        # Collapse every run of whitespace inside an element to a single space.
        text_chunks = [" ".join(str(el).split()) for el in elements]

        if not split_documents:
            # A single document holding the whole file (even when it is empty).
            text_chunks = ["\n\n".join(text_chunks)]

        return [
            Document(text=chunk, extra_info=dict(extra_info or {}))
            for chunk in text_chunks
        ]
def default_file_metadata_func(file_path: str) -> Dict:
    """Get some handy metadata from the filesystem.

    Args:
        file_path: Path of the file to describe, as a string.

    Returns:
        A dict with the path, base name, MIME type guessed from the name
        (``None`` when it cannot be guessed), size in bytes, and the
        ctime / mtime / atime timestamps formatted as ``YYYY-MM-DD`` dates in
        local time.

    Raises:
        OSError: If the file cannot be stat'ed (for example, it is missing).
    """
    # A single stat call yields one consistent snapshot instead of querying
    # the filesystem separately for the size and each timestamp.
    stat_result = os.stat(file_path)

    def _as_date(timestamp: float) -> str:
        """Format a POSIX timestamp as a local ``YYYY-MM-DD`` date string."""
        return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")

    return {
        "file_path": file_path,
        "file_name": os.path.basename(file_path),
        "file_type": mimetypes.guess_type(file_path)[0],
        "file_size": stat_result.st_size,
        # NOTE: st_ctime is the creation time on Windows, but on Unix it is
        # the time of the last metadata change.
        "creation_date": _as_date(stat_result.st_ctime),
        "last_modified_date": _as_date(stat_result.st_mtime),
        "last_accessed_date": _as_date(stat_result.st_atime),
    }