"""Upload a PDF to S3 and extract its text with Amazon Textract (via Textractor).

Configuration is read from environment variables (optionally populated from a
``.env`` file by ``load_dotenv``): ``AWS_ACCESS_KEY_ID``,
``AWS_SECRET_ACCESS_KEY``, ``AWS_ENDPOINT_URL``, ``AWS_REGION`` and
``AWS_S3_BUCKET_NAME``.
"""

import os
import time
from typing import Optional

import boto3
from dotenv import load_dotenv
from textractor import Textractor
from textractor.data.constants import TextractFeatures
from textractor.data.text_linearization_config import TextLinearizationConfig
from textractor.visualizers.entitylist import EntityList

# Populate os.environ from a .env file (if any) before the settings are read.
load_dotenv()

AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_ENDPOINT_URL = os.getenv("AWS_ENDPOINT_URL")
AWS_REGION = os.getenv("AWS_REGION")
AWS_S3_BUCKET_NAME = os.getenv("AWS_S3_BUCKET_NAME")


def upload_to_s3(file_path: str, file_name: str) -> None:
    """Upload a local file to the configured S3 bucket.

    Args:
        file_path: Path of the local file to upload.
        file_name: Object key to store the file under in ``AWS_S3_BUCKET_NAME``.

    Raises:
        Exception: Whatever ``boto3`` raises while creating the client or
            uploading; nothing is caught here.
    """
    s3 = boto3.client(
        "s3",
        region_name=AWS_REGION,
        endpoint_url=AWS_ENDPOINT_URL,
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    s3.upload_file(Filename=file_path, Key=file_name, Bucket=AWS_S3_BUCKET_NAME)


def analyze_pdf(file_name: str) -> str:
    """Run Textract document analysis on an S3 object and return its text.

    The object is addressed as ``s3://<AWS_S3_BUCKET_NAME>/<file_name>`` and is
    analyzed with the LAYOUT and TABLES features.

    Args:
        file_name: Key of the object inside ``AWS_S3_BUCKET_NAME``.

    Returns:
        The text of every page, concatenated in page order with no separator
        inserted between pages.
    """
    # NOTE(review): only the region is passed here, unlike the S3 client above,
    # which also receives explicit credentials and an endpoint URL - confirm
    # that this difference is intended.
    extractor = Textractor(region_name=AWS_REGION)
    s3_uri = f"s3://{AWS_S3_BUCKET_NAME}/{file_name}"
    document = extractor.start_document_analysis(
        file_source=s3_uri,
        features=[
            TextractFeatures.LAYOUT,
            TextractFeatures.TABLES,
            # TextractFeatures.FORMS,
        ],
        save_image=False,
    )
    # Join once instead of growing a string with += inside a loop.
    return "".join(page.get_text() for page in document.pages)


def extract_text_from_pdf(file_path: str, file_name: str) -> Optional[str]:
    """Upload a PDF to S3, analyze it, and return the extracted text.

    This is a best-effort wrapper: any exception raised by the upload or the
    analysis is printed rather than propagated.

    Args:
        file_path: Path of the local PDF file.
        file_name: Object key used for the upload and for the analysis.

    Returns:
        The extracted text, or ``None`` if the upload or the analysis failed.
    """
    try:
        upload_to_s3(file_path, file_name)
        return analyze_pdf(file_name=file_name)
    except Exception as e:
        print("Error extracting text from PDF:", e)
        # Explicit so callers can see that failure is signalled by None.
        return None