Spaces:
Sleeping
Sleeping
import os | |
import time | |
import boto3 | |
from dotenv import load_dotenv | |
from textractor import Textractor | |
from textractor.data.constants import TextractFeatures | |
from textractor.data.text_linearization_config import TextLinearizationConfig | |
from textractor.visualizers.entitylist import EntityList | |
# Pull variables from a local .env file (if any) into the process environment
# before the settings below are read.
load_dotenv()

# AWS settings, read once at import time. Each value is None when the
# corresponding environment variable is not set.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
AWS_ENDPOINT_URL = os.environ.get("AWS_ENDPOINT_URL")
AWS_REGION = os.environ.get("AWS_REGION")
AWS_S3_BUCKET_NAME = os.environ.get("AWS_S3_BUCKET_NAME")
def upload_to_s3(file_path, file_name):
    """Upload a local file to the bucket named by ``AWS_S3_BUCKET_NAME``.

    Args:
        file_path: Path of the local file to upload (passed as ``Filename``).
        file_name: Object key to store the file under (passed as ``Key``).

    Any exception raised by boto3 propagates to the caller.
    """
    # Connection settings all come from the module-level AWS_* constants.
    client_settings = {
        "region_name": AWS_REGION,
        "endpoint_url": AWS_ENDPOINT_URL,
        "aws_access_key_id": AWS_ACCESS_KEY_ID,
        "aws_secret_access_key": AWS_SECRET_ACCESS_KEY,
    }
    s3_client = boto3.client("s3", **client_settings)
    s3_client.upload_file(
        Filename=file_path,
        Bucket=AWS_S3_BUCKET_NAME,
        Key=file_name,
    )
def analyze_pdf(file_name):
    """Run Textract document analysis on an S3 object and return its text.

    Args:
        file_name: Key of the object inside the ``AWS_S3_BUCKET_NAME`` bucket.

    Returns:
        The text of every page, concatenated in page order with no
        separator inserted between pages.
    """
    s3_uri = f"s3://{AWS_S3_BUCKET_NAME}/{file_name}"

    requested_features = [
        TextractFeatures.LAYOUT,
        TextractFeatures.TABLES,
        # TextractFeatures.FORMS,  (currently not requested)
    ]

    # NOTE(review): this client is built with the region only; it presumably
    # resolves credentials through the default AWS chain rather than the
    # explicit keys used for the S3 upload — confirm.
    textract = Textractor(region_name=AWS_REGION)
    analyzed = textract.start_document_analysis(
        file_source=s3_uri,
        features=requested_features,
        save_image=False,
    )

    return "".join(page.get_text() for page in analyzed.pages)
def extract_text_from_pdf(file_path, file_name):
    """Upload a PDF to S3 and return the text extracted from it.

    Args:
        file_path: Path of the local PDF file.
        file_name: Object key used for the upload and for the analysis.

    Returns:
        The extracted text, or ``None`` if the upload or the analysis raised
        an exception (the error is printed rather than re-raised).
    """
    try:
        upload_to_s3(file_path, file_name)
        extracted_text = analyze_pdf(file_name=file_name)
    except Exception as e:
        # Best-effort contract: report the failure and signal it with None.
        print("Error extracting text from PDF:", e)
        return None
    return extracted_text