Spaces:
Sleeping
Sleeping
File size: 1,598 Bytes
2669ae8 8d34b0a 2669ae8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import os
import time
import boto3
from dotenv import load_dotenv
from textractor import Textractor
from textractor.data.constants import TextractFeatures
from textractor.data.text_linearization_config import TextLinearizationConfig
from textractor.visualizers.entitylist import EntityList
# Load settings from a .env file (when one is present) before reading them.
load_dotenv()

# AWS settings taken from the environment; each is None when the variable is unset.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
# Passed to the S3 client only; the Textractor client below is given just the region.
AWS_ENDPOINT_URL = os.environ.get("AWS_ENDPOINT_URL")
AWS_REGION = os.environ.get("AWS_REGION")
AWS_S3_BUCKET_NAME = os.environ.get("AWS_S3_BUCKET_NAME")
def upload_to_s3(file_path, file_name):
    """Upload a local file to the configured S3 bucket.

    Args:
        file_path: Path of the local file to upload.
        file_name: Object key the file is stored under in the bucket.
    """
    # Connection settings all come from the module-level AWS_* constants.
    client_options = {
        "region_name": AWS_REGION,
        "endpoint_url": AWS_ENDPOINT_URL,
        "aws_access_key_id": AWS_ACCESS_KEY_ID,
        "aws_secret_access_key": AWS_SECRET_ACCESS_KEY,
    }
    s3_client = boto3.client("s3", **client_options)
    s3_client.upload_file(
        Filename=file_path,
        Bucket=AWS_S3_BUCKET_NAME,
        Key=file_name,
    )
def analyze_pdf(file_name):
    """Run Textract layout and table analysis on a document in the bucket.

    Args:
        file_name: Object key of the document inside ``AWS_S3_BUCKET_NAME``.

    Returns:
        The text of every page, concatenated in page order. No separator is
        inserted between pages.
    """
    textract = Textractor(region_name=AWS_REGION)
    s3_uri = f"s3://{AWS_S3_BUCKET_NAME}/{file_name}"
    requested_features = [
        TextractFeatures.LAYOUT,
        TextractFeatures.TABLES,
        # TextractFeatures.FORMS,
    ]
    # NOTE(review): start_document_analysis is Textractor's asynchronous API;
    # presumably reading `.pages` waits for the job to finish - confirm
    # against the Textractor documentation.
    analysis = textract.start_document_analysis(
        file_source=s3_uri,
        features=requested_features,
        save_image=False,
    )
    return "".join(page.get_text() for page in analysis.pages)
def extract_text_from_pdf(file_path, file_name):
    """Upload a local PDF to S3 and return the text Textract extracts from it.

    Args:
        file_path: Path of the local PDF file.
        file_name: Object key used for the upload and for the analysis.

    Returns:
        The extracted text, or ``None`` if the upload or the analysis raised;
        in that case the error is printed rather than propagated.
    """
    extracted = None
    try:
        upload_to_s3(file_path, file_name)
        extracted = analyze_pdf(file_name=file_name)
    except Exception as err:
        # Best effort: report the failure and fall through to return None.
        print("Error extracting text from PDF:", err)
    return extracted
|