File size: 1,598 Bytes
2669ae8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d34b0a
2669ae8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import time

import boto3
from dotenv import load_dotenv
from textractor import Textractor
from textractor.data.constants import TextractFeatures
from textractor.data.text_linearization_config import TextLinearizationConfig
from textractor.visualizers.entitylist import EntityList

# Load variables from a .env file (if any) before reading the environment,
# so the lookups below can see them.
load_dotenv()

# AWS settings read from the environment; each is None when its variable is unset.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
AWS_ENDPOINT_URL = os.environ.get("AWS_ENDPOINT_URL")
AWS_REGION = os.environ.get("AWS_REGION")
AWS_S3_BUCKET_NAME = os.environ.get("AWS_S3_BUCKET_NAME")


def upload_to_s3(file_path, file_name):
    """Upload a local file to the configured S3 bucket.

    Args:
        file_path: Path of the local file to upload.
        file_name: Object key the file is stored under in the bucket.
    """
    # Connection settings all come from the module-level AWS_* constants.
    client_options = {
        "region_name": AWS_REGION,
        "endpoint_url": AWS_ENDPOINT_URL,
        "aws_access_key_id": AWS_ACCESS_KEY_ID,
        "aws_secret_access_key": AWS_SECRET_ACCESS_KEY,
    }
    s3_client = boto3.client("s3", **client_options)
    s3_client.upload_file(
        Filename=file_path,
        Bucket=AWS_S3_BUCKET_NAME,
        Key=file_name,
    )


def analyze_pdf(file_name):
    """Run Textract document analysis on an object in the S3 bucket.

    Args:
        file_name: Key of the object inside ``AWS_S3_BUCKET_NAME``.

    Returns:
        The text of every page of the analyzed document, concatenated in
        page order with no separator added between pages.
    """
    # NOTE(review): only the region is passed here; unlike upload_to_s3, the
    # explicit credentials and endpoint URL are not forwarded - confirm that
    # this is intended.
    textract = Textractor(region_name=AWS_REGION)
    s3_uri = f"s3://{AWS_S3_BUCKET_NAME}/{file_name}"
    requested_features = [
        TextractFeatures.LAYOUT,
        TextractFeatures.TABLES,
        # TextractFeatures.FORMS,
    ]
    analyzed = textract.start_document_analysis(
        file_source=s3_uri,
        features=requested_features,
        save_image=False,
    )
    return "".join(page.get_text() for page in analyzed.pages)


def extract_text_from_pdf(file_path, file_name):
    """Upload a PDF to S3 and return the text extracted from it.

    Args:
        file_path: Path of the local PDF file.
        file_name: Object key used for the upload and for the analysis.

    Returns:
        The extracted text, or ``None`` if the upload or the analysis raised
        an exception (the error is printed rather than propagated).
    """
    try:
        upload_to_s3(file_path, file_name)
        extracted = analyze_pdf(file_name=file_name)
    except Exception as e:
        # Best-effort: report the failure and hand the caller None.
        print("Error extracting text from PDF:", e)
        return None
    return extracted