zliang committed on
Commit
8fcd0f0
1 Parent(s): 5822e62

Upload 6 files

Browse files
Base-RCNN-FPN.yml ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ MASK_ON: True
3
+ META_ARCHITECTURE: "GeneralizedRCNN"
4
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
5
+ PIXEL_STD: [58.395, 57.120, 57.375]
6
+ BACKBONE:
7
+ NAME: "build_vit_fpn_backbone"
8
+ VIT:
9
+ OUT_FEATURES: ["layer3", "layer5", "layer7", "layer11"]
10
+ DROP_PATH: 0.1
11
+ IMG_SIZE: [224,224]
12
+ POS_TYPE: "abs"
13
+ FPN:
14
+ IN_FEATURES: ["layer3", "layer5", "layer7", "layer11"]
15
+ ANCHOR_GENERATOR:
16
+ SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
17
+ ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
18
+ RPN:
19
+ IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
20
+ PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
21
+ PRE_NMS_TOPK_TEST: 1000 # Per FPN level
22
+ # Detectron1 uses 2000 proposals per-batch,
23
+ # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
24
+ # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
25
+ POST_NMS_TOPK_TRAIN: 1000
26
+ POST_NMS_TOPK_TEST: 1000
27
+ ROI_HEADS:
28
+ NAME: "StandardROIHeads"
29
+ IN_FEATURES: ["p2", "p3", "p4", "p5"]
30
+ NUM_CLASSES: 5
31
+ ROI_BOX_HEAD:
32
+ NAME: "FastRCNNConvFCHead"
33
+ NUM_FC: 2
34
+ POOLER_RESOLUTION: 7
35
+ ROI_MASK_HEAD:
36
+ NAME: "MaskRCNNConvUpsampleHead"
37
+ NUM_CONV: 4
38
+ POOLER_RESOLUTION: 14
39
+ DATASETS:
40
+ TRAIN: ("publaynet_train",)
41
+ TEST: ("publaynet_val",)
42
+ SOLVER:
43
+ LR_SCHEDULER_NAME: "WarmupCosineLR"
44
+ AMP:
45
+ ENABLED: True
46
+ OPTIMIZER: "ADAMW"
47
+ BACKBONE_MULTIPLIER: 1.0
48
+ CLIP_GRADIENTS:
49
+ ENABLED: True
50
+ CLIP_TYPE: "full_model"
51
+ CLIP_VALUE: 1.0
52
+ NORM_TYPE: 2.0
53
+ WARMUP_FACTOR: 0.01
54
+ BASE_LR: 0.0004
55
+ WEIGHT_DECAY: 0.05
56
+ IMS_PER_BATCH: 32
57
+ INPUT:
58
+ CROP:
59
+ ENABLED: True
60
+ TYPE: "absolute_range"
61
+ SIZE: (384, 600)
62
+ MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
63
+ FORMAT: "RGB"
64
+ DATALOADER:
65
+ FILTER_EMPTY_ANNOTATIONS: False
66
+ VERSION: 2
67
+ AUG:
68
+ DETR: True
69
+ SEED: 42
bibtest.ipynb ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pdf2bib"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 2,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "def extract_metadata(file_path):\n",
19
+ " pdfextractdata = pdf2bib.pdf2bib(file_path)\n",
20
+ " #st.write(pdfextractdata)\n",
21
+ " pdfextractdata_metadata = {} if pdfextractdata.get('metadata', {}) is None else pdfextractdata.get('metadata', {})\n",
22
+ "\n",
23
+ " return pdfextractdata_metadata"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 43,
29
+ "metadata": {},
30
+ "outputs": [
31
+ {
32
+ "name": "stderr",
33
+ "output_type": "stream",
34
+ "text": [
35
+ "[pdf2bib]: Trying to extract data to generate the BibTeX entry for the file: boiteau-et-al-2024-relating-molecular-properties-to-the-persistence-of-marine-dissolved-organic-matter-with-liquid.pdf\n",
36
+ "[pdf2bib]: Calling pdf2doi...\n",
37
+ "[pdf2doi]: Trying to retrieve a DOI/identifier for the file: boiteau-et-al-2024-relating-molecular-properties-to-the-persistence-of-marine-dissolved-organic-matter-with-liquid.pdf\n",
38
+ "[pdf2doi]: Method #1: Looking for a valid identifier in the document infos...\n",
39
+ "[pdf2doi]: Validating the possible DOI 10.1021/acs.est.3c08245 via a query to dx.doi.org...\n",
40
+ "[pdf2doi]: The DOI 10.1021/acs.est.3c08245 is validated by dx.doi.org.\n",
41
+ "[pdf2doi]: A valid DOI was found in the document info labelled '/prism:doi'.\n",
42
+ "[pdf2bib]: pdf2doi found a valid identifier for this paper.\n",
43
+ "[pdf2bib]: Parsing the info returned by dx.doi.org...\n",
44
+ "[pdf2bib]: A valid BibTeX entry was generated.\n"
45
+ ]
46
+ }
47
+ ],
48
+ "source": [
49
+ "a = pdf2bib.pdf2bib(\"boiteau-et-al-2024-relating-molecular-properties-to-the-persistence-of-marine-dissolved-organic-matter-with-liquid.pdf\")"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 47,
55
+ "metadata": {},
56
+ "outputs": [
57
+ {
58
+ "data": {
59
+ "text/plain": [
60
+ "'@article{boiteau2024relating,\\n\\ttitle = {Relating Molecular Properties to the Persistence of Marine Dissolved Organic Matter with Liquid Chromatography–Ultrahigh-Resolution Mass Spectrometry},\\n\\tpublisher = {American Chemical Society (ACS)},\\n\\turl = {http://dx.doi.org/10.1021/acs.est.3c08245},\\n\\tdoi = {10.1021/acs.est.3c08245},\\n\\tjournal = {Environmental Science & Technology},\\n\\tyear = {2024},\\n\\tmonth = {2},\\n\\tauthor = {Rene M. Boiteau and Yuri E. Corilo and William R. Kew and Christian Dewey and Maria Cristina Alvarez Rodriguez and Craig A. Carlson and Tim M. Conway}\\n}'"
61
+ ]
62
+ },
63
+ "execution_count": 47,
64
+ "metadata": {},
65
+ "output_type": "execute_result"
66
+ }
67
+ ],
68
+ "source": [
69
+ "a.get(\"bibtex\")"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": null,
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": []
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": 12,
82
+ "metadata": {},
83
+ "outputs": [],
84
+ "source": [
85
+ "import bibtexparser"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": 9,
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": [
94
+ "parser = bibtex.Parser()"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": 45,
100
+ "metadata": {},
101
+ "outputs": [],
102
+ "source": [
103
+ "parser = bibtexparser.bparser.BibTexParser(common_strings=True)\n",
104
+ "bib_database = bibtexparser.loads(a.get(\"bibtex\"), parser=parser)"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": 41,
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": [
113
+ "def format_author_names(authors_str):\n",
114
+ " authors = authors_str.split(' and ')\n",
115
+ " formatted_authors = []\n",
116
+ " for author in authors:\n",
117
+ " parts = author.split()\n",
118
+ " if len(parts) == 2: # Simple case: First Last\n",
119
+ " last, first = parts[1], parts[0]\n",
120
+ " formatted_authors.append(f\"{last}, {first[0]}.\")\n",
121
+ " elif len(parts) > 2: # Handling middle names or initials\n",
122
+ " last = parts[-1]\n",
123
+ " initials = ''.join(f\"{part[0]}.\" for part in parts[:-1])\n",
124
+ " formatted_authors.append(f\"{last}, {initials}\")\n",
125
+ " if len(formatted_authors) > 1:\n",
126
+ " formatted_authors_str = ', '.join(formatted_authors[:-1]) + ', & ' + formatted_authors[-1]\n",
127
+ " else:\n",
128
+ " formatted_authors_str = formatted_authors[0]\n",
129
+ " return formatted_authors_str"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": 36,
135
+ "metadata": {},
136
+ "outputs": [],
137
+ "source": [
138
+ "def format_apa(entry):\n",
139
+ " author = format_author_names(entry.get('author', ''))\n",
140
+ " year = entry.get('year', '')\n",
141
+ " title = entry.get('title', '')\n",
142
+ " journal = entry.get('journal', '')\n",
143
+ " volume = entry.get('volume', '')\n",
144
+ " issue = entry.get('issue', '')\n",
145
+ " pages = entry.get('page', '').replace('-', '–') # En dash for page range\n",
146
+ " doi = entry.get('doi', '')\n",
147
+ " \n",
148
+ " # Constructing the citation\n",
149
+ " apa_citation = f\"{author} {title}. {journal} {volume}, {pages} ({year}). https://doi.org/{doi}\"\n",
150
+ " return apa_citation"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": 46,
156
+ "metadata": {},
157
+ "outputs": [
158
+ {
159
+ "data": {
160
+ "text/plain": [
161
+ "'Boiteau, R.M., Corilo, Y.E., Kew, W.R., Dewey, C., Rodriguez, M.C.A., Carlson, C.A., & Conway, T.M. Relating Molecular Properties to the Persistence of Marine Dissolved Organic Matter with Liquid Chromatography–Ultrahigh-Resolution Mass Spectrometry. Environmental Science & Technology , (2024). https://doi.org/10.1021/acs.est.3c08245'"
162
+ ]
163
+ },
164
+ "execution_count": 46,
165
+ "metadata": {},
166
+ "output_type": "execute_result"
167
+ }
168
+ ],
169
+ "source": [
170
+ "format_apa(bib_database.entries[0])"
171
+ ]
172
+ }
173
+ ],
174
+ "metadata": {
175
+ "kernelspec": {
176
+ "display_name": "Python 3",
177
+ "language": "python",
178
+ "name": "python3"
179
+ },
180
+ "language_info": {
181
+ "codemirror_mode": {
182
+ "name": "ipython",
183
+ "version": 3
184
+ },
185
+ "file_extension": ".py",
186
+ "mimetype": "text/x-python",
187
+ "name": "python",
188
+ "nbconvert_exporter": "python",
189
+ "pygments_lexer": "ipython3",
190
+ "version": "3.10.2"
191
+ }
192
+ },
193
+ "nbformat": 4,
194
+ "nbformat_minor": 2
195
+ }
cascade_dit_base.yml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: "Base-RCNN-FPN.yml"
2
+ MODEL:
3
+ PIXEL_MEAN: [ 127.5, 127.5, 127.5 ]
4
+ PIXEL_STD: [ 127.5, 127.5, 127.5 ]
5
+ WEIGHTS: "https://layoutlm.blob.core.windows.net/dit/dit-pts/dit-base-224-p16-500k-62d53a.pth"
6
+ VIT:
7
+ NAME: "dit_base_patch16"
8
+ ROI_HEADS:
9
+ NAME: CascadeROIHeads
10
+ ROI_BOX_HEAD:
11
+ CLS_AGNOSTIC_BBOX_REG: True
12
+ RPN:
13
+ POST_NMS_TOPK_TRAIN: 2000
14
+ SOLVER:
15
+ WARMUP_ITERS: 1000
16
+ IMS_PER_BATCH: 16
17
+ MAX_ITER: 60000
18
+ CHECKPOINT_PERIOD: 2000
19
+ TEST:
20
+ EVAL_PERIOD: 2000
imagesummary_fun.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import requests
3
+ import os
4
+
5
+ from dotenv import load_dotenv
6
+ load_dotenv() # This loads the variables from .env
7
+ openai_api_key = os.getenv('openai_api_key')
8
+ #openai_api_key = os.getenv('openai_api_key')
9
def encode_image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is read in binary mode and the base64 output is decoded to
    ``str`` as UTF-8, so the result can be embedded in a JSON payload.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
13
+
14
+
15
+
16
def get_image_summary(image_path):
    """Request an explanation of a figure/table image from the OpenAI API.

    The image at ``image_path`` is base64-encoded, embedded as a data URL and
    posted to the chat-completions endpoint together with a fixed text
    prompt. The module-level ``openai_api_key`` is used as the bearer token.

    Args:
        image_path: Path of the image file to describe.

    Returns:
        The text content of the first choice in the API response, or the
        string "Failed to get summary. Please try again." when the request
        cannot be completed or the HTTP status is not 200.
    """
    failure_message = "Failed to get summary. Please try again."

    # Encode the selected image
    base64_image = encode_image_to_base64(image_path)

    api_url = "https://api.openai.com/v1/chat/completions"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}"
    }

    payload = {
        "model": "gpt-4-vision-preview",  # Update this if the model name changes
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    },
                    {
                        "type": "text",
                        "text": "You have provide an explanation for this figure or table. Consider elements like panels, axis, data and labels and etc."
                    }
                ]
            }
        ],
        "max_tokens": 1000
    }

    try:
        # Without a timeout requests waits indefinitely on a stalled
        # connection; bound the wait and treat transport errors like any
        # other failed request.
        response = requests.post(api_url, headers=headers, json=payload, timeout=120)
    except requests.RequestException:
        return failure_message

    if response.status_code != 200:
        return failure_message
    return response.json()["choices"][0]["message"]["content"]
pdfextract_fun.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ from concurrent.futures import ThreadPoolExecutor, as_completed
3
+ import time
4
+ # Filter warnings about inputs not requiring gradients
5
+ warnings.filterwarnings("ignore", message="None of the inputs have requires_grad=True. Gradients will be None")
6
+ warnings.filterwarnings("ignore", message="torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument.")
7
+
8
+ import cv2
9
+ import os
10
+ import fitz # PyMuPDF
11
+ import numpy as np
12
+ import re
13
+ import pytesseract
14
+ from PIL import Image
15
+ from tqdm import tqdm
16
+
17
+ from unilm.dit.object_detection.ditod import add_vit_config
18
+
19
+ from detectron2.config import CfgNode as CN
20
+ from detectron2.config import get_cfg
21
+ from detectron2.utils.visualizer import ColorMode, Visualizer
22
+ from detectron2.data import MetadataCatalog
23
+ from detectron2.engine import DefaultPredictor
24
+
25
+
26
# Step 1: instantiate config
cfg = get_cfg()
add_vit_config(cfg)
cfg.merge_from_file("cascade_dit_base.yml")

# Step 2: override the weights entry merged from the YAML with the
# checkpoint file name used for inference
cfg.MODEL.WEIGHTS = "publaynet_dit-b_cascade.pth"

# Step 3: set device -- use the GPU when one is available, otherwise fall
# back to the CPU instead of failing on CUDA-less machines
import torch
cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Step 4: define model
predictor = DefaultPredictor(cfg)
40
+
41
def analyze_image(img):
    """Run the module-level predictor on ``img`` and draw its predictions.

    Args:
        img: Image array indexed as ``[row, col, channel]``.

    Returns:
        A tuple ``(result_image, output, v)``: the rendered visualisation
        (channel order reversed back to match the input), the predictor's
        ``"instances"`` result, and the ``Visualizer`` used for drawing.
    """
    dataset_name = cfg.DATASETS.TEST[0]
    metadata = MetadataCatalog.get(dataset_name)
    if dataset_name == 'icdar2019_test':
        labels = ["table"]
    else:
        labels = ["text", "title", "list", "table", "figure"]
    metadata.set(thing_classes=labels)

    instances = predictor(img)["instances"]

    # The channel axis is flipped for the visualizer and flipped back on the
    # way out (presumably BGR <-> RGB -- confirm against the image source).
    flipped_input = img[:, :, ::-1]
    visualizer = Visualizer(
        flipped_input,
        metadata,
        scale=1.0,
        instance_mode=ColorMode.SEGMENTATION,
    )
    drawn = visualizer.draw_instance_predictions(instances.to("cpu"))
    rendered = drawn.get_image()[:, :, ::-1]

    return rendered, instances, visualizer
58
+
59
+
60
+
61
def convert_pdf_to_jpg(pdf_path, output_folder, zoom_factor=2):
    """Render every page of a PDF to ``{output_folder}/page_{n}.jpg``.

    Args:
        pdf_path: Path of the PDF to render.
        output_folder: Destination directory; created if it does not exist.
        zoom_factor: Scale applied on both axes when rendering, for a
            higher-resolution image.
    """
    os.makedirs(output_folder, exist_ok=True)

    # The zoom matrix is the same for every page, so build it once.
    mat = fitz.Matrix(zoom_factor, zoom_factor)

    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(matrix=mat)  # Render the page using the matrix
            pix.save(f"{output_folder}/page_{page_num}.jpg")
72
+
73
+
74
+
75
+ def process_jpeg_images(output_folder):
76
+ for page_num in tqdm(range(len(os.listdir(output_folder))), desc="Processing the pdf"):
77
+ file_path = f"{output_folder}/page_{page_num}.jpg"
78
+ img = cv2.imread(file_path)
79
+ if img is None:
80
+ print(f"Failed to read {file_path}. Skipping.")
81
+ continue
82
+ result_image, output, v = analyze_image(img)
83
+
84
+ # Saving logic
85
+ save_extracted_instances(img, output, page_num,output_folder)
86
+
87
+
88
+
89
def save_extracted_instances(img, output, page_num, dest_folder, confidence_threshold=0.8):
    """Crop confident figure/table/text detections out of ``img`` and save them.

    Each saved crop is written to
    ``{dest_folder}/page_{page_num}_{class_name}_{counter}.jpg``; the counter
    starts at 1 and is shared by all classes on the page.

    Args:
        img: Page image array indexed as ``[row, col, ...]``.
        output: Prediction result exposing ``to("cpu")``, ``pred_boxes``,
            ``pred_classes`` and ``scores``.
        page_num: Page number used in the output file names.
        dest_folder: Directory the crops are written to.
        confidence_threshold: Minimum score a detection needs to be saved.
    """
    class_names = {
        0: "text",
        1: "title",
        2: "list",
        3: "table",
        4: "figure"
    }
    saved_classes = ("figure", "table", "text")

    threshold_value = 0  # Standard deviation threshold
    min_height = 0  # Minimum height threshold

    instances = output.to("cpu")
    boxes = instances.pred_boxes.tensor.numpy()
    class_ids = instances.pred_classes.tolist()
    scores = instances.scores.tolist()  # Get prediction scores

    img_height, img_width = img.shape[:2]

    image_counter = 1
    for box, class_id, score in zip(boxes, class_ids, scores):
        # Skip predictions below the confidence threshold
        if score < confidence_threshold:
            continue

        class_name = class_names.get(class_id, "unknown")
        # Save only figures, tables and text regions
        if class_name not in saved_classes:
            continue

        x1, y1, x2, y2 = map(int, box)
        # Clamp to the image: a negative coordinate would otherwise be read
        # as an index from the far edge and yield a wrong or empty crop.
        x1, y1 = max(x1, 0), max(y1, 0)
        x2, y2 = min(x2, img_width), min(y2, img_height)
        if x2 <= x1 or y2 <= y1:
            continue  # nothing left to crop

        cropped_image = img[y1:y2, x1:x2]
        if np.std(cropped_image) > threshold_value and (y2 - y1) > min_height:
            save_path = os.path.join(dest_folder, f"page_{page_num}_{class_name}_{image_counter}.jpg")
            cv2.imwrite(save_path, cropped_image)
            image_counter += 1
121
+
122
+
123
def delete_files_in_folder(folder_path):
    """Remove every regular file directly inside ``folder_path``.

    Entries that are not files (for example sub-directories) are left alone.
    """
    for entry in os.listdir(folder_path):
        candidate = os.path.join(folder_path, entry)
        if not os.path.isfile(candidate):
            continue
        os.remove(candidate)
128
+
129
+
130
+
131
def rename_files_sequentially(folder_path):
    """Rename ``page_{n}_{class}_{k}.jpg`` files to ``{class}_{seq}.jpg``.

    Matching files are ordered by page number, then lower-cased class name,
    then per-page counter, and each class is numbered from 1 in that order.
    Files whose names do not match the pattern exactly are left untouched.

    Args:
        folder_path: Directory whose files are renamed in place.
    """
    # The dot is escaped and the whole name must match, so look-alikes such
    # as 'page_0_text_1xjpg' or 'page_0_text_1.jpg.bak' are not renamed.
    pattern = re.compile(r'page_(\d+)_(\w+)_(\d+)\.jpg', re.IGNORECASE)

    # Parse each name once and keep the sort key next to the file name.
    parsed = []
    for filename in os.listdir(folder_path):
        match = pattern.fullmatch(filename)
        if match is None:
            continue
        page_num, class_name, image_counter = match.groups()
        parsed.append((int(page_num), class_name.lower(), int(image_counter), filename))
    parsed.sort(key=lambda item: item[:3])

    # Next sequential number per class
    counters = {}
    for _, class_name, _, filename in parsed:
        sequence = counters.get(class_name, 0) + 1
        counters[class_name] = sequence

        # New filename format: '{class_name}_{sequential_number}.jpg'
        new_filename = f"{class_name}_{sequence}.jpg"
        os.rename(os.path.join(folder_path, filename), os.path.join(folder_path, new_filename))
165
+
166
+
167
def ocr_folder(folder_path):
    """OCR every ``text_{n}.jpg`` in ``folder_path`` into ``ocr_results/``.

    For each matching image the recognised text is written to
    ``{folder_path}/ocr_results/{image name without extension}.txt``.

    Args:
        folder_path: Directory containing the cropped text images.
    """
    # Whole-name match for 'text_{number}.jpg' (case-insensitive)
    pattern = re.compile(r'text_\d+\.jpg', re.IGNORECASE)

    # Create the subfolder for the OCR text files (no-op if it exists)
    ocr_text_folder = os.path.join(folder_path, "ocr_results")
    os.makedirs(ocr_text_folder, exist_ok=True)

    for filename in os.listdir(folder_path):
        if not pattern.fullmatch(filename):
            continue

        text = ocr_image(os.path.join(folder_path, filename))

        # splitext handles any letter-case of the extension; a plain
        # replace('.jpg', ...) would leave an upper-case '.JPG' in place.
        text_file_name = os.path.splitext(filename)[0] + '.txt'
        text_file_path = os.path.join(ocr_text_folder, text_file_name)
        with open(text_file_path, 'w') as file:
            file.write(text)
188
+
189
def ocr_image(image_path):
    """Return the text pytesseract recognises in the image at ``image_path``."""
    # The context manager releases the underlying file handle once OCR is
    # done; this matters when many images are processed in one run.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)
pdfsummary_fun.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import DirectoryLoader, TextLoader
2
+ from langchain_openai import OpenAIEmbeddings
3
+ from sklearn.cluster import KMeans
4
+ import numpy as np
5
+ from sklearn.decomposition import PCA
6
+ from langchain_core.output_parsers import StrOutputParser
7
+ from langchain_core.prompts import ChatPromptTemplate
8
+ from langchain_openai import ChatOpenAI
9
+ import os
10
+
11
+
12
+ from dotenv import load_dotenv
13
+ load_dotenv() # This loads the variables from .env
14
+ openai_api_key = os.getenv('openai_api_key')
15
+
16
def pdf_summary(ocr_results_folder):
    """Summarise an article from the OCR text files in ``ocr_results_folder``.

    All ``*.txt`` files under the folder are embedded, clustered with
    k-means, and the text closest to each cluster centre is passed to a chat
    model with a Goal/Method/Results/Conclusion prompt.

    Args:
        ocr_results_folder: Directory searched recursively for ``*.txt`` files.

    Returns:
        The model's summary as a string.

    Raises:
        ValueError: If no text files are found in ``ocr_results_folder``.
    """
    loader = DirectoryLoader(ocr_results_folder, glob="**/*.txt", loader_cls=TextLoader)

    docs = loader.load()
    page_contents = [doc.page_content for doc in docs]
    if not page_contents:
        raise ValueError(f"No text files found in {ocr_results_folder!r}")

    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=openai_api_key)
    embeddings = embeddings_model.embed_documents(page_contents)

    X = np.array(embeddings)
    # KMeans needs at least as many samples as clusters, so short documents
    # get fewer clusters instead of raising.
    num_clusters = min(20, len(page_contents))
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(X)
    centroids = kmeans.cluster_centers_

    # One representative text per cluster: the one nearest the centre
    closest_point_indices = find_closest_point_indices(X, centroids, 1)
    extracted_contents = [page_contents[index[0]] for index in closest_point_indices[:num_clusters]]

    prompt = ChatPromptTemplate.from_template("Summarize the article based on the texts provided from four aspects: Goal, Method, Results, and Conclusion: {topic}")
    model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=openai_api_key)
    output_parser = StrOutputParser()

    chain = prompt | model | output_parser

    results = chain.invoke({"topic": ' '.join(extracted_contents)})

    return results
48
+
49
def find_closest_point_indices(X, centroids, num_points=1):
    """For each centroid, return the indices of its nearest rows in ``X``.

    Args:
        X: 2-D array with one point per row.
        centroids: Iterable of centre vectors, each subtractable from ``X``.
        num_points: How many nearest indices to keep per centroid.

    Returns:
        A list with one index array per centroid, ordered from nearest to
        farthest by Euclidean distance.
    """
    def _nearest(center):
        # Row-wise Euclidean distance to this centre, smallest first
        gaps = np.linalg.norm(X - center, axis=1)
        ranking = np.argsort(gaps)
        return ranking[:num_points]

    return [_nearest(center) for center in centroids]