Adnan committed on
Commit
42fbf1f
1 Parent(s): aa13b63

Upload dif.py

Browse files
Files changed (1) hide show
  1. dif.py +319 -0
dif.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import skimage.color
2
+ import matplotlib.pyplot as plt
3
+ import numpy as np
4
+ import cv2
5
+ import os
6
+ import time
7
+ import collections
8
+
9
class dif:
    """Find duplicate/similar images in one folder or between two folders.

    Every readable image is decoded, reduced to three channels, resized to
    px_size x px_size pixels and compared pairwise (including 90 degree
    rotations) using the mean squared error (MSE).

    Attributes set by the constructor:
        result (dict).........filename -> {"location": path, "duplicates": [paths]}
        lower_quality (list)..paths of the smaller file (by size on disk) of each match
        stats (dict)..........statistics about the completed search
    """

    # MSE threshold per similarity grade; a pair matches when its MSE is below it
    _SIMILARITY_MSE = {"low": 1000, "normal": 200, "high": 0.1}

    def __init__(self, directory_A, directory_B=None, similarity="normal", px_size=50, sort_output=False, show_output=False, show_progress=False, delete=False, silent_del=False):
        """
        directory_A (str)......folder path to search for duplicate/similar images
        directory_B (str)......second folder path to search for duplicate/similar images
        similarity (str)......."normal" = searches for duplicates, recommended setting, MSE < 200
                               "high" = searches for exact duplicates, extremely sensitive to details, MSE < 0.1
                               "low" = searches for similar images, MSE < 1000
        px_size (int)..........recommended not to change default value
                               resize images to px_size height x width (in pixels) before being compared
                               the higher the pixel size, the more computational resources and time required
        sort_output (bool).....False = adds the duplicate images to output dictionary in the order they were found
                               True = sorts the duplicate images in the output dictionary alphabetically
        show_output (bool).....False = omits the output and doesn't show found images
                               True = shows duplicate/similar images found in output
        show_progress (bool)...False = does not print progress information
                               True = shows where your lengthy processing currently is
        delete (bool)..........! please use with care, as this cannot be undone
                               lower resolution duplicate images that were found are automatically deleted
        silent_del (bool)......! please use with care, as this cannot be undone
                               True = skips the asking for user confirmation when deleting lower resolution duplicate images
                               will only work if "delete" AND "silent_del" are both == True

        OUTPUT.................self.result: a dictionary with the filename of the duplicate images
                               self.lower_quality: a list of the lower resolution images of all duplicates
                               self.stats: a dictionary with statistics about the search

        Raises FileNotFoundError if a folder does not exist and ValueError if
        a parameter has an invalid value.
        """
        start_time = time.time()
        print("DifPy process initializing...", end="\r")

        # Fail early if a folder does not exist (the return value is not needed here).
        dif._process_directory(directory_A)
        if directory_B is not None:
            dif._process_directory(directory_B)
        else:
            # a single folder is compared against itself
            directory_B = directory_A

        dif._validate_parameters(sort_output, show_output, show_progress, similarity, px_size, delete, silent_del)

        # The search functions already sort the result when sort_output is set.
        if directory_B == directory_A:
            result, lower_quality, total = dif._search_one_dir(directory_A,
                                                               similarity, px_size,
                                                               sort_output, show_output, show_progress)
        else:
            result, lower_quality, total = dif._search_two_dirs(directory_A, directory_B,
                                                                similarity, px_size,
                                                                sort_output, show_output, show_progress)

        end_time = time.time()
        time_elapsed = np.round(end_time - start_time, 4)
        stats = dif._generate_stats(directory_A, directory_B,
                                    time.localtime(start_time), time.localtime(end_time), time_elapsed,
                                    similarity, total, len(result))

        self.result = result
        self.lower_quality = lower_quality
        self.stats = stats

        images = "image" if len(result) == 1 else "images"
        print("Found", len(result), images, "with one or more duplicate/similar images in", time_elapsed, "seconds.")

        if result and delete:
            if silent_del:
                dif._delete_imgs(set(lower_quality))
            else:
                usr = input("Are you sure you want to delete all lower resolution duplicate images? \nThis cannot be undone. (y/n)")
                if str(usr) == "y":
                    dif._delete_imgs(set(lower_quality))
                else:
                    print("Image deletion canceled.")

    @staticmethod
    def _search_one_dir(directory_A, similarity="normal", px_size=50, sort_output=False, show_output=False, show_progress=False):
        """Search one directory for duplicate/similar images.

        Returns a tuple (result, lower_quality, total) where total is the
        number of images that could be loaded from the directory.
        """
        img_matrices_A, filenames_A = dif._create_imgs_matrix(directory_A, px_size)
        total = len(img_matrices_A)
        result = {}
        lower_quality = []

        ref = dif._map_similarity(similarity)

        for count_A, imageMatrix_A in enumerate(img_matrices_A):
            if show_progress:
                dif._show_progress(count_A, img_matrices_A)
            # only compare with later images so every pair is checked once
            for count_B in range(count_A + 1, total):
                match = dif._match_with_rotations(imageMatrix_A, img_matrices_A[count_B], ref)
                if match is None:
                    continue
                imageMatrix_B, err = match
                if show_output:
                    dif._show_img_figs(imageMatrix_A, imageMatrix_B, err)
                    dif._show_file_info(str("..." + directory_A[-35:]) + "/" + filenames_A[count_A],
                                        str("..." + directory_A[-35:]) + "/" + filenames_A[count_B])
                dif._record_match(result, lower_quality, directory_A, directory_A,
                                  filenames_A[count_A], filenames_A[count_B])

        if sort_output == True:
            result = collections.OrderedDict(sorted(result.items()))
        return result, lower_quality, total

    @staticmethod
    def _search_two_dirs(directory_A, directory_B=None, similarity="normal", px_size=50, sort_output=False, show_output=False, show_progress=False):
        """Search for images of directory_A that have duplicates in directory_B.

        Returns a tuple (result, lower_quality, total) where total is the
        number of images that could be loaded from both directories.
        """
        img_matrices_A, filenames_A = dif._create_imgs_matrix(directory_A, px_size)
        img_matrices_B, filenames_B = dif._create_imgs_matrix(directory_B, px_size)
        total = len(img_matrices_A) + len(img_matrices_B)
        result = {}
        lower_quality = []

        ref = dif._map_similarity(similarity)

        for count_A, imageMatrix_A in enumerate(img_matrices_A):
            if show_progress:
                dif._show_progress(count_A, img_matrices_A)
            for count_B, candidate in enumerate(img_matrices_B):
                match = dif._match_with_rotations(imageMatrix_A, candidate, ref)
                if match is None:
                    continue
                imageMatrix_B, err = match
                if show_output:
                    dif._show_img_figs(imageMatrix_A, imageMatrix_B, err)
                    dif._show_file_info(str("..." + directory_A[-35:]) + "/" + filenames_A[count_A],
                                        str("..." + directory_B[-35:]) + "/" + filenames_B[count_B])
                dif._record_match(result, lower_quality, directory_A, directory_B,
                                  filenames_A[count_A], filenames_B[count_B])

        if sort_output == True:
            result = collections.OrderedDict(sorted(result.items()))

        return result, lower_quality, total

    @staticmethod
    def _match_with_rotations(imageA, imageB, ref):
        """Compare imageA with imageB in all four 90 degree orientations.

        Returns (rotated_imageB, mse) for the first orientation whose MSE is
        below ref, or None if no orientation matches.
        """
        candidate = imageB
        for rotation in range(4):
            if rotation:
                candidate = dif._rotate_img(candidate)
            err = dif._mse(imageA, candidate)
            if err < ref:
                return candidate, err
        return None

    @staticmethod
    def _record_match(result, lower_quality, directory_A, directory_B, filename_A, filename_B):
        """Add a found duplicate to result and its smaller file to lower_quality.

        Both containers are modified in place. Paths keep the
        directory + "/" + filename format used throughout the output.
        """
        duplicate_path = directory_B + "/" + filename_B
        entry = result.get(filename_A)
        if entry is None:
            result[filename_A] = {"location": directory_A + "/" + filename_A,
                                  "duplicates": [duplicate_path]}
        else:
            entry["duplicates"].append(duplicate_path)
        try:
            _, low = dif._check_img_quality(directory_A, directory_B, filename_A, filename_B)
        except OSError:
            # a file or folder vanished or is unreadable: keep the match,
            # but do not nominate either file for deletion
            return
        lower_quality.append(low)

    @staticmethod
    def _process_directory(directory):
        """Return directory with a trailing path separator appended.

        Raises FileNotFoundError if the directory does not exist.
        """
        directory += os.sep
        if not os.path.isdir(directory):
            raise FileNotFoundError(f"Directory: {directory} does not exist")
        return directory

    @staticmethod
    def _validate_parameters(sort_output, show_output, show_progress, similarity, px_size, delete, silent_del):
        """Validate the input parameters of DifPy; raises ValueError if one is invalid."""

        def check_flag(name, value):
            if value not in (True, False):
                raise ValueError(f'Invalid value for "{name}" parameter.')

        check_flag("sort_output", sort_output)
        check_flag("show_output", show_output)
        check_flag("show_progress", show_progress)
        if similarity not in ["low", "normal", "high"]:
            raise ValueError('Invalid value for "similarity" parameter.')
        # px_size is used as a pixel dimension; a non-integer value would make
        # every resize fail and silently produce an empty search
        if (isinstance(px_size, bool)
                or not isinstance(px_size, (int, np.integer))
                or px_size < 10 or px_size > 5000):
            raise ValueError('Invalid value for "px_size" parameter.')
        check_flag("delete", delete)
        check_flag("silent_del", silent_del)

    @staticmethod
    def _to_three_channels(img):
        """Return img as a contiguous height x width x 3 array.

        2-D (grayscale) input is replicated into three channels; input with
        fewer than three channels uses its first channel as gray value; any
        additional channel (e.g. alpha) is dropped.
        """
        if img.ndim == 2:
            return np.stack((img, img, img), axis=-1)
        if img.shape[-1] < 3:
            gray = img[..., 0]
            return np.stack((gray, gray, gray), axis=-1)
        return np.ascontiguousarray(img[..., 0:3])

    @staticmethod
    def _create_imgs_matrix(directory, px_size):
        """Load all images of a folder as px_size x px_size x 3 matrices.

        Returns (imgs_matrix, img_filenames), two lists of equal length.
        Sub-folders and files that cannot be read or decoded are skipped.
        """
        directory = dif._process_directory(directory)
        size = int(px_size)
        imgs_matrix = []
        img_filenames = []
        for filename in os.listdir(directory):
            path = os.path.join(directory, filename)
            if os.path.isdir(path):
                continue
            try:
                # np.fromfile + imdecode is used instead of reading by path
                img = cv2.imdecode(np.fromfile(path, dtype=np.uint8), cv2.IMREAD_UNCHANGED)
                if not isinstance(img, np.ndarray):
                    # not decodable as an image
                    continue
                # normalize channels BEFORE resizing; slicing a 2-D grayscale
                # array with [..., 0:3] would keep only three pixel columns
                img = dif._to_three_channels(img)
                img = cv2.resize(img, dsize=(size, size), interpolation=cv2.INTER_CUBIC)
            except Exception:
                # best effort: skip unreadable files, but do not swallow
                # KeyboardInterrupt/SystemExit like a bare except would
                continue
            imgs_matrix.append(img)
            img_filenames.append(filename)
        return imgs_matrix, img_filenames

    @staticmethod
    def _map_similarity(similarity):
        """Map the similarity grade to its MSE threshold (200 for unknown grades)."""
        return dif._SIMILARITY_MSE.get(similarity, 200)

    @staticmethod
    def _mse(imageA, imageB):
        """Return the squared error between two image matrices per pixel.

        The sum over all elements (including channels) is divided by
        height * width of imageA.
        """
        # cast to float first so that unsigned integer pixels cannot wrap around
        err = np.sum((imageA.astype("float") - imageB.astype("float")) ** 2)
        err /= float(imageA.shape[0] * imageA.shape[1])
        return err

    @staticmethod
    def _show_img_figs(imageA, imageB, err):
        """Plot two compared image matrices side by side with their MSE."""
        fig = plt.figure()
        plt.suptitle("MSE: %.2f" % (err))
        for position, image in enumerate((imageA, imageB), start=1):
            fig.add_subplot(1, 2, position)
            if image.ndim == 3 and image.shape[-1] == 3:
                # matrices decoded by OpenCV are BGR, matplotlib expects RGB
                image = image[..., ::-1]
            plt.imshow(image, cmap=plt.cm.gray)
            plt.axis("off")
        plt.show()

    @staticmethod
    def _show_file_info(imageA, imageB):
        """Print the filenames of two images that were found to be duplicates."""
        print("""Duplicate files:\n{} and \n{}""".format(imageA, imageB))

    @staticmethod
    def _show_progress(count, img_matrix):
        """Print a single-line progress indicator for the image at index count."""
        total = len(img_matrix)
        print("DifPy processing images: [{}/{}] [{:.0%}]".format(count, total, count/total), end="\r")
        if count + 1 == total:
            # last image: print the completed state and end the line
            print("DifPy processing images: [{}/{}] [{:.0%}]".format(count+1, total, (count+1)/total))

    @staticmethod
    def _rotate_img(image):
        """Return the image matrix rotated by 90 degrees."""
        return np.rot90(image, k=1, axes=(0, 1))

    @staticmethod
    def _check_img_quality(directoryA, directoryB, imageA, imageB):
        """Compare two image files by their size on disk.

        Returns (higher_quality_path, lower_quality_path); imageA counts as
        the higher quality one if both files have the same size.
        Raises OSError if a folder or file cannot be accessed.
        """
        dirA = dif._process_directory(directoryA)
        dirB = dif._process_directory(directoryB)
        size_imgA = os.stat(os.path.join(dirA, imageA)).st_size
        size_imgB = os.stat(os.path.join(dirB, imageB)).st_size
        path_A = directoryA + "/" + imageA
        path_B = directoryB + "/" + imageB
        if size_imgA >= size_imgB:
            return path_A, path_B
        return path_B, path_A

    @staticmethod
    def _generate_stats(directoryA, directoryB, start_time, end_time, time_elapsed, similarity, total_searched, total_found):
        """Build the statistics dictionary of a completed DifPy process.

        start_time and end_time are time.struct_time values.
        """
        stats = {}
        stats["directory_1"] = directoryA
        if directoryB is not None:
            stats["directory_2"] = directoryB
        stats["duration"] = {"start_date": time.strftime("%Y-%m-%d", start_time),
                             "start_time": time.strftime("%H:%M:%S", start_time),
                             "end_date": time.strftime("%Y-%m-%d", end_time),
                             "end_time": time.strftime("%H:%M:%S", end_time),
                             "seconds_elapsed": time_elapsed}
        stats["similarity_grade"] = similarity
        stats["similarity_mse"] = dif._map_similarity(similarity)
        stats["total_images_searched"] = total_searched
        stats["total_images_found"] = total_found
        return stats

    @staticmethod
    def _delete_imgs(lower_quality_set):
        """Delete the given image files and print how many were removed."""
        deleted = 0
        for file in lower_quality_set:
            print("\nDeletion in progress...", end="\r")
            try:
                os.remove(file)
            except OSError:
                print("Could not delete file:", file, end="\r")
            else:
                print("Deleted file:", file, end="\r")
                deleted += 1
        print("\n***\nDeleted", deleted, "images.")