import os

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image

# Load the pre-trained ResNet50 model as a fixed feature extractor
# (no classification head; global average pooling yields one vector per image)
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# Extract a feature vector from a single image
def extract_features(img_path, model):
    img = image.load_img(img_path, target_size=(224, 224))
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)
    img_data = preprocess_input(img_data)
    features = model.predict(img_data, verbose=0)
    return features.flatten()

# Directory containing the images to deduplicate
image_dir = './forward_facing'

# Extract features for all images, skipping anything that is not an image file
valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp')
image_features = {}
for img_file in os.listdir(image_dir):
    if not img_file.lower().endswith(valid_extensions):
        continue
    img_path = os.path.join(image_dir, img_file)
    image_features[img_file] = extract_features(img_path, model)

# Split the dictionary into parallel lists for indexed access
feature_list = list(image_features.values())
file_list = list(image_features.keys())

# Compute the pairwise cosine-similarity matrix (symmetric, zero diagonal)
num_images = len(file_list)
similarity_matrix = np.zeros((num_images, num_images))
for i in range(num_images):
    for j in range(i + 1, num_images):
        similarity = cosine_similarity(
            [feature_list[i]], [feature_list[j]]
        )[0][0]
        similarity_matrix[i][j] = similarity
        similarity_matrix[j][i] = similarity

# Flag the later file of any pair whose similarity exceeds the threshold
threshold = 0.9  # Similarity threshold for duplicates
duplicates = set()
for i in range(num_images):
    for j in range(i + 1, num_images):
        if similarity_matrix[i][j] > threshold:
            duplicates.add(file_list[j])

# Uncomment to actually delete the flagged files:
# for duplicate in duplicates:
#     os.remove(os.path.join(image_dir, duplicate))

print(f"Found {len(duplicates)} duplicate images.")
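
# Aside: the nested similarity loop above can be collapsed into a single
# vectorized call. A minimal sketch (an alternative, not the original
# approach) that reproduces `similarity_matrix` using sklearn's support
# for cosine_similarity(X), which returns all pairwise similarities at once:
vectorized_matrix = cosine_similarity(np.array(feature_list))
np.fill_diagonal(vectorized_matrix, 0.0)  # match the zero diagonal above
assert np.allclose(vectorized_matrix, similarity_matrix)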