"""Gradio chatbot that answers text with Gemini and analyzes images with DETR.

Text messages are forwarded to a Gemini chat model; uploaded images are run
through the ``facebook/detr-resnet-50`` object detector and the detected
object names are posted into the same chat transcript.
"""

import json
import os

import gradio as gr
import requests
import torch
from langchain_google_genai.chat_models import ChatGoogleGenerativeAI  # Import Gemini
from PIL import Image
from transformers import DetrImageProcessor, DetrForObjectDetection

# Load credentials (stringified JSON) from environment variable for Gemini
credentials_string = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
if not credentials_string:
    raise ValueError("GOOGLE_APPLICATION_CREDENTIALS is not set in the environment!")

# The variable is rewritten below to point at a file. If it already names an
# existing file (e.g. the module is reloaded, or a child process inherited the
# rewritten environment), there is nothing to parse and nothing to write.
if not os.path.isfile(credentials_string):
    # Parse the stringified JSON back to a Python dictionary
    credentials = json.loads(credentials_string)

    # Save the credentials to a JSON file (required by Google SDKs). The file
    # holds a private key, so create it readable by the owner only.
    credentials_fd = os.open(
        "service_account.json",
        os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        0o600,
    )
    with os.fdopen(credentials_fd, "w") as f:
        json.dump(credentials, f)

    # Point GOOGLE_APPLICATION_CREDENTIALS at the file that was just written
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "service_account.json"

# Initialize Gemini model (chatbot)
llm = ChatGoogleGenerativeAI(model='gemini-1.5-pro')

# Initialize DETR model and processor for object detection
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

# Alphabetical list of COCO-style class names.
# NOTE: this list is NOT index-aligned with the label ids the DETR checkpoint
# emits, so it must not be indexed with a predicted label. Label names are
# resolved through ``model.config.id2label`` instead; the list is kept only so
# the module-level name remains available.
COCO_CLASSES = [
    'airplane', 'apple', 'backpack', 'banana', 'baseball hat', 'baseball glove',
    'bear', 'bed', 'bench', 'bicycle', 'bird', 'boat', 'book', 'bottle', 'bowl',
    'broccoli', 'bus', 'cake', 'car', 'carrot', 'cat', 'cell phone', 'chair',
    'clock', 'couch', 'cow', 'cup', 'dining table', 'dog', 'donut', 'elephant',
    'fire hydrant', 'fork', 'frisbee', 'giraffe', 'hair drier', 'handbag',
    'horse', 'hot dog', 'keyboard', 'kite', 'knife', 'laptop', 'microwave',
    'motorcycle', 'mouse', 'orange', 'oven', 'parking meter', 'person', 'pizza',
    'potted plant', 'refrigerator', 'remote', 'sandwich', 'scissors', 'sheep',
    'sink', 'skateboard', 'skis', 'snowboard', 'spoon', 'sports ball',
    'stop sign', 'suitcase', 'surfboard', 'teddy bear', 'tennis racket', 'tie',
    'toaster', 'toilet', 'toothbrush', 'traffic light', 'train', 'truck', 'tv',
    'umbrella', 'vase', 'wine glass',
]

# Extra lower bound applied to detection scores. The processor's
# post-processing step already drops detections below its own threshold
# (0.5 by default per the Transformers documentation) when none is passed,
# so this value only matters if that default is ever lowered.
MIN_DETECTION_SCORE = 0.1

# Global chat history variable: a list of (user_text, bot_text) tuples.
# NOTE: being module-level, it is shared by every browser session of the app.
chat_history = []


# Function for chatting with Gemini
def chat_with_gemini(message):
    """Send *message* to Gemini and record the exchange in the chat history.

    Args:
        message: Text typed by the user.

    Returns:
        The global ``chat_history`` list. A blank or whitespace-only message
        is ignored and the history is returned unchanged.
    """
    if not message or not message.strip():
        return chat_history

    # ``invoke`` is the supported chat-model entry point (``predict`` is
    # deprecated in LangChain); the reply text is on the message's ``content``.
    bot_response = llm.invoke(message).content
    chat_history.append((message, bot_response))
    return chat_history


# Function for analyzing the uploaded image
def analyze_image(image_path):
    """Detect objects in the image at *image_path* and post them to the chat.

    Args:
        image_path: Filesystem path of the uploaded image, or ``None`` when
            the upload widget has been cleared.

    Returns:
        The global ``chat_history`` list. On success a summary of detected
        objects is appended; on failure an error entry is appended instead.
        A cleared upload (falsy path) leaves the history unchanged.
    """
    # The upload widget's ``change`` event also fires when the image is
    # removed; that is not an analysis request and not an error.
    if not image_path:
        return chat_history

    try:
        # Open and preprocess the image; ``convert`` returns a fully loaded
        # copy, so the underlying file can be closed right away.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        inputs = processor(images=image, return_tensors="pt")

        # Perform inference
        with torch.no_grad():
            outputs = model(**inputs)

        # Rescale boxes to the original image size: (height, width)
        target_sizes = torch.tensor([image.size[::-1]])
        results = processor.post_process_object_detection(
            outputs, target_sizes=target_sizes
        )[0]

        # Resolve names through the checkpoint's own id -> label mapping so
        # they always match the ids the model actually predicts.
        id2label = model.config.id2label
        detected_objects = []
        for label, score in zip(results["labels"], results["scores"]):
            confidence = score.item()
            if confidence > MIN_DETECTION_SCORE:
                object_name = id2label[label.item()]
                detected_objects.append(f"{object_name} (score: {confidence:.2f})")

        if detected_objects:
            bot_response = f"Objects detected: {', '.join(detected_objects)}."
        else:
            bot_response = "No objects detected."

        chat_history.append(("Uploaded an image for analysis", bot_response))
        return chat_history

    except Exception as e:
        # Handler boundary: surface the failure in the transcript rather than
        # letting the UI callback crash.
        error_msg = f"Error processing the image: {str(e)}"
        chat_history.append(("Error during image analysis", error_msg))
        return chat_history


# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Ken Chatbot")
    gr.Markdown("Ask me anything or upload an image for analysis!")

    # Chatbot display without "User" or "Bot" labels
    chatbot = gr.Chatbot(elem_id="chatbot")

    # User input components
    msg = gr.Textbox(
        label="Type your message here...",
        placeholder="Enter your message...",
        show_label=False,
    )
    send_btn = gr.Button("Send")
    img_upload = gr.Image(type="filepath", label="Upload an image for analysis")

    # Define interactions
    def handle_text_message(message):
        """Forward a submitted text message to the Gemini chat handler."""
        return chat_with_gemini(message)

    def handle_image_upload(image_path):
        """Forward an uploaded image path to the object-detection handler."""
        return analyze_image(image_path)

    # Send on Enter or on button click; both paths clear the input field
    msg.submit(handle_text_message, msg, chatbot)
    msg.submit(lambda: "", None, msg)  # Clear input field
    send_btn.click(handle_text_message, msg, chatbot)
    send_btn.click(lambda: "", None, msg)  # Clear input field
    img_upload.change(handle_image_upload, img_upload, chatbot)

    # Placeholder for custom CSS/HTML styling (currently empty)
    gr.HTML(""" """)

# Launch the Gradio interface
demo.launch()