Spaces:

dcarpintero
/

minerva

Running

File size: 3,000 Bytes

dda22df
 
 
 
 
 
 
 
 
 
 
 
 
 
3dd5cf3
dda22df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7210215
dda22df
 
 
 
 
3dd5cf3
 
 
dda22df
 
 
 
 
 
 
 
 
 
3dd5cf3
dda22df
 
 
3dd5cf3
 
 
 
dda22df
3dd5cf3
dda22df
 
 
 
 
7210215
dda22df
 
 
 
 
 
3dd5cf3
dda22df
 
7210215
dda22df

import os
from typing import Dict, List, Tuple

from PIL import Image
import pytesseract
import requests
from dotenv import load_dotenv, find_dotenv


class Tools:
    """Utility helpers for OCR, short-URL expansion and URL safety lookups.

    The safety lookup talks to the Google Safe Browsing v4
    ``threatMatches:find`` endpoint and needs the ``SAFEBROWSING_API_KEY``
    environment variable (optionally loaded from a ``.env`` file).
    """

    # Seconds to wait on any outbound HTTP call. Without an explicit timeout
    # `requests` may block indefinitely on an unresponsive host.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Load environment configuration and set Safe Browsing defaults."""
        load_dotenv(find_dotenv())

        self.safebrowsing_key = os.getenv("SAFEBROWSING_API_KEY")
        self.base_url = "https://safebrowsing.googleapis.com/v4"
        self.client_id = "minerva"
        self.client_version = "0.1.0"
        self.threat_types = [
            "MALWARE",
            "SOCIAL_ENGINEERING",
            "UNWANTED_SOFTWARE",
            "POTENTIALLY_HARMFUL_APPLICATION"
        ]

    def ocr(self, image_path: str) -> str:
        """Extract text from an image using OCR.

        Args:
            image_path: Path of the image file to read.

        Returns:
            The recognised text, or a string starting with
            ``"Error in text extraction: "`` if opening or processing the
            image fails (errors are reported in-band, not raised).
        """
        try:
            # The context manager releases the underlying file handle even
            # when tesseract fails part-way through.
            with Image.open(image_path) as image:
                return pytesseract.image_to_string(image)
        except Exception as e:
            # Deliberate best-effort: callers receive the error as text.
            return f"Error in text extraction: {str(e)}"

    def expand_url(self, url: str) -> str:
        """Expand a shortened URL by following its redirects.

        Args:
            url: The URL to resolve.

        Returns:
            The final URL after redirects, or ``url`` unchanged when the
            request fails (including on timeout).
        """
        try:
            response = requests.head(
                url, allow_redirects=True, timeout=self.REQUEST_TIMEOUT
            )
            return response.url
        except requests.exceptions.RequestException:
            return url  # Return original URL if expansion fails

    @staticmethod
    def _summarize_matches(result) -> Tuple[str, List[Dict[str, str]]]:
        """Turn a decoded ``threatMatches:find`` response into a verdict.

        The verdict is ``"Flagged"`` only when the response actually lists at
        least one match; an empty body, an empty ``matches`` list or a body
        without ``matches`` all yield ``"Not Flagged"``.
        """
        matches = result.get("matches") if isinstance(result, dict) else None
        if not matches:
            return "Not Flagged", []

        threats = [
            {
                "threat_type": match.get("threatType"),
                "threat_url": match.get("threat", {}).get("url"),
            }
            for match in matches
        ]
        return "Flagged", threats

    def is_url_safe(self, target_url: str) -> Tuple[str, List[Dict[str, str]]]:
        """Check a URL against the Google Safe Browsing API.

        Both ``target_url`` and, when different, its redirect-expanded form
        are submitted in a single lookup.

        Args:
            target_url: The URL to check.

        Returns:
            A ``(verdict, threats)`` tuple. ``verdict`` is ``"Flagged"`` or
            ``"Not Flagged"``; ``threats`` holds one dict per match with the
            keys ``"threat_type"`` and ``"threat_url"``.

        Raises:
            ValueError: If no API key is configured.
            Exception: If the HTTP request to the API fails.
        """
        if not self.safebrowsing_key:
            raise ValueError("SAFEBROWSING_API_KEY is missing.")

        safe_endpoint = f"{self.base_url}/threatMatches:find?key={self.safebrowsing_key}"
        expanded_url = self.expand_url(target_url)

        threat_entries = [{"url": target_url}]
        if expanded_url != target_url:
            threat_entries.append({"url": expanded_url})

        request_body = {
            "client": {
                "clientId": self.client_id,
                "clientVersion": self.client_version
            },
            "threatInfo": {
                "threatTypes": self.threat_types,
                "platformTypes": ["ANY_PLATFORM"],
                "threatEntryTypes": ["URL"],
                "threatEntries": threat_entries
            }
        }

        try:
            response = requests.post(
                safe_endpoint, json=request_body, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            result = response.json()
        except requests.exceptions.RequestException as e:
            # requests error text can include the request URL, which carries
            # the API key as a query parameter; mask it in the message we raise.
            detail = str(e).replace(self.safebrowsing_key, "[REDACTED]")
            raise Exception(f"Error checking URL safety: {detail}") from e

        return self._summarize_matches(result)