diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -1,6 +1,6 @@ """ -Comprehensive NovaEval Space by Noveum.ai -Advanced AI Model Evaluation Platform with Real NovaEval Integration +Advanced NovaEval Space by Noveum.ai +Comprehensive AI Model Evaluation Platform with Hugging Face Models """ import asyncio @@ -18,22 +18,20 @@ from fastapi.responses import HTMLResponse from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel import httpx +import traceback -# Configure logging +# Configure logging to stdout only (no file logging to avoid permission issues) logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - handlers=[ - logging.StreamHandler(sys.stdout), - logging.FileHandler('novaeval.log') - ] + handlers=[logging.StreamHandler(sys.stdout)] ) logger = logging.getLogger(__name__) app = FastAPI( title="NovaEval by Noveum.ai", - description="Advanced AI Model Evaluation Platform", - version="1.0.0" + description="Advanced AI Model Evaluation Platform with Hugging Face Models", + version="2.0.0" ) app.add_middleware( @@ -48,14 +46,11 @@ app.add_middleware( class EvaluationRequest(BaseModel): models: List[str] dataset: str - dataset_subset: Optional[str] = None metrics: List[str] - num_samples: int = 10 + sample_size: int = 50 temperature: float = 0.7 - max_tokens: int = 150 - top_p: float = 1.0 - frequency_penalty: float = 0.0 - presence_penalty: float = 0.0 + max_tokens: int = 512 + top_p: float = 0.9 class EvaluationResponse(BaseModel): evaluation_id: str @@ -63,810 +58,593 @@ class EvaluationResponse(BaseModel): message: str # Global state -active_evaluations: Dict[str, Dict] = {} -websocket_connections: List[WebSocket] = [] +active_evaluations = {} +websocket_connections = {} -# NovaEval Compatible Models (LLMs only) -SUPPORTED_MODELS = { - "openai": { - "gpt-4o": { - "name": "GPT-4o", - "provider": "OpenAI", - "description": "Latest and most capable GPT-4 model", - "context_length": 128000, - "cost_per_1k_tokens": 0.005, - "capabilities": ["text", "reasoning", "analysis"] +# Hugging Face Models Configuration +HF_MODELS = { + "small": [ + { + "id": "google/flan-t5-large", + "name": "FLAN-T5 Large", + "size": "0.8B", + "description": "Best pretrained model around 1B parameters", + "capabilities": ["text-generation", "reasoning", "qa"], + "cost_per_1k": 0.0, + "provider": "Google" + }, + { + "id": "Qwen/Qwen2.5-3B", + "name": "Qwen 2.5 3B", + "size": "3B", + "description": "Best pretrained model around 3B parameters", + "capabilities": ["text-generation", "reasoning", "multilingual"], + "cost_per_1k": 0.0, + "provider": "Alibaba" }, - "gpt-4o-mini": { - "name": "GPT-4o Mini", - "provider": "OpenAI", - "description": "Cost-effective GPT-4 model", - "context_length": 128000, - "cost_per_1k_tokens": 0.00015, - "capabilities": ["text", "reasoning", "analysis"] + { + "id": "google/gemma-2b", + "name": "Gemma 2B", + "size": "2B", + "description": "Efficient small model for general tasks", + "capabilities": ["text-generation", "reasoning"], + "cost_per_1k": 0.0, + "provider": "Google" + } + ], + "medium": [ + { + "id": "Qwen/Qwen2.5-7B", + "name": "Qwen 2.5 7B", + "size": "7B", + "description": "Best pretrained model around 7B parameters", + "capabilities": ["text-generation", "reasoning", "analysis"], + "cost_per_1k": 0.0, + "provider": "Alibaba" }, - "gpt-4-turbo": { - "name": "GPT-4 Turbo", - "provider": "OpenAI", - "description": "High-performance GPT-4 variant", - "context_length": 128000, - 
"cost_per_1k_tokens": 0.01, - "capabilities": ["text", "reasoning", "analysis"] + { + "id": "mistralai/Mistral-7B-v0.1", + "name": "Mistral 7B", + "size": "7B", + "description": "Strong general purpose model", + "capabilities": ["text-generation", "reasoning", "analysis"], + "cost_per_1k": 0.0, + "provider": "Mistral AI" }, - "gpt-4": { - "name": "GPT-4", - "provider": "OpenAI", - "description": "Original GPT-4 model", - "context_length": 8192, - "cost_per_1k_tokens": 0.03, - "capabilities": ["text", "reasoning", "analysis"] + { + "id": "microsoft/DialoGPT-medium", + "name": "DialoGPT Medium", + "size": "345M", + "description": "Conversational AI specialist", + "capabilities": ["conversation", "dialogue"], + "cost_per_1k": 0.0, + "provider": "Microsoft" }, - "gpt-3.5-turbo": { - "name": "GPT-3.5 Turbo", - "provider": "OpenAI", - "description": "Fast and efficient model", - "context_length": 16385, - "cost_per_1k_tokens": 0.0015, - "capabilities": ["text", "conversation"] + { + "id": "codellama/CodeLlama-7b-Python-hf", + "name": "CodeLlama 7B Python", + "size": "7B", + "description": "Code generation specialist", + "capabilities": ["code-generation", "python"], + "cost_per_1k": 0.0, + "provider": "Meta" } - }, - "anthropic": { - "claude-3-5-sonnet": { - "name": "Claude 3.5 Sonnet", - "provider": "Anthropic", - "description": "Latest Claude model with enhanced capabilities", - "context_length": 200000, - "cost_per_1k_tokens": 0.003, - "capabilities": ["text", "reasoning", "analysis", "coding"] + ], + "large": [ + { + "id": "Qwen/Qwen2.5-14B", + "name": "Qwen 2.5 14B", + "size": "14B", + "description": "Best pretrained model around 14B parameters", + "capabilities": ["text-generation", "reasoning", "analysis", "complex-tasks"], + "cost_per_1k": 0.0, + "provider": "Alibaba" }, - "claude-3-opus": { - "name": "Claude 3 Opus", - "provider": "Anthropic", - "description": "Most capable Claude 3 model", - "context_length": 200000, - "cost_per_1k_tokens": 0.015, - "capabilities": ["text", "reasoning", "analysis", "coding"] + { + "id": "Qwen/Qwen2.5-32B", + "name": "Qwen 2.5 32B", + "size": "32B", + "description": "Best pretrained model around 32B parameters", + "capabilities": ["text-generation", "reasoning", "analysis", "complex-tasks"], + "cost_per_1k": 0.0, + "provider": "Alibaba" }, - "claude-3-sonnet": { - "name": "Claude 3 Sonnet", - "provider": "Anthropic", - "description": "Balanced Claude 3 model", - "context_length": 200000, - "cost_per_1k_tokens": 0.003, - "capabilities": ["text", "reasoning", "analysis"] + { + "id": "Qwen/Qwen2.5-72B", + "name": "Qwen 2.5 72B", + "size": "72B", + "description": "Best pretrained model around 72B parameters", + "capabilities": ["text-generation", "reasoning", "analysis", "complex-tasks"], + "cost_per_1k": 0.0, + "provider": "Alibaba" + } + ] +} + +# Evaluation Datasets Configuration +EVALUATION_DATASETS = { + "reasoning": [ + { + "id": "Rowan/hellaswag", + "name": "HellaSwag", + "description": "Commonsense reasoning benchmark", + "samples": 60000, + "task_type": "multiple_choice", + "difficulty": "medium" + }, + { + "id": "tau/commonsense_qa", + "name": "CommonsenseQA", + "description": "Commonsense reasoning questions", + "samples": 12100, + "task_type": "multiple_choice", + "difficulty": "medium" }, - "claude-3-haiku": { - "name": "Claude 3 Haiku", - "provider": "Anthropic", - "description": "Fast and efficient Claude 3 model", - "context_length": 200000, - "cost_per_1k_tokens": 0.00025, - "capabilities": ["text", "conversation"] + { + "id": 
"allenai/ai2_arc", + "name": "ARC (AI2 Reasoning Challenge)", + "description": "Science questions requiring reasoning", + "samples": 7790, + "task_type": "multiple_choice", + "difficulty": "hard" } - }, - "aws_bedrock": { - "amazon-titan-text": { - "name": "Amazon Titan Text", - "provider": "AWS Bedrock", - "description": "Amazon's foundation model for text", - "context_length": 8000, - "cost_per_1k_tokens": 0.0008, - "capabilities": ["text", "generation"] + ], + "knowledge": [ + { + "id": "cais/mmlu", + "name": "MMLU", + "description": "Massive Multitask Language Understanding", + "samples": 231000, + "task_type": "multiple_choice", + "difficulty": "hard" }, - "cohere-command": { - "name": "Cohere Command", - "provider": "AWS Bedrock", - "description": "Cohere's command model via Bedrock", - "context_length": 4096, - "cost_per_1k_tokens": 0.0015, - "capabilities": ["text", "conversation"] + { + "id": "google/boolq", + "name": "BoolQ", + "description": "Boolean questions requiring reading comprehension", + "samples": 12700, + "task_type": "yes_no", + "difficulty": "medium" } - }, - "noveum": { - "noveum-gateway": { - "name": "Noveum AI Gateway", - "provider": "Noveum.ai", - "description": "Access to Noveum's curated model collection", - "context_length": "Variable", - "cost_per_1k_tokens": "Variable", - "capabilities": ["text", "reasoning", "analysis", "custom"] + ], + "math": [ + { + "id": "openai/gsm8k", + "name": "GSM8K", + "description": "Grade school math word problems", + "samples": 17600, + "task_type": "generation", + "difficulty": "medium" + }, + { + "id": "deepmind/aqua_rat", + "name": "AQUA-RAT", + "description": "Algebraic reasoning problems", + "samples": 196000, + "task_type": "multiple_choice", + "difficulty": "hard" } - } -} - -# NovaEval Compatible Datasets -SUPPORTED_DATASETS = { - "mmlu": { - "name": "MMLU", - "full_name": "Massive Multitask Language Understanding", - "description": "57-subject benchmark covering elementary mathematics to advanced professional topics", - "type": "multiple_choice", - "subsets": [ - "abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge", - "college_biology", "college_chemistry", "college_computer_science", "college_mathematics", - "college_medicine", "college_physics", "computer_security", "conceptual_physics", - "econometrics", "electrical_engineering", "elementary_mathematics", "formal_logic", - "global_facts", "high_school_biology", "high_school_chemistry", "high_school_computer_science", - "high_school_european_history", "high_school_geography", "high_school_government_and_politics", - "high_school_macroeconomics", "high_school_mathematics", "high_school_microeconomics", - "high_school_physics", "high_school_psychology", "high_school_statistics", "high_school_us_history", - "high_school_world_history", "human_aging", "human_sexuality", "international_law", - "jurisprudence", "logical_fallacies", "machine_learning", "management", "marketing", - "medical_genetics", "miscellaneous", "moral_disputes", "moral_scenarios", "nutrition", - "philosophy", "prehistory", "professional_accounting", "professional_law", "professional_medicine", - "professional_psychology", "public_relations", "security_studies", "sociology", "us_foreign_policy", - "virology", "world_religions" - ], - "sample_count": 14042 - }, - "humaneval": { - "name": "HumanEval", - "full_name": "Human Eval Code Generation", - "description": "Programming problems for evaluating code generation capabilities", - "type": "code_generation", - "subsets": 
["python", "javascript", "java", "cpp"], - "sample_count": 164 - }, - "mbpp": { - "name": "MBPP", - "full_name": "Mostly Basic Python Problems", - "description": "Python programming problems for code generation evaluation", - "type": "code_generation", - "subsets": ["basic", "intermediate", "advanced"], - "sample_count": 974 - }, - "hellaswag": { - "name": "HellaSwag", - "full_name": "HellaSwag Commonsense Reasoning", - "description": "Commonsense reasoning about everyday situations", - "type": "multiple_choice", - "subsets": ["validation", "test"], - "sample_count": 10042 - }, - "arc": { - "name": "ARC", - "full_name": "AI2 Reasoning Challenge", - "description": "Science questions requiring reasoning", - "type": "multiple_choice", - "subsets": ["easy", "challenge"], - "sample_count": 7787 - }, - "truthfulqa": { - "name": "TruthfulQA", - "full_name": "TruthfulQA", - "description": "Questions designed to test truthfulness and avoid falsehoods", - "type": "multiple_choice", - "subsets": ["mc1", "mc2", "generation"], - "sample_count": 817 - }, - "custom": { - "name": "Custom Dataset", - "full_name": "Upload Custom Dataset", - "description": "Upload your own JSON or CSV dataset for evaluation", - "type": "custom", - "subsets": ["json", "csv"], - "sample_count": "Variable" - } + ], + "code": [ + { + "id": "openai/openai_humaneval", + "name": "HumanEval", + "description": "Python code generation benchmark", + "samples": 164, + "task_type": "code_generation", + "difficulty": "hard" + }, + { + "id": "google-research-datasets/mbpp", + "name": "MBPP", + "description": "Mostly Basic Python Problems", + "samples": 1400, + "task_type": "code_generation", + "difficulty": "medium" + } + ], + "language": [ + { + "id": "stanfordnlp/imdb", + "name": "IMDB Reviews", + "description": "Movie review sentiment analysis", + "samples": 100000, + "task_type": "classification", + "difficulty": "easy" + }, + { + "id": "abisee/cnn_dailymail", + "name": "CNN/DailyMail", + "description": "News article summarization", + "samples": 936000, + "task_type": "summarization", + "difficulty": "medium" + } + ] } -# NovaEval Compatible Metrics/Scorers -SUPPORTED_METRICS = { - "accuracy": { +# Evaluation Metrics +EVALUATION_METRICS = [ + { + "id": "accuracy", "name": "Accuracy", - "category": "Accuracy-Based", - "description": "Classification accuracy for multiple choice questions", - "best_for": ["multiple_choice", "classification"] + "description": "Percentage of correct predictions", + "applicable_tasks": ["multiple_choice", "yes_no", "classification"] }, - "exact_match": { - "name": "Exact Match", - "category": "Accuracy-Based", - "description": "Exact string matching between prediction and reference", - "best_for": ["short_answer", "factual"] - }, - "f1_score": { + { + "id": "f1_score", "name": "F1 Score", - "category": "Accuracy-Based", - "description": "F1 score for classification tasks", - "best_for": ["classification", "binary_tasks"] - }, - "semantic_similarity": { - "name": "Semantic Similarity", - "category": "Semantic-Based", - "description": "Embedding-based similarity scoring", - "best_for": ["text_generation", "paraphrasing"] + "description": "Harmonic mean of precision and recall", + "applicable_tasks": ["classification", "multiple_choice"] }, - "bert_score": { - "name": "BERT Score", - "category": "Semantic-Based", - "description": "BERT-based semantic evaluation", - "best_for": ["text_generation", "summarization"] + { + "id": "bleu", + "name": "BLEU Score", + "description": "Bilingual Evaluation Understudy for 
text generation", + "applicable_tasks": ["generation", "summarization", "code_generation"] }, - "rouge_score": { + { + "id": "rouge", "name": "ROUGE Score", - "category": "Semantic-Based", - "description": "ROUGE metrics for text generation", - "best_for": ["summarization", "text_generation"] - }, - "code_execution": { - "name": "Code Execution", - "category": "Code-Specific", - "description": "Execute and validate code outputs", - "best_for": ["code_generation", "programming"] + "description": "Recall-Oriented Understudy for Gisting Evaluation", + "applicable_tasks": ["summarization", "generation"] }, - "syntax_checker": { - "name": "Syntax Checker", - "category": "Code-Specific", - "description": "Validate code syntax", - "best_for": ["code_generation", "programming"] - }, - "llm_judge": { - "name": "LLM Judge", - "category": "Custom", - "description": "Use another LLM as a judge", - "best_for": ["open_ended", "creative_tasks"] + { + "id": "pass_at_k", + "name": "Pass@K", + "description": "Percentage of problems solved correctly", + "applicable_tasks": ["code_generation"] } -} +] -async def broadcast_to_websockets(message: dict): - """Broadcast message to all connected websockets""" - if websocket_connections: - disconnected = [] - for websocket in websocket_connections: - try: - await websocket.send_text(json.dumps(message)) - except: - disconnected.append(websocket) - - # Remove disconnected websockets - for ws in disconnected: - websocket_connections.remove(ws) +async def send_websocket_message(evaluation_id: str, message: dict): + """Send message to WebSocket connection if exists""" + if evaluation_id in websocket_connections: + try: + await websocket_connections[evaluation_id].send_text(json.dumps(message)) + except Exception as e: + logger.error(f"Failed to send WebSocket message: {e}") -async def simulate_novaeval_evaluation(evaluation_id: str, request: EvaluationRequest): - """Simulate NovaEval evaluation with detailed logging""" +async def simulate_evaluation(evaluation_id: str, request: EvaluationRequest): + """Simulate a real evaluation process with detailed logging""" try: - logger.info(f"Starting NovaEval evaluation {evaluation_id}") + # Initialize evaluation + active_evaluations[evaluation_id] = { + "status": "running", + "progress": 0, + "current_step": "Initializing", + "results": {}, + "logs": [], + "start_time": datetime.now() + } - # Update status - active_evaluations[evaluation_id]["status"] = "running" - active_evaluations[evaluation_id]["start_time"] = datetime.now().isoformat() + total_steps = len(request.models) * 5 # 5 steps per model + current_step = 0 - # Broadcast start - await broadcast_to_websockets({ - "type": "evaluation_start", - "evaluation_id": evaluation_id, + await send_websocket_message(evaluation_id, { + "type": "log", + "timestamp": datetime.now().isoformat(), + "level": "INFO", "message": f"๐Ÿš€ Starting NovaEval evaluation with {len(request.models)} models" }) - # Simulate evaluation steps with detailed logging - steps = [ - "๐Ÿ”ง Initializing NovaEval framework", - "๐Ÿ“Š Loading dataset configuration", - "๐Ÿค– Preparing model interfaces", - "๐Ÿ” Validating evaluation parameters", - "๐Ÿ“ฅ Loading evaluation samples", - "โš™๏ธ Setting up scorers and metrics", - "๐Ÿš€ Starting model evaluations" - ] + await send_websocket_message(evaluation_id, { + "type": "log", + "timestamp": datetime.now().isoformat(), + "level": "INFO", + "message": f"๐Ÿ“Š Dataset: {request.dataset} | Sample size: {request.sample_size}" + }) - for i, step in enumerate(steps): 
- await asyncio.sleep(1) - progress = int((i + 1) / len(steps) * 30) # 30% for setup + await send_websocket_message(evaluation_id, { + "type": "log", + "timestamp": datetime.now().isoformat(), + "level": "INFO", + "message": f"๐Ÿ“ Metrics: {', '.join(request.metrics)}" + }) + + # Process each model + for model_id in request.models: + model_name = model_id.split('/')[-1] - log_message = f"[{datetime.now().strftime('%H:%M:%S')}] {step}" - logger.info(log_message) + # Step 1: Load model + current_step += 1 + await send_websocket_message(evaluation_id, { + "type": "progress", + "progress": (current_step / total_steps) * 100, + "current_step": f"Loading {model_name}" + }) - await broadcast_to_websockets({ - "type": "evaluation_progress", - "evaluation_id": evaluation_id, - "progress": progress, - "message": log_message, - "step": step + await send_websocket_message(evaluation_id, { + "type": "log", + "timestamp": datetime.now().isoformat(), + "level": "INFO", + "message": f"๐Ÿค– Loading model: {model_id}" }) - - # Simulate model evaluations - results = {} - for model_idx, model in enumerate(request.models): - model_info = None - for provider, models in SUPPORTED_MODELS.items(): - if model in models: - model_info = models[model] - break - if not model_info: - continue - - await broadcast_to_websockets({ - "type": "model_start", - "evaluation_id": evaluation_id, - "model": model, - "message": f"๐Ÿค– Starting evaluation for {model_info['name']}" + await asyncio.sleep(2) # Simulate model loading time + + # Step 2: Prepare dataset + current_step += 1 + await send_websocket_message(evaluation_id, { + "type": "progress", + "progress": (current_step / total_steps) * 100, + "current_step": f"Preparing dataset for {model_name}" }) - # Simulate model loading and evaluation - model_steps = [ - f"๐Ÿ“ฅ Loading {model_info['name']} ({model_info['provider']})", - f"๐Ÿ”ง Configuring model parameters (temp={request.temperature}, max_tokens={request.max_tokens})", - f"๐Ÿ“Š Running evaluation on {request.num_samples} samples", - f"๐Ÿ“ˆ Computing {', '.join(request.metrics)} metrics", - f"โœ… {model_info['name']} evaluation complete" - ] + await send_websocket_message(evaluation_id, { + "type": "log", + "timestamp": datetime.now().isoformat(), + "level": "INFO", + "message": f"๐Ÿ“ฅ Loading dataset: {request.dataset}" + }) - for step_idx, step in enumerate(model_steps): - await asyncio.sleep(2) - base_progress = 30 + (model_idx * 60 // len(request.models)) - step_progress = base_progress + (step_idx + 1) * (60 // len(request.models)) // len(model_steps) - - log_message = f"[{datetime.now().strftime('%H:%M:%S')}] {step}" - logger.info(log_message) - - await broadcast_to_websockets({ - "type": "evaluation_progress", - "evaluation_id": evaluation_id, - "progress": step_progress, - "message": log_message, - "model": model + await asyncio.sleep(1) + + # Step 3: Run evaluation + current_step += 1 + await send_websocket_message(evaluation_id, { + "type": "progress", + "progress": (current_step / total_steps) * 100, + "current_step": f"Evaluating {model_name}" + }) + + await send_websocket_message(evaluation_id, { + "type": "log", + "timestamp": datetime.now().isoformat(), + "level": "INFO", + "message": f"๐Ÿงช Running evaluation on {request.sample_size} samples" + }) + + # Simulate processing samples + for i in range(0, request.sample_size, 10): + await asyncio.sleep(0.5) + processed = min(i + 10, request.sample_size) + await send_websocket_message(evaluation_id, { + "type": "log", + "timestamp": 
datetime.now().isoformat(), + "level": "DEBUG", + "message": f"๐Ÿ“ Processed {processed}/{request.sample_size} samples" }) - - # Simulate detailed request/response logging for the evaluation step - if "Running evaluation" in step: - for sample_idx in range(min(3, request.num_samples)): # Show first 3 samples - await asyncio.sleep(1) - - # Simulate request - sample_request = { - "model": model, - "prompt": f"Sample question {sample_idx + 1} from {request.dataset}", - "temperature": request.temperature, - "max_tokens": request.max_tokens, - "top_p": request.top_p - } - - # Simulate response - sample_response = { - "response": f"Model response for sample {sample_idx + 1}", - "tokens_used": 45 + sample_idx * 10, - "latency_ms": 1200 + sample_idx * 200, - "cost_usd": model_info["cost_per_1k_tokens"] * (45 + sample_idx * 10) / 1000 - } - - await broadcast_to_websockets({ - "type": "request_response", - "evaluation_id": evaluation_id, - "model": model, - "sample_index": sample_idx + 1, - "request": sample_request, - "response": sample_response, - "message": f"๐Ÿ“ Sample {sample_idx + 1}/{request.num_samples}: {sample_response['latency_ms']}ms, {sample_response['tokens_used']} tokens" - }) - # Generate realistic results - import random - random.seed(hash(model + request.dataset)) # Consistent results + # Step 4: Calculate metrics + current_step += 1 + await send_websocket_message(evaluation_id, { + "type": "progress", + "progress": (current_step / total_steps) * 100, + "current_step": f"Calculating metrics for {model_name}" + }) + + await send_websocket_message(evaluation_id, { + "type": "log", + "timestamp": datetime.now().isoformat(), + "level": "INFO", + "message": f"๐Ÿ“Š Calculating metrics: {', '.join(request.metrics)}" + }) + + await asyncio.sleep(1) + + # Step 5: Generate results + current_step += 1 + await send_websocket_message(evaluation_id, { + "type": "progress", + "progress": (current_step / total_steps) * 100, + "current_step": f"Finalizing results for {model_name}" + }) - model_results = {} + # Generate realistic results + results = {} for metric in request.metrics: if metric == "accuracy": - score = 0.6 + random.random() * 0.35 # 60-95% + results[metric] = round(0.65 + (hash(model_id) % 30) / 100, 3) elif metric == "f1_score": - score = 0.55 + random.random() * 0.4 # 55-95% - elif metric in ["semantic_similarity", "bert_score"]: - score = 0.7 + random.random() * 0.25 # 70-95% - elif metric == "exact_match": - score = 0.4 + random.random() * 0.5 # 40-90% - else: - score = 0.5 + random.random() * 0.4 # 50-90% - - model_results[metric] = { - "score": round(score, 3), - "samples_evaluated": request.num_samples, - "metric_type": SUPPORTED_METRICS[metric]["category"] - } + results[metric] = round(0.60 + (hash(model_id) % 35) / 100, 3) + elif metric == "bleu": + results[metric] = round(0.25 + (hash(model_id) % 40) / 100, 3) + elif metric == "rouge": + results[metric] = round(0.30 + (hash(model_id) % 35) / 100, 3) + elif metric == "pass_at_k": + results[metric] = round(0.15 + (hash(model_id) % 50) / 100, 3) - results[model] = { - "model_info": model_info, - "metrics": model_results, - "total_tokens": request.num_samples * 50, - "total_cost": model_info["cost_per_1k_tokens"] * request.num_samples * 50 / 1000, - "avg_latency_ms": 1000 + random.randint(200, 800) - } + active_evaluations[evaluation_id]["results"][model_id] = results - await broadcast_to_websockets({ - "type": "model_complete", - "evaluation_id": evaluation_id, - "model": model, - "results": model_results, - "message": f"โœ… 
{model_info['name']} completed with {len(model_results)} metrics" + await send_websocket_message(evaluation_id, { + "type": "log", + "timestamp": datetime.now().isoformat(), + "level": "SUCCESS", + "message": f"โœ… {model_name} evaluation complete: {results}" }) + + await asyncio.sleep(1) - # Final processing - await asyncio.sleep(1) - await broadcast_to_websockets({ - "type": "evaluation_progress", - "evaluation_id": evaluation_id, - "progress": 95, - "message": f"[{datetime.now().strftime('%H:%M:%S')}] ๐Ÿ“Š Generating evaluation report" - }) - - await asyncio.sleep(2) - - # Complete evaluation + # Finalize evaluation active_evaluations[evaluation_id]["status"] = "completed" - active_evaluations[evaluation_id]["end_time"] = datetime.now().isoformat() - active_evaluations[evaluation_id]["results"] = results - - # Calculate summary statistics - total_cost = sum(r["total_cost"] for r in results.values()) - total_tokens = sum(r["total_tokens"] for r in results.values()) - avg_latency = sum(r["avg_latency_ms"] for r in results.values()) / len(results) + active_evaluations[evaluation_id]["progress"] = 100 + active_evaluations[evaluation_id]["end_time"] = datetime.now() - summary = { - "models_evaluated": len(request.models), - "samples_per_model": request.num_samples, - "total_samples": len(request.models) * request.num_samples, - "total_cost_usd": round(total_cost, 4), - "total_tokens": total_tokens, - "avg_latency_ms": round(avg_latency, 0), - "dataset": request.dataset, - "metrics": request.metrics - } - - await broadcast_to_websockets({ - "type": "evaluation_complete", - "evaluation_id": evaluation_id, - "progress": 100, - "results": results, - "summary": summary, - "message": f"๐ŸŽ‰ Evaluation complete! {len(request.models)} models evaluated on {request.num_samples} samples" + await send_websocket_message(evaluation_id, { + "type": "complete", + "results": active_evaluations[evaluation_id]["results"], + "message": "๐ŸŽ‰ Evaluation completed successfully!" }) - logger.info(f"NovaEval evaluation {evaluation_id} completed successfully") + await send_websocket_message(evaluation_id, { + "type": "log", + "timestamp": datetime.now().isoformat(), + "level": "SUCCESS", + "message": "๐ŸŽฏ All evaluations completed successfully!" + }) except Exception as e: - logger.error(f"Error in evaluation {evaluation_id}: {str(e)}") + logger.error(f"Evaluation failed: {e}") active_evaluations[evaluation_id]["status"] = "failed" active_evaluations[evaluation_id]["error"] = str(e) - await broadcast_to_websockets({ - "type": "evaluation_error", - "evaluation_id": evaluation_id, - "error": str(e), + await send_websocket_message(evaluation_id, { + "type": "error", "message": f"โŒ Evaluation failed: {str(e)}" }) +# API Endpoints @app.get("/", response_class=HTMLResponse) async def get_homepage(): - """Serve the comprehensive NovaEval interface""" - return HTMLResponse(content=f""" + """Serve the main application interface""" + return """ - NovaEval by Noveum.ai - Advanced AI Model Evaluation Platform + NovaEval by Noveum.ai - Advanced AI Model Evaluation - - + -
[HTML/CSS/JS hunk of get_homepage(): markup not recoverable from extraction; only the visible interface text survives.]
Removed UI text (old interface): 🧪 header logo; "NovaEval"; "Advanced AI Model Evaluation Platform"; "⚡ Powered by Noveum.ai"; "Visit Noveum.ai →"; feature cards "🤖 Latest LLMs" (Evaluate cutting-edge language models from OpenAI, Anthropic, AWS Bedrock, and Noveum.ai; GPT-4o, Claude 3.5 Sonnet; Real-time model search; Cost and performance metrics), "📊 Comprehensive Datasets" (Test models on academic benchmarks, code generation, and custom datasets; MMLU, HumanEval, HellaSwag; Custom dataset upload; Configurable sample sizes), "⚡ Advanced Analytics" (Real-time evaluation logs, detailed metrics, and interactive visualizations; Live request/response logs; Multiple scoring metrics; Export results (JSON, CSV)); "🚀 Start New Evaluation"; "Powered by NovaEval v0.3.3"; "Select Models (max 5)"; "Selected Models:".
Added UI text (new interface): "NovaEval"; "by Noveum.ai"; "Advanced AI Model Evaluation Platform"; "Powered by Hugging Face Models"; "Select Models"; "0 models selected".
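A minimal client sketch (not part of this diff) that exercises the EvaluationRequest/EvaluationResponse models and the WebSocket log stream defined above. The paths /api/evaluate and /ws/{evaluation_id}, the default port 7860, and the websockets client dependency are assumptions for illustration; the actual route definitions fall outside the hunks shown here.

# client_sketch.py -- hypothetical client for the NovaEval Space API.
# Endpoint paths, port, and the `websockets` dependency are assumptions;
# the request payload mirrors the EvaluationRequest model in this diff.
import asyncio
import json

import httpx
import websockets  # assumed client-side dependency, not pinned in this diff


async def run_demo(base_url: str = "http://localhost:7860") -> None:
    # Fields match EvaluationRequest: models, dataset, metrics, sample_size,
    # temperature, max_tokens, top_p. Model/dataset/metric ids come from the
    # HF_MODELS, EVALUATION_DATASETS, and EVALUATION_METRICS tables above.
    payload = {
        "models": ["Qwen/Qwen2.5-7B", "mistralai/Mistral-7B-v0.1"],
        "dataset": "cais/mmlu",
        "metrics": ["accuracy", "f1_score"],
        "sample_size": 50,
        "temperature": 0.7,
        "max_tokens": 512,
        "top_p": 0.9,
    }

    # Assumed endpoint; expected to return an EvaluationResponse
    # (evaluation_id, status, message).
    async with httpx.AsyncClient() as client:
        resp = await client.post(f"{base_url}/api/evaluate", json=payload)
        resp.raise_for_status()
        evaluation_id = resp.json()["evaluation_id"]

    # Assumed WebSocket path keyed by evaluation id, matching the
    # websocket_connections dict used by send_websocket_message().
    ws_url = base_url.replace("http", "ws", 1) + f"/ws/{evaluation_id}"
    async with websockets.connect(ws_url) as ws:
        async for raw in ws:
            msg = json.loads(raw)
            # Message types emitted by simulate_evaluation():
            # "log", "progress", "complete", "error".
            print(msg.get("type"), msg.get("message", msg.get("progress")))
            if msg.get("type") in {"complete", "error"}:
                break


if __name__ == "__main__":
    asyncio.run(run_demo())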