diff --git "a/app.py" "b/app.py"
--- "a/app.py"
+++ "b/app.py"
@@ -1,6 +1,6 @@
"""
-Comprehensive NovaEval Space by Noveum.ai
-Advanced AI Model Evaluation Platform with Real NovaEval Integration
+Advanced NovaEval Space by Noveum.ai
+Comprehensive AI Model Evaluation Platform with Hugging Face Models
"""
import asyncio
@@ -18,22 +18,20 @@ from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import httpx
+import traceback
-# Configure logging
+# Configure logging to stdout only (no file logging to avoid permission issues)
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
- handlers=[
- logging.StreamHandler(sys.stdout),
- logging.FileHandler('novaeval.log')
- ]
+ handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)
app = FastAPI(
title="NovaEval by Noveum.ai",
- description="Advanced AI Model Evaluation Platform",
- version="1.0.0"
+ description="Advanced AI Model Evaluation Platform with Hugging Face Models",
+ version="2.0.0"
)
app.add_middleware(
@@ -48,14 +46,11 @@ app.add_middleware(
class EvaluationRequest(BaseModel):
models: List[str]
dataset: str
- dataset_subset: Optional[str] = None
metrics: List[str]
- num_samples: int = 10
+ sample_size: int = 50
temperature: float = 0.7
- max_tokens: int = 150
- top_p: float = 1.0
- frequency_penalty: float = 0.0
- presence_penalty: float = 0.0
+ max_tokens: int = 512
+ top_p: float = 0.9
class EvaluationResponse(BaseModel):
evaluation_id: str
@@ -63,810 +58,593 @@ class EvaluationResponse(BaseModel):
message: str
# Global state
-active_evaluations: Dict[str, Dict] = {}
-websocket_connections: List[WebSocket] = []
+active_evaluations = {}
+websocket_connections = {}
-# NovaEval Compatible Models (LLMs only)
-SUPPORTED_MODELS = {
- "openai": {
- "gpt-4o": {
- "name": "GPT-4o",
- "provider": "OpenAI",
- "description": "Latest and most capable GPT-4 model",
- "context_length": 128000,
- "cost_per_1k_tokens": 0.005,
- "capabilities": ["text", "reasoning", "analysis"]
+# Hugging Face Models Configuration
+HF_MODELS = {
+ "small": [
+ {
+ "id": "google/flan-t5-large",
+ "name": "FLAN-T5 Large",
+ "size": "0.8B",
+ "description": "Best pretrained model around 1B parameters",
+ "capabilities": ["text-generation", "reasoning", "qa"],
+ "cost_per_1k": 0.0,
+ "provider": "Google"
+ },
+ {
+ "id": "Qwen/Qwen2.5-3B",
+ "name": "Qwen 2.5 3B",
+ "size": "3B",
+ "description": "Best pretrained model around 3B parameters",
+ "capabilities": ["text-generation", "reasoning", "multilingual"],
+ "cost_per_1k": 0.0,
+ "provider": "Alibaba"
},
- "gpt-4o-mini": {
- "name": "GPT-4o Mini",
- "provider": "OpenAI",
- "description": "Cost-effective GPT-4 model",
- "context_length": 128000,
- "cost_per_1k_tokens": 0.00015,
- "capabilities": ["text", "reasoning", "analysis"]
+ {
+ "id": "google/gemma-2b",
+ "name": "Gemma 2B",
+ "size": "2B",
+ "description": "Efficient small model for general tasks",
+ "capabilities": ["text-generation", "reasoning"],
+ "cost_per_1k": 0.0,
+ "provider": "Google"
+ }
+ ],
+ "medium": [
+ {
+ "id": "Qwen/Qwen2.5-7B",
+ "name": "Qwen 2.5 7B",
+ "size": "7B",
+ "description": "Best pretrained model around 7B parameters",
+ "capabilities": ["text-generation", "reasoning", "analysis"],
+ "cost_per_1k": 0.0,
+ "provider": "Alibaba"
},
- "gpt-4-turbo": {
- "name": "GPT-4 Turbo",
- "provider": "OpenAI",
- "description": "High-performance GPT-4 variant",
- "context_length": 128000,
- "cost_per_1k_tokens": 0.01,
- "capabilities": ["text", "reasoning", "analysis"]
+ {
+ "id": "mistralai/Mistral-7B-v0.1",
+ "name": "Mistral 7B",
+ "size": "7B",
+ "description": "Strong general purpose model",
+ "capabilities": ["text-generation", "reasoning", "analysis"],
+ "cost_per_1k": 0.0,
+ "provider": "Mistral AI"
},
- "gpt-4": {
- "name": "GPT-4",
- "provider": "OpenAI",
- "description": "Original GPT-4 model",
- "context_length": 8192,
- "cost_per_1k_tokens": 0.03,
- "capabilities": ["text", "reasoning", "analysis"]
+ {
+ "id": "microsoft/DialoGPT-medium",
+ "name": "DialoGPT Medium",
+ "size": "345M",
+ "description": "Conversational AI specialist",
+ "capabilities": ["conversation", "dialogue"],
+ "cost_per_1k": 0.0,
+ "provider": "Microsoft"
},
- "gpt-3.5-turbo": {
- "name": "GPT-3.5 Turbo",
- "provider": "OpenAI",
- "description": "Fast and efficient model",
- "context_length": 16385,
- "cost_per_1k_tokens": 0.0015,
- "capabilities": ["text", "conversation"]
+ {
+ "id": "codellama/CodeLlama-7b-Python-hf",
+ "name": "CodeLlama 7B Python",
+ "size": "7B",
+ "description": "Code generation specialist",
+ "capabilities": ["code-generation", "python"],
+ "cost_per_1k": 0.0,
+ "provider": "Meta"
}
- },
- "anthropic": {
- "claude-3-5-sonnet": {
- "name": "Claude 3.5 Sonnet",
- "provider": "Anthropic",
- "description": "Latest Claude model with enhanced capabilities",
- "context_length": 200000,
- "cost_per_1k_tokens": 0.003,
- "capabilities": ["text", "reasoning", "analysis", "coding"]
+ ],
+ "large": [
+ {
+ "id": "Qwen/Qwen2.5-14B",
+ "name": "Qwen 2.5 14B",
+ "size": "14B",
+ "description": "Best pretrained model around 14B parameters",
+ "capabilities": ["text-generation", "reasoning", "analysis", "complex-tasks"],
+ "cost_per_1k": 0.0,
+ "provider": "Alibaba"
},
- "claude-3-opus": {
- "name": "Claude 3 Opus",
- "provider": "Anthropic",
- "description": "Most capable Claude 3 model",
- "context_length": 200000,
- "cost_per_1k_tokens": 0.015,
- "capabilities": ["text", "reasoning", "analysis", "coding"]
+ {
+ "id": "Qwen/Qwen2.5-32B",
+ "name": "Qwen 2.5 32B",
+ "size": "32B",
+ "description": "Best pretrained model around 32B parameters",
+ "capabilities": ["text-generation", "reasoning", "analysis", "complex-tasks"],
+ "cost_per_1k": 0.0,
+ "provider": "Alibaba"
},
- "claude-3-sonnet": {
- "name": "Claude 3 Sonnet",
- "provider": "Anthropic",
- "description": "Balanced Claude 3 model",
- "context_length": 200000,
- "cost_per_1k_tokens": 0.003,
- "capabilities": ["text", "reasoning", "analysis"]
+ {
+ "id": "Qwen/Qwen2.5-72B",
+ "name": "Qwen 2.5 72B",
+ "size": "72B",
+ "description": "Best pretrained model around 72B parameters",
+ "capabilities": ["text-generation", "reasoning", "analysis", "complex-tasks"],
+ "cost_per_1k": 0.0,
+ "provider": "Alibaba"
+ }
+ ]
+}
+
+# Evaluation Datasets Configuration
+EVALUATION_DATASETS = {
+ "reasoning": [
+ {
+ "id": "Rowan/hellaswag",
+ "name": "HellaSwag",
+ "description": "Commonsense reasoning benchmark",
+ "samples": 60000,
+ "task_type": "multiple_choice",
+ "difficulty": "medium"
+ },
+ {
+ "id": "tau/commonsense_qa",
+ "name": "CommonsenseQA",
+ "description": "Commonsense reasoning questions",
+ "samples": 12100,
+ "task_type": "multiple_choice",
+ "difficulty": "medium"
},
- "claude-3-haiku": {
- "name": "Claude 3 Haiku",
- "provider": "Anthropic",
- "description": "Fast and efficient Claude 3 model",
- "context_length": 200000,
- "cost_per_1k_tokens": 0.00025,
- "capabilities": ["text", "conversation"]
+ {
+ "id": "allenai/ai2_arc",
+ "name": "ARC (AI2 Reasoning Challenge)",
+ "description": "Science questions requiring reasoning",
+ "samples": 7790,
+ "task_type": "multiple_choice",
+ "difficulty": "hard"
}
- },
- "aws_bedrock": {
- "amazon-titan-text": {
- "name": "Amazon Titan Text",
- "provider": "AWS Bedrock",
- "description": "Amazon's foundation model for text",
- "context_length": 8000,
- "cost_per_1k_tokens": 0.0008,
- "capabilities": ["text", "generation"]
+ ],
+ "knowledge": [
+ {
+ "id": "cais/mmlu",
+ "name": "MMLU",
+ "description": "Massive Multitask Language Understanding",
+ "samples": 231000,
+ "task_type": "multiple_choice",
+ "difficulty": "hard"
},
- "cohere-command": {
- "name": "Cohere Command",
- "provider": "AWS Bedrock",
- "description": "Cohere's command model via Bedrock",
- "context_length": 4096,
- "cost_per_1k_tokens": 0.0015,
- "capabilities": ["text", "conversation"]
+ {
+ "id": "google/boolq",
+ "name": "BoolQ",
+ "description": "Boolean questions requiring reading comprehension",
+ "samples": 12700,
+ "task_type": "yes_no",
+ "difficulty": "medium"
}
- },
- "noveum": {
- "noveum-gateway": {
- "name": "Noveum AI Gateway",
- "provider": "Noveum.ai",
- "description": "Access to Noveum's curated model collection",
- "context_length": "Variable",
- "cost_per_1k_tokens": "Variable",
- "capabilities": ["text", "reasoning", "analysis", "custom"]
+ ],
+ "math": [
+ {
+ "id": "openai/gsm8k",
+ "name": "GSM8K",
+ "description": "Grade school math word problems",
+ "samples": 17600,
+ "task_type": "generation",
+ "difficulty": "medium"
+ },
+ {
+ "id": "deepmind/aqua_rat",
+ "name": "AQUA-RAT",
+ "description": "Algebraic reasoning problems",
+ "samples": 196000,
+ "task_type": "multiple_choice",
+ "difficulty": "hard"
}
- }
-}
-
-# NovaEval Compatible Datasets
-SUPPORTED_DATASETS = {
- "mmlu": {
- "name": "MMLU",
- "full_name": "Massive Multitask Language Understanding",
- "description": "57-subject benchmark covering elementary mathematics to advanced professional topics",
- "type": "multiple_choice",
- "subsets": [
- "abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge",
- "college_biology", "college_chemistry", "college_computer_science", "college_mathematics",
- "college_medicine", "college_physics", "computer_security", "conceptual_physics",
- "econometrics", "electrical_engineering", "elementary_mathematics", "formal_logic",
- "global_facts", "high_school_biology", "high_school_chemistry", "high_school_computer_science",
- "high_school_european_history", "high_school_geography", "high_school_government_and_politics",
- "high_school_macroeconomics", "high_school_mathematics", "high_school_microeconomics",
- "high_school_physics", "high_school_psychology", "high_school_statistics", "high_school_us_history",
- "high_school_world_history", "human_aging", "human_sexuality", "international_law",
- "jurisprudence", "logical_fallacies", "machine_learning", "management", "marketing",
- "medical_genetics", "miscellaneous", "moral_disputes", "moral_scenarios", "nutrition",
- "philosophy", "prehistory", "professional_accounting", "professional_law", "professional_medicine",
- "professional_psychology", "public_relations", "security_studies", "sociology", "us_foreign_policy",
- "virology", "world_religions"
- ],
- "sample_count": 14042
- },
- "humaneval": {
- "name": "HumanEval",
- "full_name": "Human Eval Code Generation",
- "description": "Programming problems for evaluating code generation capabilities",
- "type": "code_generation",
- "subsets": ["python", "javascript", "java", "cpp"],
- "sample_count": 164
- },
- "mbpp": {
- "name": "MBPP",
- "full_name": "Mostly Basic Python Problems",
- "description": "Python programming problems for code generation evaluation",
- "type": "code_generation",
- "subsets": ["basic", "intermediate", "advanced"],
- "sample_count": 974
- },
- "hellaswag": {
- "name": "HellaSwag",
- "full_name": "HellaSwag Commonsense Reasoning",
- "description": "Commonsense reasoning about everyday situations",
- "type": "multiple_choice",
- "subsets": ["validation", "test"],
- "sample_count": 10042
- },
- "arc": {
- "name": "ARC",
- "full_name": "AI2 Reasoning Challenge",
- "description": "Science questions requiring reasoning",
- "type": "multiple_choice",
- "subsets": ["easy", "challenge"],
- "sample_count": 7787
- },
- "truthfulqa": {
- "name": "TruthfulQA",
- "full_name": "TruthfulQA",
- "description": "Questions designed to test truthfulness and avoid falsehoods",
- "type": "multiple_choice",
- "subsets": ["mc1", "mc2", "generation"],
- "sample_count": 817
- },
- "custom": {
- "name": "Custom Dataset",
- "full_name": "Upload Custom Dataset",
- "description": "Upload your own JSON or CSV dataset for evaluation",
- "type": "custom",
- "subsets": ["json", "csv"],
- "sample_count": "Variable"
- }
+ ],
+ "code": [
+ {
+ "id": "openai/openai_humaneval",
+ "name": "HumanEval",
+ "description": "Python code generation benchmark",
+ "samples": 164,
+ "task_type": "code_generation",
+ "difficulty": "hard"
+ },
+ {
+ "id": "google-research-datasets/mbpp",
+ "name": "MBPP",
+ "description": "Mostly Basic Python Problems",
+ "samples": 1400,
+ "task_type": "code_generation",
+ "difficulty": "medium"
+ }
+ ],
+ "language": [
+ {
+ "id": "stanfordnlp/imdb",
+ "name": "IMDB Reviews",
+ "description": "Movie review sentiment analysis",
+ "samples": 100000,
+ "task_type": "classification",
+ "difficulty": "easy"
+ },
+ {
+ "id": "abisee/cnn_dailymail",
+ "name": "CNN/DailyMail",
+ "description": "News article summarization",
+ "samples": 936000,
+ "task_type": "summarization",
+ "difficulty": "medium"
+ }
+ ]
}
-# NovaEval Compatible Metrics/Scorers
-SUPPORTED_METRICS = {
- "accuracy": {
+# Evaluation Metrics
+EVALUATION_METRICS = [
+ {
+ "id": "accuracy",
"name": "Accuracy",
- "category": "Accuracy-Based",
- "description": "Classification accuracy for multiple choice questions",
- "best_for": ["multiple_choice", "classification"]
+ "description": "Percentage of correct predictions",
+ "applicable_tasks": ["multiple_choice", "yes_no", "classification"]
},
- "exact_match": {
- "name": "Exact Match",
- "category": "Accuracy-Based",
- "description": "Exact string matching between prediction and reference",
- "best_for": ["short_answer", "factual"]
- },
- "f1_score": {
+ {
+ "id": "f1_score",
"name": "F1 Score",
- "category": "Accuracy-Based",
- "description": "F1 score for classification tasks",
- "best_for": ["classification", "binary_tasks"]
- },
- "semantic_similarity": {
- "name": "Semantic Similarity",
- "category": "Semantic-Based",
- "description": "Embedding-based similarity scoring",
- "best_for": ["text_generation", "paraphrasing"]
+ "description": "Harmonic mean of precision and recall",
+ "applicable_tasks": ["classification", "multiple_choice"]
},
- "bert_score": {
- "name": "BERT Score",
- "category": "Semantic-Based",
- "description": "BERT-based semantic evaluation",
- "best_for": ["text_generation", "summarization"]
+ {
+ "id": "bleu",
+ "name": "BLEU Score",
+ "description": "Bilingual Evaluation Understudy for text generation",
+ "applicable_tasks": ["generation", "summarization", "code_generation"]
},
- "rouge_score": {
+ {
+ "id": "rouge",
"name": "ROUGE Score",
- "category": "Semantic-Based",
- "description": "ROUGE metrics for text generation",
- "best_for": ["summarization", "text_generation"]
- },
- "code_execution": {
- "name": "Code Execution",
- "category": "Code-Specific",
- "description": "Execute and validate code outputs",
- "best_for": ["code_generation", "programming"]
+ "description": "Recall-Oriented Understudy for Gisting Evaluation",
+ "applicable_tasks": ["summarization", "generation"]
},
- "syntax_checker": {
- "name": "Syntax Checker",
- "category": "Code-Specific",
- "description": "Validate code syntax",
- "best_for": ["code_generation", "programming"]
- },
- "llm_judge": {
- "name": "LLM Judge",
- "category": "Custom",
- "description": "Use another LLM as a judge",
- "best_for": ["open_ended", "creative_tasks"]
+ {
+ "id": "pass_at_k",
+ "name": "Pass@K",
+ "description": "Percentage of problems solved correctly",
+ "applicable_tasks": ["code_generation"]
}
-}
+]
-async def broadcast_to_websockets(message: dict):
- """Broadcast message to all connected websockets"""
- if websocket_connections:
- disconnected = []
- for websocket in websocket_connections:
- try:
- await websocket.send_text(json.dumps(message))
- except:
- disconnected.append(websocket)
-
- # Remove disconnected websockets
- for ws in disconnected:
- websocket_connections.remove(ws)
+async def send_websocket_message(evaluation_id: str, message: dict):
+ """Send message to WebSocket connection if exists"""
+ if evaluation_id in websocket_connections:
+ try:
+ await websocket_connections[evaluation_id].send_text(json.dumps(message))
+ except Exception as e:
+ logger.error(f"Failed to send WebSocket message: {e}")
-async def simulate_novaeval_evaluation(evaluation_id: str, request: EvaluationRequest):
- """Simulate NovaEval evaluation with detailed logging"""
+async def simulate_evaluation(evaluation_id: str, request: EvaluationRequest):
+ """Simulate a real evaluation process with detailed logging"""
try:
- logger.info(f"Starting NovaEval evaluation {evaluation_id}")
+ # Initialize evaluation
+ active_evaluations[evaluation_id] = {
+ "status": "running",
+ "progress": 0,
+ "current_step": "Initializing",
+ "results": {},
+ "logs": [],
+ "start_time": datetime.now()
+ }
- # Update status
- active_evaluations[evaluation_id]["status"] = "running"
- active_evaluations[evaluation_id]["start_time"] = datetime.now().isoformat()
+ total_steps = len(request.models) * 5 # 5 steps per model
+ current_step = 0
- # Broadcast start
- await broadcast_to_websockets({
- "type": "evaluation_start",
- "evaluation_id": evaluation_id,
+ await send_websocket_message(evaluation_id, {
+ "type": "log",
+ "timestamp": datetime.now().isoformat(),
+ "level": "INFO",
"message": f"๐ Starting NovaEval evaluation with {len(request.models)} models"
})
- # Simulate evaluation steps with detailed logging
- steps = [
- "๐ง Initializing NovaEval framework",
- "๐ Loading dataset configuration",
- "๐ค Preparing model interfaces",
- "๐ Validating evaluation parameters",
- "๐ฅ Loading evaluation samples",
- "โ๏ธ Setting up scorers and metrics",
- "๐ Starting model evaluations"
- ]
+ await send_websocket_message(evaluation_id, {
+ "type": "log",
+ "timestamp": datetime.now().isoformat(),
+ "level": "INFO",
+ "message": f"๐ Dataset: {request.dataset} | Sample size: {request.sample_size}"
+ })
- for i, step in enumerate(steps):
- await asyncio.sleep(1)
- progress = int((i + 1) / len(steps) * 30) # 30% for setup
+ await send_websocket_message(evaluation_id, {
+ "type": "log",
+ "timestamp": datetime.now().isoformat(),
+ "level": "INFO",
+ "message": f"๐ Metrics: {', '.join(request.metrics)}"
+ })
+
+ # Process each model
+ for model_id in request.models:
+ model_name = model_id.split('/')[-1]
- log_message = f"[{datetime.now().strftime('%H:%M:%S')}] {step}"
- logger.info(log_message)
+ # Step 1: Load model
+ current_step += 1
+ await send_websocket_message(evaluation_id, {
+ "type": "progress",
+ "progress": (current_step / total_steps) * 100,
+ "current_step": f"Loading {model_name}"
+ })
- await broadcast_to_websockets({
- "type": "evaluation_progress",
- "evaluation_id": evaluation_id,
- "progress": progress,
- "message": log_message,
- "step": step
+ await send_websocket_message(evaluation_id, {
+ "type": "log",
+ "timestamp": datetime.now().isoformat(),
+ "level": "INFO",
+ "message": f"๐ค Loading model: {model_id}"
})
-
- # Simulate model evaluations
- results = {}
- for model_idx, model in enumerate(request.models):
- model_info = None
- for provider, models in SUPPORTED_MODELS.items():
- if model in models:
- model_info = models[model]
- break
- if not model_info:
- continue
-
- await broadcast_to_websockets({
- "type": "model_start",
- "evaluation_id": evaluation_id,
- "model": model,
- "message": f"๐ค Starting evaluation for {model_info['name']}"
+ await asyncio.sleep(2) # Simulate model loading time
+
+ # Step 2: Prepare dataset
+ current_step += 1
+ await send_websocket_message(evaluation_id, {
+ "type": "progress",
+ "progress": (current_step / total_steps) * 100,
+ "current_step": f"Preparing dataset for {model_name}"
})
- # Simulate model loading and evaluation
- model_steps = [
- f"๐ฅ Loading {model_info['name']} ({model_info['provider']})",
- f"๐ง Configuring model parameters (temp={request.temperature}, max_tokens={request.max_tokens})",
- f"๐ Running evaluation on {request.num_samples} samples",
- f"๐ Computing {', '.join(request.metrics)} metrics",
- f"โ
{model_info['name']} evaluation complete"
- ]
+ await send_websocket_message(evaluation_id, {
+ "type": "log",
+ "timestamp": datetime.now().isoformat(),
+ "level": "INFO",
+ "message": f"๐ฅ Loading dataset: {request.dataset}"
+ })
- for step_idx, step in enumerate(model_steps):
- await asyncio.sleep(2)
- base_progress = 30 + (model_idx * 60 // len(request.models))
- step_progress = base_progress + (step_idx + 1) * (60 // len(request.models)) // len(model_steps)
-
- log_message = f"[{datetime.now().strftime('%H:%M:%S')}] {step}"
- logger.info(log_message)
-
- await broadcast_to_websockets({
- "type": "evaluation_progress",
- "evaluation_id": evaluation_id,
- "progress": step_progress,
- "message": log_message,
- "model": model
+ await asyncio.sleep(1)
+
+ # Step 3: Run evaluation
+ current_step += 1
+ await send_websocket_message(evaluation_id, {
+ "type": "progress",
+ "progress": (current_step / total_steps) * 100,
+ "current_step": f"Evaluating {model_name}"
+ })
+
+ await send_websocket_message(evaluation_id, {
+ "type": "log",
+ "timestamp": datetime.now().isoformat(),
+ "level": "INFO",
+ "message": f"๐งช Running evaluation on {request.sample_size} samples"
+ })
+
+ # Simulate processing samples
+ for i in range(0, request.sample_size, 10):
+ await asyncio.sleep(0.5)
+ processed = min(i + 10, request.sample_size)
+ await send_websocket_message(evaluation_id, {
+ "type": "log",
+ "timestamp": datetime.now().isoformat(),
+ "level": "DEBUG",
+ "message": f"๐ Processed {processed}/{request.sample_size} samples"
})
-
- # Simulate detailed request/response logging for the evaluation step
- if "Running evaluation" in step:
- for sample_idx in range(min(3, request.num_samples)): # Show first 3 samples
- await asyncio.sleep(1)
-
- # Simulate request
- sample_request = {
- "model": model,
- "prompt": f"Sample question {sample_idx + 1} from {request.dataset}",
- "temperature": request.temperature,
- "max_tokens": request.max_tokens,
- "top_p": request.top_p
- }
-
- # Simulate response
- sample_response = {
- "response": f"Model response for sample {sample_idx + 1}",
- "tokens_used": 45 + sample_idx * 10,
- "latency_ms": 1200 + sample_idx * 200,
- "cost_usd": model_info["cost_per_1k_tokens"] * (45 + sample_idx * 10) / 1000
- }
-
- await broadcast_to_websockets({
- "type": "request_response",
- "evaluation_id": evaluation_id,
- "model": model,
- "sample_index": sample_idx + 1,
- "request": sample_request,
- "response": sample_response,
- "message": f"๐ Sample {sample_idx + 1}/{request.num_samples}: {sample_response['latency_ms']}ms, {sample_response['tokens_used']} tokens"
- })
- # Generate realistic results
- import random
- random.seed(hash(model + request.dataset)) # Consistent results
+ # Step 4: Calculate metrics
+ current_step += 1
+ await send_websocket_message(evaluation_id, {
+ "type": "progress",
+ "progress": (current_step / total_steps) * 100,
+ "current_step": f"Calculating metrics for {model_name}"
+ })
+
+ await send_websocket_message(evaluation_id, {
+ "type": "log",
+ "timestamp": datetime.now().isoformat(),
+ "level": "INFO",
+ "message": f"๐ Calculating metrics: {', '.join(request.metrics)}"
+ })
+
+ await asyncio.sleep(1)
+
+ # Step 5: Generate results
+ current_step += 1
+ await send_websocket_message(evaluation_id, {
+ "type": "progress",
+ "progress": (current_step / total_steps) * 100,
+ "current_step": f"Finalizing results for {model_name}"
+ })
- model_results = {}
+ # Generate realistic results
+ results = {}
for metric in request.metrics:
if metric == "accuracy":
- score = 0.6 + random.random() * 0.35 # 60-95%
+ results[metric] = round(0.65 + (hash(model_id) % 30) / 100, 3)
elif metric == "f1_score":
- score = 0.55 + random.random() * 0.4 # 55-95%
- elif metric in ["semantic_similarity", "bert_score"]:
- score = 0.7 + random.random() * 0.25 # 70-95%
- elif metric == "exact_match":
- score = 0.4 + random.random() * 0.5 # 40-90%
- else:
- score = 0.5 + random.random() * 0.4 # 50-90%
-
- model_results[metric] = {
- "score": round(score, 3),
- "samples_evaluated": request.num_samples,
- "metric_type": SUPPORTED_METRICS[metric]["category"]
- }
+ results[metric] = round(0.60 + (hash(model_id) % 35) / 100, 3)
+ elif metric == "bleu":
+ results[metric] = round(0.25 + (hash(model_id) % 40) / 100, 3)
+ elif metric == "rouge":
+ results[metric] = round(0.30 + (hash(model_id) % 35) / 100, 3)
+ elif metric == "pass_at_k":
+ results[metric] = round(0.15 + (hash(model_id) % 50) / 100, 3)
- results[model] = {
- "model_info": model_info,
- "metrics": model_results,
- "total_tokens": request.num_samples * 50,
- "total_cost": model_info["cost_per_1k_tokens"] * request.num_samples * 50 / 1000,
- "avg_latency_ms": 1000 + random.randint(200, 800)
- }
+ active_evaluations[evaluation_id]["results"][model_id] = results
- await broadcast_to_websockets({
- "type": "model_complete",
- "evaluation_id": evaluation_id,
- "model": model,
- "results": model_results,
- "message": f"โ
{model_info['name']} completed with {len(model_results)} metrics"
+ await send_websocket_message(evaluation_id, {
+ "type": "log",
+ "timestamp": datetime.now().isoformat(),
+ "level": "SUCCESS",
+ "message": f"โ
{model_name} evaluation complete: {results}"
})
+
+ await asyncio.sleep(1)
- # Final processing
- await asyncio.sleep(1)
- await broadcast_to_websockets({
- "type": "evaluation_progress",
- "evaluation_id": evaluation_id,
- "progress": 95,
- "message": f"[{datetime.now().strftime('%H:%M:%S')}] ๐ Generating evaluation report"
- })
-
- await asyncio.sleep(2)
-
- # Complete evaluation
+ # Finalize evaluation
active_evaluations[evaluation_id]["status"] = "completed"
- active_evaluations[evaluation_id]["end_time"] = datetime.now().isoformat()
- active_evaluations[evaluation_id]["results"] = results
-
- # Calculate summary statistics
- total_cost = sum(r["total_cost"] for r in results.values())
- total_tokens = sum(r["total_tokens"] for r in results.values())
- avg_latency = sum(r["avg_latency_ms"] for r in results.values()) / len(results)
+ active_evaluations[evaluation_id]["progress"] = 100
+ active_evaluations[evaluation_id]["end_time"] = datetime.now()
- summary = {
- "models_evaluated": len(request.models),
- "samples_per_model": request.num_samples,
- "total_samples": len(request.models) * request.num_samples,
- "total_cost_usd": round(total_cost, 4),
- "total_tokens": total_tokens,
- "avg_latency_ms": round(avg_latency, 0),
- "dataset": request.dataset,
- "metrics": request.metrics
- }
-
- await broadcast_to_websockets({
- "type": "evaluation_complete",
- "evaluation_id": evaluation_id,
- "progress": 100,
- "results": results,
- "summary": summary,
- "message": f"๐ Evaluation complete! {len(request.models)} models evaluated on {request.num_samples} samples"
+ await send_websocket_message(evaluation_id, {
+ "type": "complete",
+ "results": active_evaluations[evaluation_id]["results"],
+ "message": "๐ Evaluation completed successfully!"
})
- logger.info(f"NovaEval evaluation {evaluation_id} completed successfully")
+ await send_websocket_message(evaluation_id, {
+ "type": "log",
+ "timestamp": datetime.now().isoformat(),
+ "level": "SUCCESS",
+ "message": "๐ฏ All evaluations completed successfully!"
+ })
except Exception as e:
- logger.error(f"Error in evaluation {evaluation_id}: {str(e)}")
+ logger.error(f"Evaluation failed: {e}")
active_evaluations[evaluation_id]["status"] = "failed"
active_evaluations[evaluation_id]["error"] = str(e)
- await broadcast_to_websockets({
- "type": "evaluation_error",
- "evaluation_id": evaluation_id,
- "error": str(e),
+ await send_websocket_message(evaluation_id, {
+ "type": "error",
"message": f"โ Evaluation failed: {str(e)}"
})
+# API Endpoints
@app.get("/", response_class=HTMLResponse)
async def get_homepage():
- """Serve the comprehensive NovaEval interface"""
- return HTMLResponse(content=f"""
+ """Serve the main application interface"""
+ return """
- NovaEval by Noveum.ai - Advanced AI Model Evaluation Platform
+ NovaEval by Noveum.ai - Advanced AI Model Evaluation
-
-
+
-
-
+
+
-
-
🧪
+
+
+
+
-
NovaEval
-
Advanced AI Model Evaluation Platform
+
NovaEval
+
by Noveum.ai
-
- ⚡ Powered by Noveum.ai
-
-
- Visit Noveum.ai โ
-
+
Advanced AI Model Evaluation Platform
+
Powered by Hugging Face Models
-
-
-
-
-
-
🤖
-
Latest LLMs
-
Evaluate cutting-edge language models from OpenAI, Anthropic, AWS Bedrock, and Noveum.ai
-
-
-
- GPT-4o, Claude 3.5 Sonnet
-
-
-
- Real-time model search
-
-
-
- Cost and performance metrics
-
-
-
-
-
-
๐
-
Comprehensive Datasets
-
Test models on academic benchmarks, code generation, and custom datasets
-
-
-
- MMLU, HumanEval, HellaSwag
-
-
-
- Custom dataset upload
-
-
-
- Configurable sample sizes
-
-
-
-
-
-
⚡
-
Advanced Analytics
-
Real-time evaluation logs, detailed metrics, and interactive visualizations
-
-
-
- Live request/response logs
-
-
-
- Multiple scoring metrics
+
+
+
+
+
+
+
+
+
+
Select Models
-
-
- Export results (JSON, CSV)
+
+
+
-
-
-
-
-
-
-
-
🚀 Start New Evaluation
-
- Powered by NovaEval v0.3.3
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Select Models (max 5)
-
-
-
+
+
+
+
+
+
+
+
-
+
+
-
-
Selected Models:
-
-
-
+
+ 0 models selected
-
-
-
-
-
Choose Evaluation Dataset
+
+
+
+
+
Select Dataset
+
-
-
+
+
+
+
+
+
+
+
+
+
-
-
Select Subset:
-
+
+
+
-
-
-
-
-
Select Evaluation Metrics
-
-
-
+
+
+
+
+
Evaluation Configuration
-
-
-
-
-
-
Evaluation Configuration
-
-
+
+
@@ -876,142 +654,84 @@ async def get_homepage():
class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer">
0.0
- 0.7
+ 0.7
2.0
-
-
- 50
- 150
- 500
-
-
-
-
-
-
-
-
-
- 0.1
- 1.0
- 1.0
-
-
-
-
-
-
- 0.0
- 0.0
- 2.0
-
-
-
-
-
-
-
- 0.0
- 0.0
- 2.0
+ 128
+ 512
+ 2048
-
-
- 💡 Configuration Tips
-
- - • Lower temperature (0.0-0.3) for factual tasks, higher (0.7-1.0) for creative tasks
- - • Start with 10-20 samples for quick testing, use 50+ for reliable results
- - • Max tokens should match expected response length
-
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 📊 Evaluation Progress
-
- Initializing...
+
+
+
+
-
-
-
-
Progress
-
0%
+
+
+
+
+ Initializing...
+ 0%
+
+
+
-
-
+
+
+
+
Ready to start evaluation
-
-
-
-
- 📋 Live Evaluation Logs
-
-
-
+
+
+
+
+
+
Live Logs
-
-
- 🔍 Request/Response Details
-
-
-
+
+
Waiting for evaluation to start...
-
-
-
-
-
-
-
- 📊 Evaluation Results
-
-
-
+
+
-
-
-
@@ -1021,591 +741,429 @@ async def get_homepage():
// Global state
let selectedModels = [];
let selectedDataset = null;
- let selectedDatasetSubset = null;
let selectedMetrics = [];
- let currentTab = 'models';
- let evaluationResults = null;
let websocket = null;
+ let currentEvaluationId = null;
- // Data from backend
- const MODELS = {json.dumps(SUPPORTED_MODELS)};
- const DATASETS = {json.dumps(SUPPORTED_DATASETS)};
- const METRICS = {json.dumps(SUPPORTED_METRICS)};
+ // Models data
+ const models = """ + json.dumps(HF_MODELS) + """;
+ const datasets = """ + json.dumps(EVALUATION_DATASETS) + """;
+ const metrics = """ + json.dumps(EVALUATION_METRICS) + """;
// Initialize the application
- document.addEventListener('DOMContentLoaded', function() {{
- populateModels();
- populateDatasets();
- populateMetrics();
- setupSliders();
- setupSearch();
- }});
+ document.addEventListener('DOMContentLoaded', function() {
+ lucide.createIcons();
+ renderModels();
+ renderDatasets();
+ renderMetrics();
+ setupEventListeners();
+ });
- function populateModels() {{
- const grid = document.getElementById('models-grid');
- grid.innerHTML = '';
+ function setupEventListeners() {
+ // Sample size slider
+ document.getElementById('sampleSize').addEventListener('input', function() {
+ document.getElementById('sampleSizeValue').textContent = this.value;
+ });
- Object.entries(MODELS).forEach(([provider, models]) => {{
- Object.entries(models).forEach(([modelId, model]) => {{
- const card = document.createElement('div');
- card.className = 'model-card bg-gray-50 border-2 border-gray-200 rounded-lg p-4';
- card.dataset.provider = provider;
- card.dataset.modelId = modelId;
-
- card.innerHTML = `
-
-
${{model.name}}
- ${{model.provider}}
-
-
${{model.description}}
-
-
Context: ${{model.context_length}} tokens
-
Cost: $$${{model.cost_per_1k_tokens}}/1K tokens
-
- ${{model.capabilities.map(cap => `${{cap}}`).join('')}}
-
-
- `;
-
- card.addEventListener('click', () => toggleModel(modelId, model));
- grid.appendChild(card);
- }});
- }});
- }}
+ // Temperature slider
+ document.getElementById('temperature').addEventListener('input', function() {
+ document.getElementById('temperatureValue').textContent = this.value;
+ });
+
+ // Max tokens slider
+ document.getElementById('maxTokens').addEventListener('input', function() {
+ document.getElementById('maxTokensValue').textContent = this.value;
+ });
+
+ // Model search
+ document.getElementById('modelSearch').addEventListener('input', function() {
+ const searchTerm = this.value.toLowerCase();
+ filterModelsBySearch(searchTerm);
+ });
+ }
- function populateDatasets() {{
- const grid = document.getElementById('datasets-grid');
+ function renderModels() {
+ const grid = document.getElementById('modelGrid');
grid.innerHTML = '';
- Object.entries(DATASETS).forEach(([datasetId, dataset]) => {{
- const card = document.createElement('div');
- card.className = 'dataset-card bg-gray-50 border-2 border-gray-200 rounded-lg p-4';
- card.dataset.datasetId = datasetId;
-
- card.innerHTML = `
-
-
${{dataset.name}}
- ${{dataset.type}}
-
-
${{dataset.description}}
-
-
Samples: ${{dataset.sample_count}}
-
Subsets: ${{Array.isArray(dataset.subsets) ? dataset.subsets.length : 'N/A'}}
-
- `;
-
- card.addEventListener('click', () => selectDataset(datasetId, dataset));
- grid.appendChild(card);
- }});
- }}
+ Object.keys(models).forEach(category => {
+ models[category].forEach(model => {
+ const modelCard = createModelCard(model, category);
+ grid.appendChild(modelCard);
+ });
+ });
+ }
- function populateMetrics() {{
- const grid = document.getElementById('metrics-grid');
- grid.innerHTML = '';
+ function createModelCard(model, category) {
+ const div = document.createElement('div');
+ div.className = `model-card p-4 border rounded-lg cursor-pointer hover:shadow-md transition-all`;
+ div.dataset.category = category;
+ div.dataset.modelId = model.id;
- Object.entries(METRICS).forEach(([metricId, metric]) => {{
- const card = document.createElement('div');
- card.className = 'metric-card bg-gray-50 border-2 border-gray-200 rounded-lg p-4';
- card.dataset.metricId = metricId;
-
- card.innerHTML = `
-
-
${{metric.name}}
-
${{metric.category}}
+ div.innerHTML = `
+
+
+
${model.name}
+
${model.provider} โข ${model.size}
-
${{metric.description}}
-
- Best for: ${{metric.best_for.join(', ')}}
-
- `;
-
- card.addEventListener('click', () => toggleMetric(metricId, metric));
- grid.appendChild(card);
- }});
- }}
-
- function setupSliders() {{
- const sliders = ['num-samples', 'temperature', 'max-tokens', 'top-p', 'frequency-penalty', 'presence-penalty'];
+
${model.size}
+
+
${model.description}
+
+ ${model.capabilities.map(cap => `${cap}`).join('')}
+
+ `;
- sliders.forEach(sliderId => {{
- const slider = document.getElementById(sliderId);
- const valueDisplay = document.getElementById(sliderId + '-value');
-
- slider.addEventListener('input', function() {{
- valueDisplay.textContent = this.value;
- }});
- }});
- }}
+ div.addEventListener('click', () => toggleModelSelection(model.id, div));
+ return div;
+ }
- function setupSearch() {{
- const searchInput = document.getElementById('model-search');
- const providerFilter = document.getElementById('provider-filter');
-
- function filterModels() {{
- const searchTerm = searchInput.value.toLowerCase();
- const selectedProvider = providerFilter.value;
- const modelCards = document.querySelectorAll('.model-card');
-
- modelCards.forEach(card => {{
- const modelName = card.querySelector('h4').textContent.toLowerCase();
- const provider = card.dataset.provider;
-
- const matchesSearch = modelName.includes(searchTerm);
- const matchesProvider = !selectedProvider || provider === selectedProvider;
-
- card.style.display = matchesSearch && matchesProvider ? 'block' : 'none';
- }});
- }}
-
- searchInput.addEventListener('input', filterModels);
- providerFilter.addEventListener('change', filterModels);
- }}
+ function toggleModelSelection(modelId, element) {
+ if (selectedModels.includes(modelId)) {
+ selectedModels = selectedModels.filter(id => id !== modelId);
+ element.classList.remove('selected');
+ } else {
+ selectedModels.push(modelId);
+ element.classList.add('selected');
+ }
+ updateSelectedModelsCount();
+ }
- function toggleModel(modelId, model) {{
- const index = selectedModels.findIndex(m => m.id === modelId);
-
- if (index > -1) {{
- selectedModels.splice(index, 1);
- }} else if (selectedModels.length < 5) {{
- selectedModels.push({{id: modelId, ...model}});
- }} else {{
- alert('Maximum 5 models can be selected');
- return;
- }}
-
- updateModelSelection();
- }}
+ function updateSelectedModelsCount() {
+ document.getElementById('selectedModelsCount').textContent = selectedModels.length;
+ }
- function updateModelSelection() {{
- // Update visual selection
- document.querySelectorAll('.model-card').forEach(card => {{
- const modelId = card.dataset.modelId;
- if (selectedModels.some(m => m.id === modelId)) {{
- card.classList.add('selected');
- }} else {{
- card.classList.remove('selected');
- }}
- }});
+ function filterModels(category) {
+ // Update filter buttons
+ document.querySelectorAll('[id^="filter-"]').forEach(btn => {
+ btn.className = btn.className.replace('bg-purple-600 text-white', 'bg-gray-200 text-gray-700');
+ });
+ document.getElementById(`filter-${category}`).className =
+ document.getElementById(`filter-${category}`).className.replace('bg-gray-200 text-gray-700', 'bg-purple-600 text-white');
- // Update selected models list
- const list = document.getElementById('selected-models-list');
- list.innerHTML = selectedModels.map(model =>
- `
${{model.name}}`
- ).join('');
- }}
+ // Filter model cards
+ document.querySelectorAll('.model-card').forEach(card => {
+ if (category === 'all' || card.dataset.category === category) {
+ card.style.display = 'block';
+ } else {
+ card.style.display = 'none';
+ }
+ });
+ }
- function selectDataset(datasetId, dataset) {{
- selectedDataset = datasetId;
-
- // Update visual selection
- document.querySelectorAll('.dataset-card').forEach(card => {{
- if (card.dataset.datasetId === datasetId) {{
- card.classList.add('selected');
- }} else {{
- card.classList.remove('selected');
- }}
- }});
-
- // Show subsets if available
- if (dataset.subsets && dataset.subsets.length > 0) {{
- const subsetsDiv = document.getElementById('dataset-subsets');
- const select = document.getElementById('subset-select');
+ function filterModelsBySearch(searchTerm) {
+ document.querySelectorAll('.model-card').forEach(card => {
+ const modelName = card.querySelector('h3').textContent.toLowerCase();
+ const modelProvider = card.querySelector('p').textContent.toLowerCase();
- select.innerHTML = '
' +
- dataset.subsets.map(subset => `
`).join('');
-
- subsetsDiv.classList.remove('hidden');
-
- select.addEventListener('change', function() {{
- selectedDatasetSubset = this.value || null;
- }});
- }} else {{
- document.getElementById('dataset-subsets').classList.add('hidden');
- selectedDatasetSubset = null;
- }}
- }}
-
- function toggleMetric(metricId, metric) {{
- const index = selectedMetrics.findIndex(m => m.id === metricId);
-
- if (index > -1) {{
- selectedMetrics.splice(index, 1);
- }} else {{
- selectedMetrics.push({{id: metricId, ...metric}});
- }}
-
- updateMetricSelection();
- }}
+ if (modelName.includes(searchTerm) || modelProvider.includes(searchTerm)) {
+ card.style.display = 'block';
+ } else {
+ card.style.display = 'none';
+ }
+ });
+ }
- function updateMetricSelection() {{
- // Update visual selection
- document.querySelectorAll('.metric-card').forEach(card => {{
- const metricId = card.dataset.metricId;
- if (selectedMetrics.some(m => m.id === metricId)) {{
- card.classList.add('selected');
- }} else {{
- card.classList.remove('selected');
- }}
- }});
+ function renderDatasets() {
+ const grid = document.getElementById('datasetGrid');
+ grid.innerHTML = '';
- // Update selected metrics list
- const list = document.getElementById('selected-metrics-list');
- list.innerHTML = selectedMetrics.map(metric =>
- `
${{metric.name}}`
- ).join('');
- }}
+ Object.keys(datasets).forEach(category => {
+ datasets[category].forEach(dataset => {
+ const datasetCard = createDatasetCard(dataset, category);
+ grid.appendChild(datasetCard);
+ });
+ });
+ }
- function switchTab(tabName) {{
- currentTab = tabName;
-
- // Hide all tabs
- document.querySelectorAll('.tab-content').forEach(tab => {{
- tab.classList.add('hidden');
- }});
-
- // Show selected tab
- document.getElementById(tabName + '-tab').classList.remove('hidden');
+ function createDatasetCard(dataset, category) {
+ const div = document.createElement('div');
+ div.className = `dataset-card p-3 border rounded-lg cursor-pointer hover:shadow-md transition-all`;
+ div.dataset.category = category;
+ div.dataset.datasetId = dataset.id;
- // Update tab buttons
- document.querySelectorAll('.tab-button').forEach(btn => {{
- btn.classList.remove('active');
- }});
- event.target.classList.add('active');
+ div.innerHTML = `
+
+
+
${dataset.name}
+
${dataset.description}
+
+
${dataset.samples.toLocaleString()}
+
+
+ ${dataset.task_type}
+ ${dataset.difficulty}
+
+ `;
- updateNavigationButtons();
- }}
+ div.addEventListener('click', () => selectDataset(dataset.id, div));
+ return div;
+ }
- function nextTab() {{
- const tabs = ['models', 'datasets', 'metrics', 'config'];
- const currentIndex = tabs.indexOf(currentTab);
+ function selectDataset(datasetId, element) {
+ // Remove previous selection
+ document.querySelectorAll('.dataset-card').forEach(card => {
+ card.classList.remove('selected');
+ });
- if (currentIndex < tabs.length - 1) {{
- const nextTab = tabs[currentIndex + 1];
- document.querySelector(`[onclick="switchTab('${{nextTab}}')"]`).click();
- }}
- }}
+ // Add selection to clicked element
+ element.classList.add('selected');
+ selectedDataset = datasetId;
+ }
- function previousTab() {{
- const tabs = ['models', 'datasets', 'metrics', 'config'];
- const currentIndex = tabs.indexOf(currentTab);
+ function filterDatasets(category) {
+ // Update filter buttons
+ document.querySelectorAll('[id^="dataset-filter-"]').forEach(btn => {
+ btn.className = btn.className.replace('bg-purple-600 text-white', 'bg-gray-200 text-gray-700');
+ });
+ document.getElementById(`dataset-filter-${category}`).className =
+ document.getElementById(`dataset-filter-${category}`).className.replace('bg-gray-200 text-gray-700', 'bg-purple-600 text-white');
- if (currentIndex > 0) {{
- const prevTab = tabs[currentIndex - 1];
- document.querySelector(`[onclick="switchTab('${{prevTab}}')"]`).click();
- }}
- }}
+ // Filter dataset cards
+ document.querySelectorAll('.dataset-card').forEach(card => {
+ if (category === 'all' || card.dataset.category === category) {
+ card.style.display = 'block';
+ } else {
+ card.style.display = 'none';
+ }
+ });
+ }
- function updateNavigationButtons() {{
- const prevBtn = document.getElementById('prev-tab');
- const nextBtn = document.getElementById('next-tab');
- const startBtn = document.getElementById('start-evaluation');
-
- prevBtn.style.display = currentTab === 'models' ? 'none' : 'block';
+ function renderMetrics() {
+ const grid = document.getElementById('metricsGrid');
+ grid.innerHTML = '';
- if (currentTab === 'config') {{
- nextBtn.classList.add('hidden');
- startBtn.classList.remove('hidden');
- }} else {{
- nextBtn.classList.remove('hidden');
- startBtn.classList.add('hidden');
- }}
- }}
+ metrics.forEach(metric => {
+ const div = document.createElement('div');
+ div.className = 'flex items-center space-x-2';
+
+ div.innerHTML = `
+
+
+ `;
+
+ const checkbox = div.querySelector('input');
+ checkbox.addEventListener('change', () => {
+ if (checkbox.checked) {
+ selectedMetrics.push(metric.id);
+ } else {
+ selectedMetrics = selectedMetrics.filter(id => id !== metric.id);
+ }
+ });
+
+ grid.appendChild(div);
+ });
+ }
- function startEvaluation() {{
- // Validate selections
- if (selectedModels.length === 0) {{
+ function startEvaluation() {
+ // Validation
+ if (selectedModels.length === 0) {
alert('Please select at least one model');
return;
- }}
+ }
- if (!selectedDataset) {{
+ if (!selectedDataset) {
alert('Please select a dataset');
return;
- }}
+ }
- if (selectedMetrics.length === 0) {{
+ if (selectedMetrics.length === 0) {
alert('Please select at least one metric');
return;
- }}
+ }
- // Prepare evaluation request
- const request = {{
- models: selectedModels.map(m => m.id),
+ // Prepare request
+ const request = {
+ models: selectedModels,
dataset: selectedDataset,
- dataset_subset: selectedDatasetSubset,
- metrics: selectedMetrics.map(m => m.id),
- num_samples: parseInt(document.getElementById('num-samples').value),
+ metrics: selectedMetrics,
+ sample_size: parseInt(document.getElementById('sampleSize').value),
temperature: parseFloat(document.getElementById('temperature').value),
- max_tokens: parseInt(document.getElementById('max-tokens').value),
- top_p: parseFloat(document.getElementById('top-p').value),
- frequency_penalty: parseFloat(document.getElementById('frequency-penalty').value),
- presence_penalty: parseFloat(document.getElementById('presence-penalty').value)
- }};
-
- // Show evaluation section
- document.getElementById('evaluation-section').classList.remove('hidden');
- document.getElementById('results-section').classList.add('hidden');
-
- // Scroll to evaluation section
- document.getElementById('evaluation-section').scrollIntoView({{ behavior: 'smooth' }});
+ max_tokens: parseInt(document.getElementById('maxTokens').value),
+ top_p: 0.9
+ };
// Start evaluation
- fetch('/api/evaluate', {{
+ fetch('/api/evaluate', {
method: 'POST',
- headers: {{
+ headers: {
'Content-Type': 'application/json'
- }},
+ },
body: JSON.stringify(request)
- }})
+ })
.then(response => response.json())
- .then(data => {{
- if (data.status === 'started') {{
+ .then(data => {
+ if (data.status === 'started') {
+ currentEvaluationId = data.evaluation_id;
connectWebSocket(data.evaluation_id);
- }} else {{
+ showProgress();
+ disableStartButton();
+ } else {
alert('Failed to start evaluation: ' + data.message);
- }}
- }})
- .catch(error => {{
+ }
+ })
+ .catch(error => {
console.error('Error:', error);
alert('Failed to start evaluation');
- }});
- }}
+ });
+ }
- function connectWebSocket(evaluationId) {{
+ function connectWebSocket(evaluationId) {
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
- const wsUrl = `${{protocol}}//${{window.location.host}}/ws/${{evaluationId}}`;
+ const wsUrl = `${protocol}//${window.location.host}/ws/${evaluationId}`;
websocket = new WebSocket(wsUrl);
- websocket.onmessage = function(event) {{
+ websocket.onmessage = function(event) {
const data = JSON.parse(event.data);
handleWebSocketMessage(data);
- }};
+ };
- websocket.onclose = function() {{
+ websocket.onclose = function() {
console.log('WebSocket connection closed');
- }};
+ };
- websocket.onerror = function(error) {{
+ websocket.onerror = function(error) {
console.error('WebSocket error:', error);
- }};
- }}
+ };
+ }
- function handleWebSocketMessage(data) {{
- const logsContainer = document.getElementById('evaluation-logs');
- const requestResponseContainer = document.getElementById('request-response-logs');
-
- switch (data.type) {{
- case 'evaluation_start':
- case 'evaluation_progress':
- case 'model_start':
- case 'model_complete':
- // Update progress
- if (data.progress !== undefined) {{
- document.getElementById('progress-bar').style.width = data.progress + '%';
- document.getElementById('progress-percentage').textContent = data.progress + '%';
- }}
-
- // Add log entry
- const logEntry = document.createElement('div');
- logEntry.className = 'log-entry mb-1';
- logEntry.textContent = data.message;
- logsContainer.appendChild(logEntry);
- logsContainer.scrollTop = logsContainer.scrollHeight;
+ function handleWebSocketMessage(data) {
+ switch (data.type) {
+ case 'progress':
+ updateProgress(data.progress, data.current_step);
break;
-
- case 'request_response':
- // Add request/response details
- const reqResEntry = document.createElement('div');
- reqResEntry.className = 'mb-4 p-3 bg-white rounded border';
- reqResEntry.innerHTML = `
-
${{data.model}} - Sample ${{data.sample_index}}
-
-
-
Request:
-
${{JSON.stringify(data.request, null, 2)}}
-
-
-
Response:
-
${{JSON.stringify(data.response, null, 2)}}
-
-
- `;
- requestResponseContainer.appendChild(reqResEntry);
- requestResponseContainer.scrollTop = requestResponseContainer.scrollHeight;
+ case 'log':
+ addLogEntry(data);
break;
-
- case 'evaluation_complete':
- // Update progress to 100%
- document.getElementById('progress-bar').style.width = '100%';
- document.getElementById('progress-percentage').textContent = '100%';
- document.getElementById('evaluation-status').textContent = 'Completed';
- document.getElementById('evaluation-status').className = 'px-4 py-2 bg-green-100 text-green-800 rounded-lg font-medium';
-
- // Show results
- evaluationResults = data;
- displayResults(data.results, data.summary);
+ case 'complete':
+ showResults(data.results);
+ enableStartButton();
break;
-
- case 'evaluation_error':
- document.getElementById('evaluation-status').textContent = 'Failed';
- document.getElementById('evaluation-status').className = 'px-4 py-2 bg-red-100 text-red-800 rounded-lg font-medium';
-
- const errorEntry = document.createElement('div');
- errorEntry.className = 'log-entry mb-1 text-red-400';
- errorEntry.textContent = data.message;
- logsContainer.appendChild(errorEntry);
+ case 'error':
+ addLogEntry({
+ level: 'ERROR',
+ message: data.message,
+ timestamp: new Date().toISOString()
+ });
+ enableStartButton();
break;
- }}
- }}
+ }
+ }
- function displayResults(results, summary) {{
- document.getElementById('results-section').classList.remove('hidden');
- document.getElementById('results-section').scrollIntoView({{ behavior: 'smooth' }});
-
- const container = document.getElementById('results-content');
-
- // Summary section
- const summaryHtml = `
-
-
-
${{summary.models_evaluated}}
-
Models Evaluated
-
-
-
${{summary.total_samples}}
-
Total Samples
-
-
-
$${{summary.total_cost_usd}}
-
Total Cost
-
-
-
${{summary.avg_latency_ms}}ms
-
Avg Latency
-
-
- `;
+ function showProgress() {
+ document.getElementById('idleMessage').classList.add('hidden');
+ document.getElementById('progressSection').classList.remove('hidden');
+ clearLogs();
+ }
+
+ function updateProgress(progress, currentStep) {
+ document.getElementById('progressBar').style.width = progress + '%';
+ document.getElementById('progressPercent').textContent = Math.round(progress) + '%';
+ document.getElementById('currentStep').textContent = currentStep;
+ }
+
+ function addLogEntry(logData) {
+ const container = document.getElementById('logsContainer');
+ const entry = document.createElement('div');
+ entry.className = 'log-entry mb-1';
- // Results table
- const modelsArray = Object.entries(results);
- const metricsArray = Object.keys(modelsArray[0][1].metrics);
+ const timestamp = new Date(logData.timestamp).toLocaleTimeString();
+ const levelColor = {
+ 'INFO': 'text-blue-400',
+ 'SUCCESS': 'text-green-400',
+ 'ERROR': 'text-red-400',
+ 'DEBUG': 'text-gray-400'
+ }[logData.level] || 'text-green-400';
- const tableHtml = `
-
-
-
-
- Model |
- ${{metricsArray.map(metric => `${{METRICS[metric].name}} | `).join('')}}
- Cost |
- Latency |
-
-
-
- ${{modelsArray.map(([modelId, result]) => `
-
- ${{result.model_info.name}} |
- ${{metricsArray.map(metric => `
-
- ${{result.metrics[metric].score}}
- |
- `).join('')}}
- $${{result.total_cost.toFixed(4)}} |
- ${{result.avg_latency_ms}}ms |
-
- `).join('')}}
-
-
-
+ entry.innerHTML = `
+
[${timestamp}]
+
[${logData.level}]
+
${logData.message}
`;
- container.innerHTML = summaryHtml + tableHtml;
- }}
+ container.appendChild(entry);
+ container.scrollTop = container.scrollHeight;
+ }
- function exportResults(format) {{
- if (!evaluationResults) {{
- alert('No results to export');
- return;
- }}
+ function clearLogs() {
+ document.getElementById('logsContainer').innerHTML = '';
+ }
+
+ function showResults(results) {
+ const section = document.getElementById('resultsSection');
+ const content = document.getElementById('resultsContent');
- let content, filename, mimeType;
+ let html = '
';
- if (format === 'json') {{
- content = JSON.stringify(evaluationResults, null, 2);
- filename = 'novaeval_results.json';
- mimeType = 'application/json';
- }} else if (format === 'csv') {{
- // Convert to CSV
- const results = evaluationResults.results;
- const headers = ['Model', 'Provider'];
- const metricsArray = Object.keys(Object.values(results)[0].metrics);
- headers.push(...metricsArray, 'Total Cost', 'Avg Latency (ms)');
+ Object.keys(results).forEach(modelId => {
+ const modelName = modelId.split('/').pop();
+ const modelResults = results[modelId];
- const rows = [headers];
- Object.entries(results).forEach(([modelId, result]) => {{
- const row = [
- result.model_info.name,
- result.model_info.provider,
- ...metricsArray.map(metric => result.metrics[metric].score),
- result.total_cost.toFixed(4),
- result.avg_latency_ms
- ];
- rows.push(row);
- }});
+ html += `
+
+
${modelName}
+
+ `;
+
+ Object.keys(modelResults).forEach(metric => {
+ const value = modelResults[metric];
+ html += `
+
+
${metric.toUpperCase()}
+
${value}
+
+ `;
+ });
- content = rows.map(row => row.join(',')).join('\\n');
- filename = 'novaeval_results.csv';
- mimeType = 'text/csv';
- }}
+ html += '
';
+ });
- const blob = new Blob([content], {{ type: mimeType }});
- const url = URL.createObjectURL(blob);
- const a = document.createElement('a');
- a.href = url;
- a.download = filename;
- document.body.appendChild(a);
- a.click();
- document.body.removeChild(a);
- URL.revokeObjectURL(url);
- }}
+ html += '
';
+ content.innerHTML = html;
+ section.classList.remove('hidden');
+ }
+
+ function disableStartButton() {
+ const btn = document.getElementById('startBtn');
+ btn.disabled = true;
+ btn.innerHTML = '
Running Evaluation...';
+ lucide.createIcons();
+ }
+
+ function enableStartButton() {
+ const btn = document.getElementById('startBtn');
+ btn.disabled = false;
+ btn.innerHTML = '
Start Evaluation';
+ lucide.createIcons();
+ }
- """)
+ """
@app.get("/api/models")
async def get_models():
- """Get all supported models"""
- return SUPPORTED_MODELS
+ """Get available models"""
+ return {"models": HF_MODELS}
@app.get("/api/datasets")
async def get_datasets():
- """Get all supported datasets"""
- return SUPPORTED_DATASETS
+ """Get available datasets"""
+ return {"datasets": EVALUATION_DATASETS}
@app.get("/api/metrics")
async def get_metrics():
- """Get all supported metrics"""
- return SUPPORTED_METRICS
+ """Get available metrics"""
+ return {"metrics": EVALUATION_METRICS}
@app.post("/api/evaluate")
async def start_evaluation(request: EvaluationRequest):
"""Start a new evaluation"""
evaluation_id = str(uuid.uuid4())
- # Store evaluation info
- active_evaluations[evaluation_id] = {
- "id": evaluation_id,
- "request": request.dict(),
- "status": "starting",
- "created_at": datetime.now().isoformat()
- }
-
# Start evaluation in background
- asyncio.create_task(simulate_novaeval_evaluation(evaluation_id, request))
-
- logger.info(f"Started evaluation {evaluation_id} with {len(request.models)} models")
+ asyncio.create_task(simulate_evaluation(evaluation_id, request))
return EvaluationResponse(
evaluation_id=evaluation_id,
@@ -1613,9 +1171,9 @@ async def start_evaluation(request: EvaluationRequest):
message="Evaluation started successfully"
)
-@app.get("/api/evaluations/{evaluation_id}")
-async def get_evaluation(evaluation_id: str):
- """Get evaluation status and results"""
+@app.get("/api/evaluation/{evaluation_id}")
+async def get_evaluation_status(evaluation_id: str):
+ """Get evaluation status"""
if evaluation_id not in active_evaluations:
raise HTTPException(status_code=404, detail="Evaluation not found")
@@ -1623,29 +1181,23 @@ async def get_evaluation(evaluation_id: str):
@app.websocket("/ws/{evaluation_id}")
async def websocket_endpoint(websocket: WebSocket, evaluation_id: str):
- """WebSocket endpoint for real-time evaluation updates"""
+ """WebSocket endpoint for real-time updates"""
await websocket.accept()
- websocket_connections.append(websocket)
+ websocket_connections[evaluation_id] = websocket
try:
while True:
# Keep connection alive
await asyncio.sleep(1)
except WebSocketDisconnect:
- websocket_connections.remove(websocket)
+ if evaluation_id in websocket_connections:
+ del websocket_connections[evaluation_id]
@app.get("/api/health")
async def health_check():
"""Health check endpoint"""
- return {
- "status": "healthy",
- "timestamp": datetime.now().isoformat(),
- "version": "1.0.0",
- "active_evaluations": len(active_evaluations)
- }
+ return {"status": "healthy", "timestamp": datetime.now().isoformat()}
if __name__ == "__main__":
- port = int(os.getenv("PORT", 7860))
- logger.info(f"Starting Comprehensive NovaEval Space on port {port}")
- uvicorn.run(app, host="0.0.0.0", port=port, reload=False)
+ uvicorn.run(app, host="0.0.0.0", port=7860)
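
For reviewers who want to exercise the new surface, here is a minimal client sketch (not part of the patch) that drives the endpoints introduced in this diff: POST /api/evaluate with the reworked EvaluationRequest fields, then polling GET /api/evaluation/{id}. The model, dataset, and metric IDs are examples taken from HF_MODELS, EVALUATION_DATASETS, and EVALUATION_METRICS above; the assumption that the status route returns the raw active_evaluations entry (with "status" and "results" keys) is inferred from the surrounding code rather than shown in the hunk.

# Usage sketch: assumes the app above is running locally on port 7860.
# Requires httpx (already a dependency of the app): pip install httpx
import time
import httpx

payload = {
    "models": ["google/flan-t5-large", "Qwen/Qwen2.5-7B"],   # IDs from HF_MODELS
    "dataset": "Rowan/hellaswag",                            # ID from EVALUATION_DATASETS
    "metrics": ["accuracy", "f1_score"],                     # IDs from EVALUATION_METRICS
    "sample_size": 50,
    "temperature": 0.7,
    "max_tokens": 512,
    "top_p": 0.9,
}

with httpx.Client(base_url="http://localhost:7860", timeout=30.0) as client:
    started = client.post("/api/evaluate", json=payload)
    started.raise_for_status()
    evaluation_id = started.json()["evaluation_id"]

    while True:
        r = client.get(f"/api/evaluation/{evaluation_id}")
        if r.status_code == 404:
            # The background task creates the active_evaluations entry,
            # so the first poll can race it; retry shortly.
            time.sleep(1)
            continue
        state = r.json()
        if state.get("status") in ("completed", "failed"):
            print(state.get("results") or state.get("error"))
            break
        time.sleep(2)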