import pandas as pd
import json
from datetime import datetime


def process_csv_to_json():
    # Read the CSV file
    df = pd.read_csv('src/record.csv')

    # Clean the data: drop fully empty rows, then normalize column names
    # (columns whose names already match the target names are left untouched)
    df = df.dropna(how='all')
    df = df.rename(columns={
        'dataset': 'Dataset',
        'llm': 'LLM',
        'score\n(EM)': 'Score',
        'pass rate': 'Pass rate',
        'framework': 'Framework',
        'Nums': 'Samples'
    })

    # Helper: parse numeric strings that may contain thousands separators,
    # e.g. "1,319" -> 1319; missing values and '-' placeholders become 0
    def parse_number(value):
        if pd.isna(value) or value == '-':
            return 0
        # Remove commas, convert to float first (handles "1,319.0"), then to int
        return int(float(str(value).replace(',', '')))

    # Result skeleton: a timestamp plus a nested algorithm -> LLM -> dataset map
    result = {
        "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "results": {}
    }

    # All unique LLMs in the sheet
    llms = df['LLM'].dropna().unique()

    # Iterate through each algorithm
    for algorithm in df['Algorithm'].dropna().unique():
        if not isinstance(algorithm, str):
            continue
        result['results'][algorithm] = {}

        # Process each LLM evaluated with this algorithm
        for llm in llms:
            llm_data = df[(df['Algorithm'] == algorithm) & (df['LLM'] == llm)]
            if llm_data.empty:
                continue

            # Per-LLM entry, starting with its META block
            result['results'][algorithm][llm] = {
                'META': {
                    'Algorithm': str(algorithm),
                    'LLM': str(llm),
                    'Eval Date': str(llm_data['Eval Date'].iloc[0])
                }
            }

            # Process each dataset
            for dataset in df['Dataset'].dropna().unique():
                if not isinstance(dataset, str):
                    continue
                dataset_data = llm_data[llm_data['Dataset'] == dataset]
                if dataset_data.empty:
                    continue
                data_row = dataset_data.iloc[0]
                result['results'][algorithm][llm][dataset] = {
                    # Keep two decimal places; '-' or missing scores become 0
                    'Score': round(float(data_row['Score']), 2)
                             if pd.notnull(data_row['Score']) and data_row['Score'] != '-' else 0,
                    # Convert the percentage to a fraction, four decimal places
                    'Pass rate': round(float(data_row['Pass rate']) / 100, 4)
                                 if pd.notnull(data_row['Pass rate']) and data_row['Pass rate'] != '-' else 0.0,
                    'Cost($)': float(data_row['Cost($)'])
                               if pd.notnull(data_row['Cost($)']) and data_row['Cost($)'] != '-' else 0.0,
                    'Framework': str(data_row['Framework'])
                                 if 'Framework' in data_row and pd.notnull(data_row['Framework']) else '',
                    'X-shot': str(data_row['X-shot']) if pd.notnull(data_row['X-shot']) else '',
                    'Samples': parse_number(data_row['Samples']),
                    'All tokens': parse_number(data_row['All tokens']),
                    'Total input tokens': parse_number(data_row['Total input tokens']),
                    'Average input tokens': parse_number(data_row['Average input tokens']),
                    'Total output tokens': parse_number(data_row['Total output tokens']),
                    'Average output tokens': parse_number(data_row['Average output tokens'])
                }

    # Sanity check: warn about any entry that is missing expected fields
    required_fields = ['Score', 'Pass rate', 'Cost($)', 'Framework', 'X-shot', 'Samples',
                       'All tokens', 'Total input tokens', 'Average input tokens',
                       'Total output tokens', 'Average output tokens']
    for key, value in result['results'].items():
        for llm, datasets in value.items():
            # Check META information
            meta = datasets.get('META', {})
            if 'LLM' not in meta or 'Eval Date' not in meta:
                print(f"Missing META fields in algorithm '{key}' for LLM '{llm}'")
            for dataset, data in datasets.items():
                if dataset == 'META':
                    continue
                missing_fields = [field for field in required_fields if field not in data]
                if missing_fields:
                    print(f"Missing fields {missing_fields} in dataset '{dataset}' "
                          f"for LLM '{llm}' in algorithm '{key}'")

    # Save as JSON
    with open('src/detail_math_score.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
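

# For reference, the shape of src/detail_math_score.json as produced above
# (reconstructed from the code; placeholder values are illustrative only):
#
# {
#   "time": "YYYY-MM-DD HH:MM:SS",
#   "results": {
#     "<Algorithm>": {
#       "<LLM>": {
#         "META": {"Algorithm": "...", "LLM": "...", "Eval Date": "..."},
#         "<Dataset>": {
#           "Score": 0.0, "Pass rate": 0.0, "Cost($)": 0.0,
#           "Framework": "", "X-shot": "", "Samples": 0,
#           "All tokens": 0, "Total input tokens": 0, "Average input tokens": 0,
#           "Total output tokens": 0, "Average output tokens": 0
#         }
#       }
#     }
#   }
# }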


def process_csv_to_overall_json():
    # Read the CSV file
    df = pd.read_csv('src/record.csv')

    # Clean the data: drop fully empty rows, then normalize column names
    # (columns whose names already match the target names are left untouched)
    df = df.dropna(how='all')
    df = df.rename(columns={
        'dataset': 'Dataset',
        'llm': 'LLM',
        'score\n(EM)': 'Score'
    })

    # Result skeleton: a timestamp plus a flat algorithm-key -> dataset map
    result = {
        "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "results": {}
    }

    # All unique LLMs in the sheet
    llms = df['LLM'].dropna().unique()

    for llm in llms:
        # Process base algorithms
        for algorithm in df['Algorithm'].dropna().unique():
            if not isinstance(algorithm, str):
                continue
            # gpt-3.5-turbo keeps the bare algorithm name; every other model
            # (e.g. the llama variants) gets an "-{llm}" suffix so keys stay unique
            algo_key = algorithm if llm == 'gpt-3.5-turbo' else f"{algorithm}-{llm}"

            # Skip algorithm-LLM combinations with no rows
            algo_data = df[(df['Algorithm'] == algorithm) & (df['LLM'] == llm)]
            if algo_data.empty:
                print(f"No data found for algorithm '{algorithm}' and LLM '{llm}'")
                continue

            result['results'][algo_key] = {
                "META": {
                    "Algorithm": algorithm,
                    "LLM": llm,
                    "Eval Date": str(algo_data['Eval Date'].iloc[0])
                }
            }

            # Process each dataset
            for dataset in ['gsm8k', 'AQuA', 'MATH-500']:
                dataset_data = df[(df['Algorithm'] == algorithm) &
                                  (df['Dataset'] == dataset) &
                                  (df['LLM'] == llm)]
                if not dataset_data.empty:
                    score = dataset_data['Score'].iloc[0]
                    cost = dataset_data['Cost($)'].iloc[0]
                    result['results'][algo_key][dataset] = {
                        "Score": float(score) if pd.notnull(score) and score != '-' else 0.0,
                        "Cost($)": float(cost) if pd.notnull(cost) and cost != '-' else 0.0
                    }
                else:
                    # No rows for this dataset: keep the key with default values
                    result['results'][algo_key][dataset] = {
                        "Score": 0.0,
                        "Cost($)": 0.0
                    }

    # Save as JSON
    with open('src/overall_math_score.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    # Generate JSON files in two formats
    process_csv_to_json()
    process_csv_to_overall_json()
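

# Optional smoke test -- a minimal sketch, not part of the original pipeline.
# It assumes both generators above have already run so the two JSON files
# exist; it reloads each file (using the json import at the top of this file)
# and prints how many result entries it holds. Call smoke_check() manually.
def smoke_check():
    for path in ('src/detail_math_score.json', 'src/overall_math_score.json'):
        with open(path, encoding='utf-8') as f:
            data = json.load(f)
        print(f"{path}: {len(data.get('results', {}))} result entries, "
              f"generated at {data.get('time')}")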