import logging
import pandas as pd

import src.envs as envs

from src.backend.model_operations import SummaryGenerator, EvaluationModel
import src.backend.util as util

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')


class Evaluator:
    """A class to evaluate summaries generated by a language model.

    Attributes:
        model (str): The name or path of the model.
        revision (str): The model revision.
        precision (str): The precision setting of the model.
        batch_size (int): Batch size for processing.
        device (str): The device to run the model on.
        no_cache (bool): Flag to disable caching.
        limit (int): Limit on the number of items to process.
        write_out (bool): Whether to write results to a file.
        output_base_path (str): Base path for output files.
        summary_generator (SummaryGenerator): Instance for generating summaries.
        eval_model (EvaluationModel): Instance for evaluating summaries.
    """
    def __init__(self, model, revision, precision, batch_size,
                 device, no_cache, limit, write_out=True,
                 output_base_path='logs'):
        """Initializes the Evaluator with the given model and settings.

        Args:
            model (str): The name or path of the model.
            revision (str): The model revision.
            precision (str): The precision setting of the model.
            batch_size (int): Batch size for processing.
            device (str): The device to run the model on.
            no_cache (bool): Flag to disable caching.
            limit (int): Limit on the number of items to process.
            write_out (bool): Whether to write results to a file.
            output_base_path (str): Base path for output files.
        """
        self.model = model
        self.revision = revision
        self.precision = precision
        self.batch_size = batch_size
        self.device = device
        self.no_cache = no_cache
        self.limit = limit
        self.write_out = write_out
        self.output_base_path = output_base_path
        try:
            self.summary_generator = SummaryGenerator(model, revision)
            self.eval_model = EvaluationModel(envs.HEM_PATH)
        except Exception as e:
            logging.error(f"Error initializing Evaluator: {e}")
            raise

    def evaluate(self):
        """
        Performs the evaluation process by generating summaries 
        and computing metrics.

        Returns:
            dict: A dictionary containing evaluation results.
        """
        try:
            df = pd.read_csv(envs.DATASET_PATH)
            generated_summaries_df = self.summary_generator.generate_summaries(df)

            avg_summary_len = self.summary_generator.avg_length
            answer_rate = self.summary_generator.answer_rate

            hallucination_scores = self.eval_model.evaluate_hallucination(
                generated_summaries_df)
            factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
            hallucination_rate = self.eval_model.hallucination_rate

            results = util.format_results(model_name=self.model,
                                          revision=self.revision,
                                          precision=self.precision,
                                          factual_consistency_rate=factual_consistency_rate,
                                          hallucination_rate=hallucination_rate,
                                          answer_rate=answer_rate,
                                          avg_summary_len=avg_summary_len)
            return results
        except FileNotFoundError:
            logging.error(f"File not found: {envs.DATASET_PATH}")
            raise
        except Exception as e:
            logging.error(f"Error during evaluation: {e}")
            raise
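

# Usage sketch (illustrative addition, not part of the original module): one way
# the Evaluator might be driven end-to-end. The model identifier and settings
# below are placeholders, and the run assumes envs.DATASET_PATH and envs.HEM_PATH
# resolve to valid resources in this environment.
if __name__ == "__main__":
    evaluator = Evaluator(
        model="org/example-model",   # hypothetical model name or path
        revision="main",
        precision="float16",
        batch_size=1,
        device="cuda",
        no_cache=True,
        limit=None,                  # assumed: None means no row limit
        write_out=True,
        output_base_path="logs",
    )
    results = evaluator.evaluate()
    logging.info("Evaluation results: %s", results)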