# leaderboard/src/backend/evaluate_model.py
import logging
import os

import pandas as pd

import src.envs as envs  # provides DATASET_PATH and HEM_PATH used below
from src.backend.model_operations import SummaryGenerator, EvaluationModel
import src.backend.util as util

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

class Evaluator:
    """A class to evaluate summaries generated by a language model.

    Attributes:
        model (str): The name or path of the model.
        revision (str): The model revision.
        precision (str): The precision setting of the model.
        batch_size (int): Batch size for processing.
        device (str): The device to run the model on.
        no_cache (bool): Flag to disable caching.
        limit (int): Limit on the number of items to process.
        write_out (bool): Whether to write results to a file.
        output_base_path (str): Base path for output files.
        summary_generator (SummaryGenerator): Instance for generating summaries.
        eval_model (EvaluationModel): Instance for evaluating summaries.
    """
    def __init__(self, model, revision, precision, batch_size,
                 device, no_cache, limit, write_out=True,
                 output_base_path='logs'):
        """Initializes the Evaluator with the given model and settings.

        Args:
            model (str): The name or path of the model.
            revision (str): The model revision.
            precision (str): The precision setting of the model.
            batch_size (int): Batch size for processing.
            device (str): The device to run the model on.
            no_cache (bool): Flag to disable caching.
            limit (int): Limit on the number of items to process.
            write_out (bool): Whether to write results to a file.
            output_base_path (str): Base path for output files.
        """
        self.model = model
        self.revision = revision
        self.precision = precision
        self.batch_size = batch_size
        self.device = device
        self.no_cache = no_cache
        self.limit = limit
        self.write_out = write_out
        self.output_base_path = output_base_path

        try:
            self.summary_generator = SummaryGenerator(model, revision)
            self.eval_model = EvaluationModel(envs.HEM_PATH)
        except Exception as e:
            logging.error(f"Error initializing Evaluator: {e}")
            raise

    def evaluate(self):
        """
        Performs the evaluation process by generating summaries
        and computing metrics.

        Returns:
            dict: A dictionary containing evaluation results.
        """
        try:
            df = pd.read_csv(envs.DATASET_PATH)

            # Generate a summary for each source document and cache the raw generations.
            self.generated_summaries_df = self.summary_generator.generate_summaries(
                df, save_path=f"generation_results/{self.model}.csv")
            avg_summary_len = self.summary_generator.avg_length
            answer_rate = self.summary_generator.answer_rate

            # Score each generated summary with the hallucination evaluation model.
            self.hallucination_scores, self.eval_results = self.eval_model.evaluate_hallucination(
                self.generated_summaries_df)
            factual_consistency_rate = self.eval_model.compute_factual_consistency_rate()
            hallucination_rate = self.eval_model.hallucination_rate

            results = util.format_results(model_name=self.model, revision=self.revision,
                                          precision=self.precision,
                                          factual_consistency_rate=factual_consistency_rate,
                                          hallucination_rate=hallucination_rate,
                                          answer_rate=answer_rate,
                                          avg_summary_len=avg_summary_len)
            return results
        except FileNotFoundError:
            logging.error(f"File not found: {envs.DATASET_PATH}")
            raise
        except Exception as e:
            logging.error(f"Error during evaluation: {e}")
            raise

    def write_results(self):
        print('Updating result files')
        leaderboard_path = os.getcwd()  # the path of the leaderboard folder
        print(leaderboard_path)
        working_path = os.path.join(leaderboard_path, 'Hallucination Leaderboard Results')
        if not os.path.exists(working_path):
            logging.error("The results must first be downloaded from Google Drive into the leaderboard folder")
            raise FileNotFoundError(f"Results folder not found: {working_path}")

        source_summary_df = self.generated_summaries_df[["source", "summary"]]

        # Update leaderboard_summaries.csv.
        # TODO: remove any previous rows for the current model before appending,
        # so that re-running an evaluation does not duplicate its records.
        leaderboard_summaries_df = source_summary_df.copy()
        leaderboard_summaries_df.insert(2, "model", [self.model] * leaderboard_summaries_df.shape[0])
        leaderboard_summaries_df.to_csv(os.path.join(working_path, 'leaderboard_summaries.csv'),
                                        mode='a', index=False, header=False)
        print('leaderboard_summaries.csv has been updated')

        # Update leaderboard_summaries_with_scores.csv.
        # NOTE: reading this file back with pandas currently raises a parsing error
        # (known bug), so previous rows for the current model are not removed here either.
        leaderboard_summaries_with_scores_df = pd.DataFrame.from_dict(self.eval_results)
        leaderboard_summaries_with_scores_df.insert(3, "model", [self.model] * leaderboard_summaries_with_scores_df.shape[0])
        leaderboard_summaries_with_scores_df.to_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'),
                                                    mode='a', index=False, header=False)
        print('leaderboard_summaries_with_scores.csv has been updated')

        # TODO: cross-check that every model listed in leaderboard_summaries.csv also has
        # records in leaderboard_summaries_with_scores.csv, and vice versa.