backend_demo

Paused

App Files Files Community

Shaltiel committed on Mar 28

Commit

fe84b5e

•

1 Parent(s): 7135a84

Added translation task

Browse files

Files changed (3) hide show

custom_tasks.py +2 -1
src/about.py +1 -1
src/custom_tasks/translation_task.py +55 -0

custom_tasks.py CHANGED Viewed

@@ -9,11 +9,12 @@ Author:
 from src.custom_tasks.heq_task import *
 from src.custom_tasks.sentiment_task import *
 from src.custom_tasks.winograd_task import *
 ## MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task]]
 if __name__ == "__main__":
     print(t["name"] for t in TASKS_TABLE)

 from src.custom_tasks.heq_task import *
 from src.custom_tasks.sentiment_task import *
 from src.custom_tasks.winograd_task import *
+from src.custom_tasks.translation_task import *
 ## MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
+TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task, translation_task]]
 if __name__ == "__main__":
     print(t["name"] for t in TASKS_TABLE)

src/about.py CHANGED Viewed

@@ -21,5 +21,5 @@ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 # ---------------------------------------------------
 # TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']
 TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)# + ',leaderboard|arc:challenge|0|0'

 # ---------------------------------------------------
 # TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
 # Names of the custom tasks; each one is expanded to "custom|<name>|0|0" below.
+tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc', 'he-en-trans-bleu']
 # Comma-separated task specification string built from `tasks`.
 TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)# + ',leaderboard|arc:challenge|0|0'

src/custom_tasks/translation_task.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import re
+import string
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.metrics import Metrics, MetricCategory
+from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
+from aenum import extend_enum
+import numpy as np
+from lighteval.tasks.requests import Doc
+from Levenshtein import distance
+import collections
+from lighteval.utils import as_list
+import sacrebleu
+def trans_prompt_fn(line, task_name: str = None):
+    """Defines how to go from a dataset line to a doc object.
+    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
+    about what this function should do in the README.
+    """
+    return Doc(
+        task_name=task_name,
+        query=line["prompt"].strip(),
+        choices=[line["response"][0].strip()],
+        gold_index=[0],
+        instruction="",
+    )
+def translation_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
+    if len(predictions)  > 1:
+        raise ValueError("Predictions should have one item")
+    return float(sacrebleu.sentence_bleu(hypothesis=predictions[0], references=golds).score / 100)
+sentence_bleu = CorpusLevelMetric(
+    metric="sentence_bleu",
+    sample_level_fn=translation_eval_fn,
+    category=MetricCategory.GENERATIVE,
+    use_case=MetricUseCase.TRANSLATION,
+    corpus_level_fn=np.mean,
+    higher_is_better=True,
+)
+extend_enum(Metrics, 'sentence_bleu', sentence_bleu)
+# This is how you create a simple tasks (like hellaswag) which has one single subset
+# attached to it, and one evaluation possible.
+translation_task = LightevalTaskConfig(
+    name="he-en-trans-bleu",
+    prompt_function="trans_prompt_fn",  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    suite=["custom"],
+    hf_repo="dicta-hebrew-llm-leaderboard/tests",
+    hf_subset="default",
+    hf_avail_splits=["en2he", "he2en"],
+    evaluation_splits=["en2he", "he2en"],
+    metric=['sentence_bleu', 'bleu_1', 'bleu_4'],
+    stop_sequence=['\n'],
+    generation_size=220
+)