mqliu committed on
Commit
ce1a7c1
1 Parent(s): b1853c2

Upload src/interleavedeval_gpt4o.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/interleavedeval_gpt4o.py +239 -0
src/interleavedeval_gpt4o.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import requests
3
+ import json
4
+ import pdb
5
+ import time
6
+ from tqdm import tqdm
7
+ import copy
8
+ import random
9
+
10
# ----------------------------------
# Configuration: OpenAI credentials, model selection, CLI arguments, and
# the input/output file paths for the evaluation run.

# OpenAI API Key (replace the placeholder with a real key before running).
# Fixed: the placeholder was missing its closing '>'.
API_KEY = "<your_openai_key>"

# Special token marking an image position inside interleaved text.
IMAGE_TOKEN = "<image>"

MODEL_VERSION = "gpt-4o"  # choose different openai model version

import argparse

parser = argparse.ArgumentParser(description="Input file")
parser.add_argument('--file_name', type=str, help='evaluation file name')
args = parser.parse_args()

# path to model's prediction file
# (replace <prediction_dir> with the real directory; fixed typo "predition")
input_path = f"<prediction_dir>/{args.file_name}.json"
# path to evaluation results with GPT-4o
output_path = f"<eval_dir>/{args.file_name}_interleaved_eval.json"
29
+
30
+
31
# Load the model prediction records that will be evaluated.
with open(input_path, "r") as fp:
    data = json.load(fp)
33
+
34
+
35
def encode_image(image_path):
    """Read the file at *image_path* and return its bytes as a base64 string."""
    with open(image_path, "rb") as fh:
        raw = fh.read()
    return base64.b64encode(raw).decode('utf-8')
39
+
40
+
41
# Grading rubrics, one per evaluation aspect, sent verbatim to GPT-4o as the
# system prompt. Each rubric defines a 1-5 scale:
#   - text_quality: grammar, spelling, readability, no input duplication
#   - perceptual_quality: visual fidelity of the generated images
#   - image_coherence: style/subject consistency across images
#   - text_image_coherence: pairwise alignment of output text and images
#   - helpfulness: instruction-following, completeness, logical flow
eval_instructions = {
    "text_quality": "In this task, you need to evaluate the text quality of the given output . Text quality is how clear, coherent, and error-free the OUTPUT text is. Consider grammar, spelling, readability, coherence with context (i.e., task instructions and inputs), and whether it’s duplicate as input. This evaluation considers grammar, spelling, punctuation, readability, the logical flow of ideas, and you should substantially penalize duplication, i.e., the output text is severely duplicate with the instruction or input text. Provide your rating on a scale from 1 to 5 based on the criteria below: A rating of 1 indicates the text quality is extremely poor, characterized by frequent grammatical errors, numerous spelling mistakes, poor punctuation, very difficult readability, a lack of logical flow and coherence, and largely contains duplicated content from the input. A rating of 2 indicates somewhat poor text quality, with several grammatical errors, noticeable spelling mistakes, inconsistent punctuation, hard-to-read content, and inconsistent flow and coherence. A rating of 3 indicates somehow acceptable text quality, with occasional grammatical errors, few spelling mistakes, mostly correct punctuation, generally readable content that may require some effort, basic logical flow and coherence, and contains some duplicated content from the input. A rating of 4 indicates somewhat good text quality, with rare grammatical errors, very few spelling mistakes, correct punctuation, easy-to-read and understand content, and mostly logical flow and coherent structure. A rating of 5 indicates extremely good text quality, with no grammatical errors, no spelling mistakes, perfect punctuation, very easy-to-read and understand content, excellent logical flow and coherence, and do not contain duplicated content from the input.\n",

    "perceptual_quality": "In this task, you need to evaluate the image quality of the images. Image quality measures the extent to which a generated image appears visually convincing, natural, and free from distortions or artifacts. This evaluation considers how accurately the image mimics reality without unnatural disruptions in its structure, colors, or composition. Provide your rating on a scale from 1 to 5 based on the criteria below: A rating of 1 indicates the image quality is extremely poor, characterized by significant visual issues such as heavy distortions, numerous artifacts, unnatural colors, and structural disruptions, making the image appear highly unrealistic and unconvincing. A rating of 2 indicates somewhat poor image quality, with noticeable visual issues including distortions, artifacts, and color inaccuracies, as well as some structural disruptions, leading to an image that appears somewhat unrealistic and unconvincing. A rating of 3 indicates somehow acceptable image quality, with occasional visual issues such as minor distortions, artifacts, and slight color inaccuracies, but generally maintaining a structure that mimics reality to an acceptable extent. A rating of 4 indicates somewhat good image quality, with rare visual issues, minimal distortions, few artifacts, and accurate colors, maintaining a mostly natural and convincing appearance with very few structural disruptions. A rating of 5 indicates extremely good image quality, with no visual issues, no distortions or artifacts, perfect color accuracy, and a completely natural and convincing appearance without any structural disruptions, accurately mimicking reality. \n",

    "image_coherence": "In this task, you need to evaluate the image coherence of the input interleaved content that consists of both text and multiple images. Image coherence includes but not limit to the following perspectives: (1) Style coherence: Whether the selection of textures, color palette, lighting setups, brush techniques, or rendering styles can give the image a coherent look and feel. (2) Subject coherence: the consistency in subject’s representation includes maintaining recognizable physical attributes, clothing, and behavioral traits across various scenes. You should penalize for image duplication, where the output images are too similar/replicate with the input images, or within the output images themselves. Provide your rating in the scale from 1 to 5 based on the criteria below: A rating of 1 indicates the coherence is extremely poor, characterized by a complete lack of style coherence with mismatched textures, color palettes, lighting setups, and rendering styles, along with subject incoherence where physical attributes, clothing, and behavioral traits are inconsistent and unrecognizable across scenes. Output images are significantly duplicated with each other or with input images. A rating of 2 indicates somewhat poor coherence, with noticeable inconsistencies in style coherence, including textures, color palettes, lighting setups, and rendering styles, and subject coherence, where physical attributes, clothing, and behavioral traits are often inconsistent and difficult to recognize across scenes. A rating of 3 indicates somehow acceptable coherence, where there are occasional inconsistencies in style coherence, including textures, color palettes, lighting setups, and rendering styles, and subject coherence, where physical attributes, clothing, and behavioral traits are generally consistent but may have minor variations. Images are duplicated to some extent but acceptable. A rating of 4 indicates somewhat good coherence, with rare inconsistencies in style coherence, including textures, color palettes, lighting setups, and rendering styles, and subject coherence, where physical attributes, clothing, and behavioral traits are mostly consistent and recognizable across scenes. A rating of 5 indicates extremely good coherence, with perfect style coherence, including textures, color palettes, lighting setups, and rendering styles, and subject coherence, where physical attributes, clothing, and behavioral traits are consistently maintained and easily recognizable across all scenes. No image duplication. \n",

    "text_image_coherence": "In this task, you need to evaluate the text-to-image coherence of the input interleaved content that consists of both text and multiple images. Text-to-image coherence refers to the alignment and harmonious integration between textual and visual elements. This coherence goes beyond mere semantic similarity, aiming to enhance the quality, coherence, and communicative effectiveness of the content. It encompasses how text and images align with each other, conveying a unified and cohesive narrative. You should perform the evaluation in a pairwise manner, i.e., consider each text-image pair one by one and accumulate the scores. Provide your rating on a scale from 1 to 5 based on the criteria below: A rating of 1 indicates the text-to-image coherence is extremely poor, with text and images showing significant misalignment and no meaningful connection, resulting in a disjointed and confusing narrative. A rating of 2 indicates somewhat poor coherence, with noticeable misalignment between text and images, where the connections are weak and the narrative is only partially cohesive. A rating of 3 indicates somehow acceptable coherence, with occasional misalignment between text and images, where the connections are generally clear but may have minor inconsistencies, leading to a mostly cohesive narrative. A rating of 4 indicates somewhat good coherence, with strong alignment between text and images, where the connections are clear and consistent, resulting in a cohesive and well-integrated narrative. A rating of 5 indicates extremely good coherence, with perfect alignment between text and images, where the connections are seamless and fully integrated, resulting in a highly cohesive and effective narrative. \n",

    "helpfulness": "In this task, you need to evaluate the helpfulness aspect of the input interleaved content that consists of both text and multiple images. Helpfulness measures how well the output text and images follow the task instruction and provide complete information to achieve the task. Also, consider whether the outputs follow a reasonable logic flow. Provide your rating on a scale from 1 to 5 based on the criteria below: A rating of 1 indicates the helpfulness is extremely poor, with output text and images being totally not following the task instruction, lacking logical flow, and failing to provide necessary information, resulting in an incomplete and unhelpful content. A rating of 2 indicates somewhat poor helpfulness, with output text and images being partially follow the task instruction, showing inconsistent logical flow, and providing limited information, leading to a content that is somewhat incomplete and not very helpful. A rating of 3 indicates somehow acceptable helpfulness, with output text and images being generally follow the instruction, maintaining a reasonable logical flow, and providing a fair amount of necessary information. A rating of 4 indicates somewhat good helpfulness, with output text and images following the task instruction and a clear logical flow, and providing most of the necessary information, resulting in a content that is mostly complete and helpful. A rating of 5 indicates extremely good helpfulness, with output text and images accurately following the instruction, following a natural and logical flow, and providing all necessary information, resulting in a content that is complete and highly helpful. \n",

}
53
+
54
+
55
# HTTP headers sent with every OpenAI chat-completions request.
headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer " + API_KEY,
}
59
+
60
+ # ---------------------------------
61
+
62
# One running score total per evaluation aspect, in rubric order,
# plus bookkeeping for the instances rated so far.
aspect_list = list(eval_instructions)
score_collection = {aspect: 0 for aspect in aspect_list}

cnt = 0               # number of instances evaluated
rated_instances = []  # instances annotated with their eval scores
71
+
72
def find_first_occurrence(s):
    """Locate the first rating digit ('1'-'5') in *s*.

    Returns an (index, character) tuple, or (-1, None) when no rating
    digit is present.
    """
    hits = ((pos, ch) for pos, ch in enumerate(s) if ch in '12345')
    return next(hits, (-1, None))
77
+
78
for instance in tqdm(data):

    # Flags: does this instance have generated text / generated images to judge?
    text_flag = True
    image_flag = True
    instance["eval_scores"] = {}

    valid_aspects = copy.deepcopy(aspect_list)
    removed_aspects = []

    # process the context and prediction from the model output file
    input_text = instance["input_text"]  # the input instruction and (optionally) the context
    num_input_images = input_text.count("<image>")

    # The flat image list holds the input images first, then the generated ones.
    images = instance["image"]
    input_images = images[:num_input_images]
    output_images = images[num_input_images:]
    if len(output_images) == 0:
        image_flag = False

    # the prediction text. note that it must contain <image> special tokens and
    # the amount needs to be the same with generated images
    output_text = instance["output_text"].replace("\n### Assistant:", "")
    output_text = output_text.strip()

    if len(output_text) == 0:
        text_flag = False

    # Restrict the aspects to those the available output modalities support.
    if image_flag is False and text_flag is False:
        removed_aspects = valid_aspects
        valid_aspects = []
    elif image_flag is False and text_flag is True:
        valid_aspects = ["text_quality", "helpfulness"]
        removed_aspects = ["perceptual_quality", "image_coherence", "text_image_coherence"]
    elif image_flag is True and text_flag is False:
        valid_aspects = ["perceptual_quality", "image_coherence", "helpfulness"]
        removed_aspects = ["text_quality", "text_image_coherence"]

    # Aspects that cannot be judged score 0 (this is also what the removed
    # `score_collection[rm_aspect] += 0` no-op effectively did).
    for rm_aspect in removed_aspects:
        instance["eval_scores"][rm_aspect] = [0, "None. Empty output."]

    # Base64-encode each image as a data-URL content entry for the API.
    input_image_content = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image(img_p)}"},
        }
        for img_p in input_images
    ]
    output_image_content = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image(img_p)}"},
        }
        for img_p in output_images
    ]

    for valid_asp in valid_aspects:
        # Assemble the user-message content (text parts + image parts) for
        # this aspect. Prompt wording is kept exactly as in the original.
        eval_content_list = []
        if valid_asp == "text_quality":
            content_input = f"Here is the task instruction and the input text (which you should NOT consider their quality in your evaluation): <INPUT> {input_text} <\INPUT>\n. Based on the instruction and the input, you need to evaluate the text quality of the: <OUTPUT> {output_text} <\OUTPUT>. Please make sure that your only loot at the text inside <OUTPUT> and <\OUTPUT>. You should only predict the numerical score. Also give a short explanation after your rating. Now give your numerical rating and evaluation: "
            eval_content_list = [{"type": "text", "text": content_input}]
        elif valid_asp == "perceptual_quality":
            content_input = f"Here are the images you need to evaluate. You should only predict the numerical score. Also give a short explanation after your rating. Now give your numerical rating and evaluation: "
            eval_content_list = [{"type": "text", "text": content_input}] + output_image_content
        elif valid_asp == "image_coherence":
            content_input = f"Here is the task instruction and the input text and images (which you should NOT consider their quality in your evaluation): <INPUT> {input_text} <\INPUT>\n."
            content_output = f"Based on the instruction and the input images, you need to evaluate the image coherence of the output images in the following. Please make sure that your only loot at the output images to rate whether they are coherent with input images without duplication. You should only predict the numerical score. Also give a short explanation after your rating. Now give your numerical rating and evaluation: "
            eval_content_list = (
                [{"type": "text", "text": content_input}]
                + input_image_content
                + [{"type": "text", "text": content_output}]
                + output_image_content
            )
        elif valid_asp == "text_image_coherence":
            content_input = f"Here is the task instruction and the input text and images (which you should NOT consider their quality in your evaluation): <INPUT> {input_text} <\INPUT>\n."
            content_output = f"Based on the instruction, you need to evaluate the coherence following output text {output_text} and images in a pairwise manner. Please make sure that your only loot at the output text and images to rate whether they are coherent. You should only predict the numerical score. Also give a short explanation after your rating. Now give your numerical rating and evaluation: "
            eval_content_list = (
                [{"type": "text", "text": content_input}]
                + [{"type": "text", "text": content_output}]
                + output_image_content
            )
        elif valid_asp == "helpfulness":
            content_input = f"Here is the task instruction and the input text and images (which you should NOT consider their quality in your evaluation): <INPUT> {input_text} <\INPUT>\n."
            content_output = f"Based on the instruction and the input, you need to evaluate the helpfulness of the following text {output_text} and images. You should only predict the numerical score. Also give a short explanation after your rating. Now give your numerical rating and evaluation: "
            eval_content_list = (
                [{"type": "text", "text": content_input}]
                + input_image_content
                + [{"type": "text", "text": content_output}]
                + output_image_content
            )

        eval_instruct = eval_instructions[valid_asp]
        try:
            payload = {
                "model": MODEL_VERSION,
                "messages": [
                    {
                        "role": "system",
                        "content": eval_instruct
                    },
                    {
                        "role": "user",
                        "content": eval_content_list
                    }
                ],
                "max_tokens": 300
            }

            response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
            raw_eval = response.json()["choices"][0]["message"]["content"]

            # BUG FIX: the original converted the matched character with
            # float() BEFORE checking for failure, so when no digit was found
            # float(None) raised TypeError (swallowed by the broad except
            # below) and the `== -1` branch was unreachable. Check the index
            # first, then convert.
            index, eval_char = find_first_occurrence(raw_eval)
            if index == -1:
                print("cannot detect score...")
                continue
            eval_score = float(eval_char)

            instance["eval_scores"][valid_asp] = [eval_score, raw_eval]
            score_collection[valid_asp] += eval_score

        except Exception as e:
            print(f"An error occurred: {e}")

    rated_instances.append(instance)
    cnt += 1
229
+
230
+
231
# Persist the per-instance scores and GPT explanations to the output file.
with open(output_path, 'w') as out_fp:
    json.dump(rated_instances, out_fp, indent=4)
233
+
234
+
235
# Report the mean score per aspect over all evaluated instances.
# Guard against an empty input file (cnt == 0), which previously raised
# ZeroDivisionError.
for eval_aspect, score in score_collection.items():
    avg_score = round(score / cnt, 2) if cnt else 0.0
    print(f"{eval_aspect}: ", avg_score)
239
+