suhyun.kang commited on
Commit
cc5a628
2 Parent(s): a19f11e 486e533

Merge branch 'main' of https://github.com/Y-IAB/arena into 1-elo

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. README.md +4 -2
  3. app.py +52 -72
  4. requirments.txt +22 -41
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  venv
2
  *.log
 
 
1
  venv
2
  *.log
3
+ __pycache__
README.md CHANGED
@@ -19,7 +19,9 @@
19
  Set your OpenAI API key as an environment variable and start the application:
20
 
21
  ```shell
22
- GCP_PROJECT_ID=<your project id> OPENAI_API_KEY=<your key> python3 app.py
23
  ```
24
 
25
- Replace <your project id> and <your key> with your GCP project ID and OpenAI API key respectively.
 
 
 
19
  Set your OpenAI API key as an environment variable and start the application:
20
 
21
  ```shell
22
+ OPENAI_API_KEY=<your key> python3 app.py
23
  ```
24
 
25
+ Replace `<your key>` with your OpenAI API key.
26
+
27
+ > To run the app with [auto-reloading](https://www.gradio.app/guides/developing-faster-with-reload-mode), use `gradio app.py --demo-name app` instead of `python3 app.py`.
app.py CHANGED
@@ -3,23 +3,24 @@ It provides a platform for comparing the responses of two LLMs.
3
  """
4
 
5
  import enum
6
- import json
7
  from random import sample
8
  from uuid import uuid4
9
 
10
- from fastchat.serve import gradio_web_server
11
- from fastchat.serve.gradio_web_server import bot_response
12
  import firebase_admin
13
  from firebase_admin import firestore
14
  import gradio as gr
 
15
 
16
  from leaderboard import build_leaderboard
17
 
 
18
  db_app = firebase_admin.initialize_app()
19
  db = firestore.client()
20
 
21
  # TODO(#1): Add more models.
22
- SUPPORTED_MODELS = ["gpt-4", "gpt-4-turbo", "gpt-3.5-turbo", "gemini-pro"]
 
 
23
 
24
  # TODO(#4): Add more languages.
25
  SUPPORTED_TRANSLATION_LANGUAGES = ["Korean", "English"]
@@ -36,23 +37,20 @@ class VoteOptions(enum.Enum):
36
  TIE = "Tie"
37
 
38
 
39
- def vote(state_a, state_b, vote_button, res_type, source_lang, target_lang):
 
40
  doc_id = uuid4().hex
41
  winner = VoteOptions(vote_button).name.lower()
42
 
43
- # The 'messages' field in the state is an array of arrays, which is
44
- # not supported by Firestore. Therefore, we convert it to a JSON string.
45
- model_a_conv = json.dumps(state_a.dict())
46
- model_b_conv = json.dumps(state_b.dict())
47
-
48
  if res_type == ResponseType.SUMMARIZE.value:
49
  doc_ref = db.collection("arena-summarizations").document(doc_id)
50
  doc_ref.set({
51
  "id": doc_id,
52
- "model_a": state_a.model_name,
53
- "model_b": state_b.model_name,
54
- "model_a_conv": model_a_conv,
55
- "model_b_conv": model_b_conv,
 
56
  "winner": winner,
57
  "timestamp": firestore.SERVER_TIMESTAMP
58
  })
@@ -62,10 +60,11 @@ def vote(state_a, state_b, vote_button, res_type, source_lang, target_lang):
62
  doc_ref = db.collection("arena-translations").document(doc_id)
63
  doc_ref.set({
64
  "id": doc_id,
65
- "model_a": state_a.model_name,
66
- "model_b": state_b.model_name,
67
- "model_a_conv": model_a_conv,
68
- "model_b_conv": model_b_conv,
 
69
  "source_language": source_lang.lower(),
70
  "target_language": target_lang.lower(),
71
  "winner": winner,
@@ -73,42 +72,38 @@ def vote(state_a, state_b, vote_button, res_type, source_lang, target_lang):
73
  })
74
 
75
 
76
- def user(user_prompt):
77
- model_pair = sample(SUPPORTED_MODELS, 2)
78
- new_state_a = gradio_web_server.State(model_pair[0])
79
- new_state_b = gradio_web_server.State(model_pair[1])
80
-
81
- for state in [new_state_a, new_state_b]:
82
- state.conv.append_message(state.conv.roles[0], user_prompt)
83
- state.conv.append_message(state.conv.roles[1], None)
84
- state.skip_next = False
85
 
86
- return [
87
- new_state_a, new_state_b, new_state_a.model_name, new_state_b.model_name
88
- ]
89
 
90
 
91
- def bot(state_a, state_b, request: gr.Request):
92
- new_states = [state_a, state_b]
93
 
94
  generators = []
95
- for state in new_states:
96
  try:
97
  # TODO(#1): Allow user to set configuration.
98
- # bot_response returns a generator yielding states.
99
- generator = bot_response(state,
100
- temperature=0.9,
101
- top_p=0.9,
102
- max_new_tokens=100,
103
- request=request)
104
- generators.append(generator)
105
 
106
  # TODO(#1): Narrow down the exception type.
107
  except Exception as e: # pylint: disable=broad-except
108
  print(f"Error in bot_response: {e}")
109
  raise e
110
 
111
- new_responses = [None, None]
112
 
113
  # It simulates concurrent response generation from two models.
114
  while True:
@@ -118,19 +113,14 @@ def bot(state_a, state_b, request: gr.Request):
118
  try:
119
  yielded = next(generators[i])
120
 
121
- # The generator yields a tuple, with the new state as the first item.
122
- new_state = yielded[0]
123
- new_states[i] = new_state
124
-
125
- # The last item from 'messages' represents the response to the prompt.
126
- bot_message = new_state.conv.messages[-1]
127
-
128
- # Each message in conv.messages is structured as [role, message],
129
- # so we extract the last message component.
130
- new_responses[i] = bot_message[-1]
131
 
 
132
  stop = False
133
 
 
 
134
  except StopIteration:
135
  pass
136
 
@@ -139,8 +129,6 @@ def bot(state_a, state_b, request: gr.Request):
139
  print(f"Error in generator: {e}")
140
  raise e
141
 
142
- yield new_states + new_responses
143
-
144
  if stop:
145
  break
146
 
@@ -176,36 +164,22 @@ with gr.Blocks() as app:
176
  [source_language, target_language])
177
 
178
  model_names = [gr.State(None), gr.State(None)]
179
- responses = [gr.State(None), gr.State(None)]
180
-
181
- # states stores FastChat-specific conversation states.
182
- states = [gr.State(None), gr.State(None)]
183
 
184
  prompt = gr.TextArea(label="Prompt", lines=4)
185
  submit = gr.Button()
186
 
187
  with gr.Row():
188
- responses[0] = gr.Textbox(label="Model A", interactive=False)
189
- responses[1] = gr.Textbox(label="Model B", interactive=False)
190
 
191
  # TODO(#5): Display it only after the user submits the prompt.
192
  # TODO(#6): Block voting if the response_type is not set.
193
  # TODO(#6): Block voting if the user already voted.
194
  with gr.Row():
195
  option_a = gr.Button(VoteOptions.MODEL_A.value)
196
- option_a.click(
197
- vote, states +
198
- [option_a, response_type_radio, source_language, target_language])
199
-
200
  option_b = gr.Button("Model B is better")
201
- option_b.click(
202
- vote, states +
203
- [option_b, response_type_radio, source_language, target_language])
204
-
205
  tie = gr.Button("Tie")
206
- tie.click(
207
- vote,
208
- states + [tie, response_type_radio, source_language, target_language])
209
 
210
  # TODO(#7): Hide it until the user votes.
211
  with gr.Accordion("Show models", open=False):
@@ -213,8 +187,14 @@ with gr.Blocks() as app:
213
  model_names[0] = gr.Textbox(label="Model A", interactive=False)
214
  model_names[1] = gr.Textbox(label="Model B", interactive=False)
215
 
216
- submit.click(user, prompt, states + model_names,
217
- queue=False).then(bot, states, states + responses)
 
 
 
 
 
 
218
 
219
  build_leaderboard(db)
220
 
 
3
  """
4
 
5
  import enum
 
6
  from random import sample
7
  from uuid import uuid4
8
 
 
 
9
  import firebase_admin
10
  from firebase_admin import firestore
11
  import gradio as gr
12
+ from litellm import completion
13
 
14
  from leaderboard import build_leaderboard
15
 
16
+ # TODO(#21): Fix auto-reload issue related to the initialization of Firebase.
17
  db_app = firebase_admin.initialize_app()
18
  db = firestore.client()
19
 
20
  # TODO(#1): Add more models.
21
+ SUPPORTED_MODELS = [
22
+ "gpt-4", "gpt-4-0125-preview", "gpt-3.5-turbo", "gemini-pro"
23
+ ]
24
 
25
  # TODO(#4): Add more languages.
26
  SUPPORTED_TRANSLATION_LANGUAGES = ["Korean", "English"]
 
37
  TIE = "Tie"
38
 
39
 
40
+ def vote(vote_button, response_a, response_b, model_a_name, model_b_name,
41
+ user_prompt, res_type, source_lang, target_lang):
42
  doc_id = uuid4().hex
43
  winner = VoteOptions(vote_button).name.lower()
44
 
 
 
 
 
 
45
  if res_type == ResponseType.SUMMARIZE.value:
46
  doc_ref = db.collection("arena-summarizations").document(doc_id)
47
  doc_ref.set({
48
  "id": doc_id,
49
+ "prompt": user_prompt,
50
+ "model_a": model_a_name,
51
+ "model_b": model_b_name,
52
+ "model_a_response": response_a,
53
+ "model_b_response": response_b,
54
  "winner": winner,
55
  "timestamp": firestore.SERVER_TIMESTAMP
56
  })
 
60
  doc_ref = db.collection("arena-translations").document(doc_id)
61
  doc_ref.set({
62
  "id": doc_id,
63
+ "prompt": user_prompt,
64
+ "model_a": model_a_name,
65
+ "model_b": model_b_name,
66
+ "model_a_response": response_a,
67
+ "model_b_response": response_b,
68
  "source_language": source_lang.lower(),
69
  "target_language": target_lang.lower(),
70
  "winner": winner,
 
72
  })
73
 
74
 
75
+ def response_generator(response: str):
76
+ for part in response:
77
+ content = part.choices[0].delta.content
78
+ if content is None:
79
+ continue
 
 
 
 
80
 
81
+ # To simulate a stream, we yield each character of the response.
82
+ for character in content:
83
+ yield character
84
 
85
 
86
+ def get_responses(user_prompt):
87
+ models = sample(SUPPORTED_MODELS, 2)
88
 
89
  generators = []
90
+ for model in models:
91
  try:
92
  # TODO(#1): Allow user to set configuration.
93
+ response = completion(model=model,
94
+ messages=[{
95
+ "content": user_prompt,
96
+ "role": "user"
97
+ }],
98
+ stream=True)
99
+ generators.append(response_generator(response))
100
 
101
  # TODO(#1): Narrow down the exception type.
102
  except Exception as e: # pylint: disable=broad-except
103
  print(f"Error in bot_response: {e}")
104
  raise e
105
 
106
+ responses = ["", ""]
107
 
108
  # It simulates concurrent response generation from two models.
109
  while True:
 
113
  try:
114
  yielded = next(generators[i])
115
 
116
+ if yielded is None:
117
+ continue
 
 
 
 
 
 
 
 
118
 
119
+ responses[i] += yielded
120
  stop = False
121
 
122
+ yield responses + models
123
+
124
  except StopIteration:
125
  pass
126
 
 
129
  print(f"Error in generator: {e}")
130
  raise e
131
 
 
 
132
  if stop:
133
  break
134
 
 
164
  [source_language, target_language])
165
 
166
  model_names = [gr.State(None), gr.State(None)]
167
+ response_boxes = [gr.State(None), gr.State(None)]
 
 
 
168
 
169
  prompt = gr.TextArea(label="Prompt", lines=4)
170
  submit = gr.Button()
171
 
172
  with gr.Row():
173
+ response_boxes[0] = gr.Textbox(label="Model A", interactive=False)
174
+ response_boxes[1] = gr.Textbox(label="Model B", interactive=False)
175
 
176
  # TODO(#5): Display it only after the user submits the prompt.
177
  # TODO(#6): Block voting if the response_type is not set.
178
  # TODO(#6): Block voting if the user already voted.
179
  with gr.Row():
180
  option_a = gr.Button(VoteOptions.MODEL_A.value)
 
 
 
 
181
  option_b = gr.Button("Model B is better")
 
 
 
 
182
  tie = gr.Button("Tie")
 
 
 
183
 
184
  # TODO(#7): Hide it until the user votes.
185
  with gr.Accordion("Show models", open=False):
 
187
  model_names[0] = gr.Textbox(label="Model A", interactive=False)
188
  model_names[1] = gr.Textbox(label="Model B", interactive=False)
189
 
190
+ submit.click(get_responses, prompt, response_boxes + model_names)
191
+
192
+ common_inputs = response_boxes + model_names + [
193
+ prompt, response_type_radio, source_language, target_language
194
+ ]
195
+ option_a.click(vote, [option_a] + common_inputs)
196
+ option_b.click(vote, [option_b] + common_inputs)
197
+ tie.click(vote, [tie] + common_inputs)
198
 
199
  build_leaderboard(db)
200
 
requirments.txt CHANGED
@@ -1,4 +1,3 @@
1
- accelerate==0.26.1
2
  aiofiles==23.2.1
3
  aiohttp==3.9.3
4
  aiosignal==1.3.1
@@ -6,9 +5,9 @@ altair==5.2.0
6
  annotated-types==0.6.0
7
  anyio==4.2.0
8
  attrs==23.2.0
9
- CacheControl==0.13.1
10
  cachetools==5.3.2
11
- certifi==2023.11.17
12
  cffi==1.16.0
13
  charset-normalizer==3.3.2
14
  click==8.1.7
@@ -17,76 +16,67 @@ contourpy==1.2.0
17
  cryptography==42.0.2
18
  cycler==0.12.1
19
  distro==1.9.0
20
- fastapi==0.109.0
21
  ffmpy==0.3.1
22
  filelock==3.13.1
23
  firebase-admin==6.4.0
24
  fonttools==4.47.2
25
  frozenlist==1.4.1
26
- fschat==0.2.35
27
- fsspec==2023.12.2
28
- google-api-core==2.16.1
29
  google-api-python-client==2.116.0
30
  google-auth==2.27.0
31
  google-auth-httplib2==0.2.0
32
- google-cloud-aiplatform==1.40.0
33
- google-cloud-bigquery==3.17.1
34
  google-cloud-core==2.4.1
35
  google-cloud-firestore==2.14.0
36
- google-cloud-resource-manager==1.11.0
37
  google-cloud-storage==2.14.0
38
  google-crc32c==1.5.0
39
  google-resumable-media==2.7.0
40
  googleapis-common-protos==1.62.0
41
- gradio==3.50.2
42
- gradio_client==0.6.1
43
- grpc-google-iam-v1==0.13.0
44
- grpcio==1.60.0
45
- grpcio-status==1.60.0
46
  h11==0.14.0
47
  httpcore==1.0.2
48
  httplib2==0.22.0
49
  httpx==0.26.0
50
  huggingface-hub==0.20.3
51
  idna==3.6
 
52
  importlib-resources==6.1.1
53
  Jinja2==3.1.3
54
  jsonschema==4.21.1
55
  jsonschema-specifications==2023.12.1
56
  kiwisolver==1.4.5
 
57
  markdown-it-py==3.0.0
58
- markdown2==2.4.12
59
- MarkupSafe==2.1.4
60
  matplotlib==3.8.2
61
  mdurl==0.1.2
62
- mpmath==1.3.0
63
  msgpack==1.0.7
64
- multidict==6.0.4
65
- networkx==3.2.1
66
- nh3==0.2.15
67
  numpy==1.26.3
68
- openai==0.28.0
69
- orjson==3.9.12
70
  packaging==23.2
71
  pandas==2.2.0
72
- peft==0.8.1
73
  pillow==10.2.0
74
- prompt-toolkit==3.0.43
75
  proto-plus==1.23.0
76
  protobuf==4.25.2
77
- psutil==5.9.8
78
  pyasn1==0.5.1
79
  pyasn1-modules==0.3.0
80
  pycparser==2.21
81
- pydantic==1.10.14
82
  pydantic_core==2.16.1
83
  pydub==0.25.1
84
  Pygments==2.17.2
85
  PyJWT==2.8.0
86
  pyparsing==3.1.1
87
  python-dateutil==2.8.2
88
- python-multipart==0.0.6
89
- pytz==2023.4
 
90
  PyYAML==6.0.1
91
  referencing==0.33.0
92
  regex==2023.12.25
@@ -94,32 +84,23 @@ requests==2.31.0
94
  rich==13.7.0
95
  rpds-py==0.17.1
96
  rsa==4.9
97
- ruff==0.1.15
98
- safetensors==0.4.2
99
  semantic-version==2.10.0
100
- sentencepiece==0.1.99
101
- shapely==2.0.2
102
  shellingham==1.5.4
103
- shortuuid==1.0.11
104
  six==1.16.0
105
  sniffio==1.3.0
106
- starlette==0.35.1
107
- svgwrite==1.4.3
108
- sympy==1.12
109
  tiktoken==0.5.2
110
  tokenizers==0.15.1
111
  tomlkit==0.12.0
112
  toolz==0.12.1
113
- torch==2.2.0
114
  tqdm==4.66.1
115
- transformers==4.37.2
116
  typer==0.9.0
117
  typing_extensions==4.9.0
118
  tzdata==2023.4
119
  uritemplate==4.1.1
120
  urllib3==2.2.0
121
  uvicorn==0.27.0.post1
122
- wavedrom==2.0.3.post3
123
- wcwidth==0.2.13
124
  websockets==11.0.3
125
  yarl==1.9.4
 
 
 
1
  aiofiles==23.2.1
2
  aiohttp==3.9.3
3
  aiosignal==1.3.1
 
5
  annotated-types==0.6.0
6
  anyio==4.2.0
7
  attrs==23.2.0
8
+ CacheControl==0.14.0
9
  cachetools==5.3.2
10
+ certifi==2024.2.2
11
  cffi==1.16.0
12
  charset-normalizer==3.3.2
13
  click==8.1.7
 
16
  cryptography==42.0.2
17
  cycler==0.12.1
18
  distro==1.9.0
19
+ fastapi==0.109.2
20
  ffmpy==0.3.1
21
  filelock==3.13.1
22
  firebase-admin==6.4.0
23
  fonttools==4.47.2
24
  frozenlist==1.4.1
25
+ fsspec==2024.2.0
26
+ google-api-core==2.16.2
 
27
  google-api-python-client==2.116.0
28
  google-auth==2.27.0
29
  google-auth-httplib2==0.2.0
 
 
30
  google-cloud-core==2.4.1
31
  google-cloud-firestore==2.14.0
 
32
  google-cloud-storage==2.14.0
33
  google-crc32c==1.5.0
34
  google-resumable-media==2.7.0
35
  googleapis-common-protos==1.62.0
36
+ gradio==4.16.0
37
+ gradio_client==0.8.1
38
+ grpcio==1.60.1
39
+ grpcio-status==1.60.1
 
40
  h11==0.14.0
41
  httpcore==1.0.2
42
  httplib2==0.22.0
43
  httpx==0.26.0
44
  huggingface-hub==0.20.3
45
  idna==3.6
46
+ importlib-metadata==7.0.1
47
  importlib-resources==6.1.1
48
  Jinja2==3.1.3
49
  jsonschema==4.21.1
50
  jsonschema-specifications==2023.12.1
51
  kiwisolver==1.4.5
52
+ litellm==1.22.3
53
  markdown-it-py==3.0.0
54
+ MarkupSafe==2.1.5
 
55
  matplotlib==3.8.2
56
  mdurl==0.1.2
 
57
  msgpack==1.0.7
58
+ multidict==6.0.5
 
 
59
  numpy==1.26.3
60
+ openai==1.11.1
61
+ orjson==3.9.13
62
  packaging==23.2
63
  pandas==2.2.0
 
64
  pillow==10.2.0
 
65
  proto-plus==1.23.0
66
  protobuf==4.25.2
 
67
  pyasn1==0.5.1
68
  pyasn1-modules==0.3.0
69
  pycparser==2.21
70
+ pydantic==2.6.0
71
  pydantic_core==2.16.1
72
  pydub==0.25.1
73
  Pygments==2.17.2
74
  PyJWT==2.8.0
75
  pyparsing==3.1.1
76
  python-dateutil==2.8.2
77
+ python-dotenv==1.0.1
78
+ python-multipart==0.0.7
79
+ pytz==2024.1
80
  PyYAML==6.0.1
81
  referencing==0.33.0
82
  regex==2023.12.25
 
84
  rich==13.7.0
85
  rpds-py==0.17.1
86
  rsa==4.9
87
+ ruff==0.2.0
 
88
  semantic-version==2.10.0
 
 
89
  shellingham==1.5.4
 
90
  six==1.16.0
91
  sniffio==1.3.0
92
+ starlette==0.36.3
 
 
93
  tiktoken==0.5.2
94
  tokenizers==0.15.1
95
  tomlkit==0.12.0
96
  toolz==0.12.1
 
97
  tqdm==4.66.1
 
98
  typer==0.9.0
99
  typing_extensions==4.9.0
100
  tzdata==2023.4
101
  uritemplate==4.1.1
102
  urllib3==2.2.0
103
  uvicorn==0.27.0.post1
 
 
104
  websockets==11.0.3
105
  yarl==1.9.4
106
+ zipp==3.17.0