Spaces:

bhavanishankarpullela
/

CoSTA

Running

+from pathlib import Path
+from huggingface_hub import create_repo, Repository
+import tempfile
+import subprocess
+import os
+import shutil
+import logging
+import re
+from urllib.parse import urlparse
+# Module-level logger used by the helpers and the script entry point below.
+logger = logging.getLogger(__name__)
+# Message text compared against str(error) in push_module_to_hub: when the
+# OSError raised while committing/pushing carries exactly this text, the
+# module is treated as already up to date instead of as a failure.
+GIT_UP_TO_DATE = "On branch main\nYour branch is up to date with 'origin/main'.\
+\n\nnothing to commit, working tree clean\n"
+# Literal marker that update_evaluate_dependency replaces with the commit hash
+# inside a module's requirements file.
+COMMIT_PLACEHOLDER = "{COMMIT_PLACEHOLDER}"
+def get_git_tag(lib_path, commit_hash):
+    # check if commit has a tag, see: https://stackoverflow.com/questions/1474115/how-to-find-the-tag-associated-with-a-given-git-commit
+    command = f"git describe --exact-match {commit_hash}"
+    output = subprocess.run(command.split(),
+            stderr=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            encoding="utf-8",
+            cwd=lib_path,
+            env=os.environ.copy(),
+        )
+    tag = output.stdout.strip()
+    if re.match(r"v\d*\.\d*\.\d*", tag) is not None:
+        return tag
+    else:
+        return None
+def copy_recursive(source_base_path, target_base_path):
+    """Copy directory recursively and overwrite existing files."""
+    for item in source_base_path.iterdir():
+        target_path = target_base_path / item.name
+        if item.is_dir():
+            target_path.mkdir(exist_ok=True)
+            copy_recursive(item, target_path)
+        else:
+            shutil.copy(item, target_path)
+def update_evaluate_dependency(requirements_path, commit_hash):
+    """Updates the evaluate requirement with the latest commit."""
+    with open(requirements_path, "r") as f:
+        file_content = f.read()
+    file_content = file_content.replace(COMMIT_PLACEHOLDER, commit_hash)
+    with open(requirements_path, "w") as f:
+        f.write(file_content)
+def push_module_to_hub(module_path, type, token, commit_hash, tag=None):
+    module_name = module_path.stem
+    org = f"evaluate-{type}"
+    repo_url = create_repo(org + "/" + module_name, repo_type="space", space_sdk="gradio", exist_ok=True, token=token)
+    repo_path = Path(tempfile.mkdtemp())
+    scheme = urlparse(repo_url).scheme
+    repo_url = repo_url.replace(f"{scheme}://", f"{scheme}://user:{token}@")
+    clean_repo_url = re.sub(r"(https?)://.*@", r"\1://", repo_url)
+    try:
+        subprocess.run(
+            f"git clone {repo_url}".split(),
+            stderr=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            check=True,
+            encoding="utf-8",
+            cwd=repo_path,
+            env=os.environ.copy(),
+        )
+    except OSError:
+        # make sure we don't accidentally expose token
+        raise OSError(f"Could not clone from '{clean_repo_url}'")
+    repo = Repository(local_dir=repo_path / module_name, use_auth_token=token)
+    copy_recursive(module_path, repo_path / module_name)
+    update_evaluate_dependency(repo_path / module_name / "requirements.txt", commit_hash)
+    repo.git_add()
+    try:
+        repo.git_commit(f"Update Space (evaluate main: {commit_hash[:8]})")
+        repo.git_push()
+        logger.info(f"Module '{module_name}' pushed to the hub")
+    except OSError as error:
+        if str(error) == GIT_UP_TO_DATE:
+            logger.info(f"Module '{module_name}' is already up to date.")
+        else:
+            raise error
+    if tag is not None:
+        repo.add_tag(tag, message="add evaluate tag", remote="origin")
+    shutil.rmtree(repo_path)
+if __name__ == "__main__":
+    evaluation_paths = ["metrics", "comparisons", "measurements"]
+    evaluation_types = ["metric", "comparison", "measurement"]
+    token = os.getenv("HF_TOKEN")
+    evaluate_lib_path = Path(os.getenv("EVALUATE_LIB_PATH"))
+    commit_hash = os.getenv("GIT_HASH")
+    git_tag = get_git_tag(evaluate_lib_path, commit_hash)
+    if git_tag is not None:
+        logger.info(f"Found tag: {git_tag}.")
+    for type, dir in zip(evaluation_types, evaluation_paths):
+        if (evaluate_lib_path/dir).exists():
+            for module_path in (evaluate_lib_path/dir).iterdir():
+                if module_path.is_dir():
+                    logger.info(f"Updating: module {module_path.name}.")
+                    push_module_to_hub(module_path, type, token, commit_hash, tag=git_tag)
+        else:
+            logger.warning(f"No folder {str(evaluate_lib_path/dir)} for {type} found.")

ST/evaluate/.github/hub/requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ huggingface_hub

ST/evaluate/.github/workflows/build_documentation.yml ADDED Viewed

	@@ -0,0 +1,17 @@

+name: Build documentation
+on:
+  push:
+    branches:
+      - main
+      - doc-builder*
+      - v*-release
+jobs:
+   build:
+    uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
+    with:
+      commit_sha: ${{ github.sha }}
+      package: evaluate
+    secrets:
+      token: ${{ secrets.HUGGINGFACE_PUSH }}

ST/evaluate/.github/workflows/build_pr_documentation.yml ADDED Viewed

	@@ -0,0 +1,16 @@

+name: Build PR Documentation
+on:
+  pull_request:
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+jobs:
+  build:
+    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
+    with:
+      commit_sha: ${{ github.event.pull_request.head.sha }}
+      pr_number: ${{ github.event.number }}
+      package: evaluate

ST/evaluate/.github/workflows/ci.yml ADDED Viewed

	@@ -0,0 +1,63 @@

+name: CI
+on:
+  pull_request:
+    branches:
+      - main
+  push:
+    branches:
+      - main
+      - ci-*
+env:
+  HF_ALLOW_CODE_EVAL: 1
+jobs:
+  check_code_quality:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.7"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[quality]
+      - name: Check quality
+        run: |
+          black --check --line-length 119 --target-version py36 tests src metrics comparisons measurements
+          isort --check-only tests src metrics comparisons measurements
+          flake8 tests src metrics
+  test:
+    needs: check_code_quality
+    strategy:
+      matrix:
+        test: ['unit', 'parity']
+        os: [ubuntu-latest, windows-latest]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.7"
+      - name: Upgrade pip
+        run: python -m pip install --upgrade pip
+      - name: Install dependencies
+        run: |
+          pip install .[tests]
+          pip install -r additional-tests-requirements.txt --no-deps
+      - name: Test with pytest
+        if: ${{ matrix.test == 'unit' }}
+        run: |
+          python -m pytest -n 2 --dist loadfile -sv ./tests/ --ignore=./tests/test_trainer_evaluator_parity.py
+      - name: Integration test with transformers
+        if: ${{ matrix.test == 'parity' }}
+        run: |
+          python -m pytest -n 2 --dist loadfile -sv ./tests/test_trainer_evaluator_parity.py

ST/evaluate/.github/workflows/delete_doc_comment.yml ADDED Viewed

	@@ -0,0 +1,13 @@

+name: Delete dev documentation
+on:
+  pull_request:
+    types: [ closed ]
+jobs:
+  delete:
+    uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main
+    with:
+      pr_number: ${{ github.event.number }}
+      package: evaluate

ST/evaluate/.github/workflows/python-release.yml ADDED Viewed

	@@ -0,0 +1,31 @@

+name: Python release
+on:
+  push:
+    tags:
+      - v*
+env:
+  PYPI_TOKEN: ${{ secrets.PYPI_TOKEN_DIST }}
+jobs:
+  python_release:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.9
+    - name: Install dependencies
+      run: |
+        pip install --upgrade pip
+        pip install setuptools wheel
+    - run: python setup.py sdist bdist_wheel
+    - run: |
+        pip install twine
+    - name: Upload to PyPi
+      run: |
+          twine upload dist/* -u __token__ -p "$PYPI_TOKEN"

ST/evaluate/.github/workflows/update_spaces.yml ADDED Viewed

	@@ -0,0 +1,36 @@

+name: Update Hub repositories
+on:
+  push:
+    branches:
+      - main
+jobs:
+  update-hub-repositories:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: "3.7"
+      - name: Set up default Git config
+        run: |
+          git config --global user.name evaluate-bot
+          git config --global user.email leandro@huggingface.co
+      - name: Install dependencies
+        working-directory: ./.github/hub
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+      - name: Update Hub repositories
+        working-directory: ./.github/hub
+        run: |
+          export HF_TOKEN=${{ secrets.HF_HUB_TOKEN }}
+          export EVALUATE_LIB_PATH=$GITHUB_WORKSPACE
+          export GIT_HASH=$GITHUB_SHA
+          export GIT_LFS_SKIP_SMUDGE=1
+          python push_evaluations_to_hub.py

ST/evaluate/.gitignore ADDED Viewed

	@@ -0,0 +1,64 @@

+# Locked files
+*.lock
+!dvc.lock
+# Extracted dummy data
+datasets/**/dummy_data-zip-extracted/
+# Compiled python modules.
+*.pyc
+# Byte-compiled
+__pycache__/
+.cache/
+# Python egg metadata, regenerated from source files by setuptools.
+*.egg-info
+.eggs/
+# PyPI distribution artifacts.
+build/
+dist/
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# pyenv
+.python-version
+# Tests
+.pytest_cache/
+# Other
+*.DS_Store
+# PyCharm/vscode
+.idea
+.vscode
+# keep only the empty datasets and metrics directory with its __init__.py file
+/src/*/datasets/*
+!/src/*/datasets/__init__.py
+/src/*/metrics/*
+!/src/*/metrics/__init__.py
+# Vim
+.*.swp
+# playground
+/playground
+# Sphinx documentation
+docs/_build/
+docs/source/_build/
+# Benchmark results
+report.json
+report.md

ST/evaluate/AUTHORS ADDED Viewed

	@@ -0,0 +1,8 @@

+# This is the list of HuggingFace Datasets authors for copyright purposes.
+#
+# This does not necessarily list everyone who has contributed code, since in
+# some cases, their employer may be the copyright holder.  To see the full list
+# of contributors, see the revision history in source control.
+Google Inc.
+HuggingFace Inc.

ST/evaluate/CODE_OF_CONDUCT.md ADDED Viewed

	@@ -0,0 +1,132 @@

+# Contributor Covenant Code of Conduct
+## Our Pledge
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, caste, color, religion, or sexual identity
+and orientation.
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+## Our Standards
+Examples of behavior that contributes to a positive environment for our
+community include:
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+  and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the
+  overall community
+Examples of unacceptable behavior include:
+* The use of sexualized language or imagery, and sexual attention or
+  advances of any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email
+  address, without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+## Enforcement Responsibilities
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+## Scope
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+## Enforcement
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+feedback@huggingface.co.
+All complaints will be reviewed and investigated promptly and fairly.
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+## Enforcement Guidelines
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+### 1. Correction
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+### 2. Warning
+**Community Impact**: A violation through a single incident or series
+of actions.
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or
+permanent ban.
+### 3. Temporary Ban
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+### 4. Permanent Ban
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior,  harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+**Consequence**: A permanent ban from any sort of public interaction within
+the community.
+## Attribution
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.0, available at
+[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0].
+Community Impact Guidelines were inspired by
+[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
+For answers to common questions about this code of conduct, see the FAQ at
+[https://www.contributor-covenant.org/faq][FAQ]. Translations are available
+at [https://www.contributor-covenant.org/translations][translations].
+[homepage]: https://www.contributor-covenant.org
+[v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html
+[Mozilla CoC]: https://github.com/mozilla/diversity
+[FAQ]: https://www.contributor-covenant.org/faq
+[translations]: https://www.contributor-covenant.org/translations

ST/evaluate/CONTRIBUTING.md ADDED Viewed

	@@ -0,0 +1,277 @@

+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+# How to contribute to Evaluate
+Everyone is welcome to contribute, and we value everybody's contribution. Code
+is not the only way to help the community. Answering questions, helping
+others, reaching out and improving the documentation are immensely valuable to
+the community.
+It also helps us if you spread the word: reference the library from blog posts
+on the awesome projects it made possible, shout out on Twitter every time it has
+helped you, or simply star the repo to say "thank you".
+Whichever way you choose to contribute, please be mindful to respect our
+[code of conduct](https://github.com/huggingface/evaluate/blob/main/CODE_OF_CONDUCT.md).
+## You can contribute in so many ways!
+There are four ways you can contribute to `evaluate`:
+* Fixing outstanding issues with the existing code;
+* Implementing new evaluators and metrics;
+* Contributing to the examples and documentation;
+* Submitting issues related to bugs or desired new features.
+Open issues are tracked directly on the repository [here](https://github.com/huggingface/evaluate/issues).
+If you would like to work on any of the open issues:
+* Make sure it is not already assigned to someone else. The assignee (if any) is on the top right column of the Issue page. If it's not taken, self-assign it.
+* Work on your self-assigned issue and create a Pull Request!
+## Submitting a new issue or feature request
+Following these guidelines when submitting an issue or a feature
+request will make it easier for us to come back to you quickly and with good
+feedback.
+### Do you want to implement a new metric?
+All evaluation modules, be it metrics, comparisons, or measurements live on the 🤗 Hub in a [Space](https://huggingface.co/docs/hub/spaces) (see for example [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)). Evaluation modules can be either **community** or **canonical**.
+* **Canonical** metrics are well-established metrics which are already broadly adopted.
+* **Community** metrics are new or custom metrics. It is simple to add a new community metric to use with `evaluate`. Please see our guide to adding a new evaluation metric [here](https://huggingface.co/docs/evaluate/creating_and_sharing)!
+The only functional difference is that canonical metrics are integrated into the `evaluate` library directly and do not require a namespace when being loaded.
+We encourage contributors to share new evaluation modules they contribute broadly! If they become widely adopted then they will be integrated into the core `evaluate` library as a canonical module.
+### Do you want to request a new feature (that is not a metric)?
+We would appreciate it if your feature request addresses the following points:
+1. Motivation first:
+  * Is it related to a problem/frustration with the library? If so, please explain
+    why. Providing a code snippet that demonstrates the problem is best.
+  * Is it related to something you would need for a project? We'd love to hear
+    about it!
+  * Is it something you worked on and think could benefit the community?
+    Awesome! Tell us what problem it solved for you.
+2. Write a *full paragraph* describing the feature;
+3. Provide a **code snippet** that demonstrates its future use;
+4. In case this is related to a paper, please attach a link;
+5. Attach any additional information (drawings, screenshots, etc.) you think may help.
+### Did you find a bug?
+Thank you for reporting an issue. If the bug is related to a community metric, please open an issue or pull request directly on the repository of the metric on the Hugging Face Hub.
+If the bug is related to the `evaluate` library and not a community metric, we would really appreciate it if you could **make sure the bug was not already reported** (use the search bar on Github under Issues). If it's not already logged, please open an issue with these details:
+* Include your **OS type and version**, the versions of **Python**, **PyTorch** and
+  **Tensorflow** when applicable;
+* A short, self-contained, code snippet that allows us to reproduce the bug in
+  less than 30s;
+* Provide the *full* traceback if an exception is raised.
+## Start contributing! (Pull Requests)
+Before writing code, we strongly advise you to search through the existing PRs or
+issues to make sure that nobody is already working on the same thing. If you are
+unsure, it is always a good idea to open an issue to get some feedback.
+1. Fork the [repository](https://github.com/huggingface/evaluate) by
+   clicking on the 'Fork' button on the repository's page. This creates a copy of the code
+   under your GitHub user account.
+2. Clone your fork to your local disk, and add the base repository as a remote:
+   ```bash
+   $ git clone git@github.com:<your Github handle>/evaluate.git
+   $ cd evaluate
+   $ git remote add upstream https://github.com/huggingface/evaluate.git
+   ```
+3. Create a new branch to hold your development changes:
+   ```bash
+   $ git checkout -b a-descriptive-name-for-my-changes
+   ```
+   **Do not** work on the `main` branch.
+4. Set up a development environment by running the following command in a virtual environment:
+   ```bash
+   $ pip install -e ".[dev]"
+   ```
+5. Develop the features on your branch.
+   As you work on the features, you should make sure that the test suite
+   passes. You should run the tests impacted by your changes like this:
+   ```bash
+   $ pytest tests/<TEST_TO_RUN>.py
+   ```
+   To run a specific test, for example the `test_model_init` test in test_evaluator.py,
+   ```bash
+   python -m pytest ./tests/test_evaluator.py::TestQuestionAnsweringEvaluator::test_model_init
+   ```
+   You can also run the full suite with the following command:
+   ```bash
+   $ python -m pytest ./tests/
+   ```
+   🤗 Evaluate relies on `black` and `isort` to format its source code
+   consistently. After you make changes, apply automatic style corrections and code verifications
+   that can't be automated in one go with:
+   ```bash
+   $ make fixup
+   ```
+   This target is also optimized to only work with files modified by the PR you're working on.
+   If you prefer to run the checks one after the other, the following command applies the
+   style corrections:
+   ```bash
+   $ make style
+   ```
+   🤗 Evaluate also uses `flake8` and a few custom scripts to check for coding mistakes. Quality
+   control runs in CI, however you can also run the same checks with:
+   ```bash
+   $ make quality
+   ```
+   If you're modifying documents under `docs/source`, make sure to validate that
+   they can still be built. This check also runs in CI. To run a local check
+   make sure you have installed the documentation builder requirements. First you will need to install
+   our tool for building the documentation:
+   ```bash
+   $ pip install git+https://github.com/huggingface/doc-builder
+   ```
+   Then, make sure you have all the dependencies to be able to build the doc with:
+   ```bash
+   $ pip install ".[docs]"
+   ```
+   Finally, run the following command from the root of the repository:
+   ```bash
+   $ doc-builder build evaluate docs/source/ --build_dir ~/tmp/test-build
+   ```
+   This will build the documentation in the `~/tmp/test-build` folder where you can inspect the generated
+   Markdown files with your favorite editor. You won't be able to see the final rendering on the website
+   before your PR is merged, we are actively working on adding a tool for this.
+   Once you're happy with your changes, add changed files using `git add` and
+   make a commit with `git commit` to record your changes locally:
+   ```bash
+   $ git add modified_file.py
+   $ git commit
+   ```
+   Please write [good commit
+   messages](https://chris.beams.io/posts/git-commit/).
+   It is a good idea to sync your copy of the code with the original
+   repository regularly. This way you can quickly account for changes:
+   ```bash
+   $ git fetch upstream
+   $ git rebase upstream/main
+   ```
+   Push the changes to your account using:
+   ```bash
+   $ git push -u origin a-descriptive-name-for-my-changes
+   ```
+6. Once you are satisfied, go to the webpage of your fork on GitHub. Click on 'Pull request' to send your changes
+   to the project maintainers for review.
+7. It's ok if maintainers ask you for changes. It happens to core contributors
+   too! So everyone can see the changes in the Pull request, work in your local
+   branch and push the changes to your fork. They will automatically appear in
+   the pull request.
+### Checklist
+1. The title of your pull request should be a summary of its contribution;
+2. If your pull request addresses an issue, please mention the issue number in
+   the pull request description to make sure they are linked (and people
+   consulting the issue know you are working on it);
+3. To indicate a work in progress please prefix the title with `[WIP]`. These
+   are useful to avoid duplicated work, and to differentiate it from PRs ready
+   to be merged;
+4. Make sure existing tests pass;
+5. Add high-coverage tests. No quality testing = no merge.
+6. All public methods must have informative docstrings that work nicely with sphinx.
+7. Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos and other non-text files. We prefer to leverage a hf.co hosted `dataset` like
+   the ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) in which to place these files and reference
+   them by URL.
+### Style guide
+For documentation strings, 🤗 Evaluate follows the [google style](https://google.github.io/styleguide/pyguide.html).
+Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification)
+for more information.
+**This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**
+### Develop on Windows
+On Windows, you need to configure git to transform Windows `CRLF` line endings to Linux `LF` line endings:
+`git config core.autocrlf input`
+One way to run the make command on Windows is to use MSYS2:
+1. [Download MSYS2](https://www.msys2.org/); we assume it is installed in C:\msys64
+2. Open the command line C:\msys64\msys2.exe (it should be available from the start menu)
+3. Run in the shell: `pacman -Syu` and install make with `pacman -S make`
+4. Add `C:\msys64\usr\bin` to your PATH environment variable.
+You can now use `make` from any terminal (Powershell, cmd.exe, etc) 🎉
+### Syncing forked main with upstream (HuggingFace) main
+To avoid pinging the upstream repository which adds reference notes to each upstream PR and sends unnecessary notifications to the developers involved in these PRs,
+when syncing the main branch of a forked repository, please, follow these steps:
+1. When possible, avoid syncing with the upstream using a branch and PR on the forked repository. Instead, merge directly into the forked main.
+2. If a PR is absolutely necessary, use the following steps after checking out your branch:
+```
+$ git checkout -b your-branch-for-syncing
+$ git pull --squash --no-commit upstream main
+$ git commit -m '<your message without GitHub references>'
+$ git push --set-upstream origin your-branch-for-syncing
+```

ST/evaluate/LICENSE ADDED Viewed

	@@ -0,0 +1,202 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

ST/evaluate/Makefile ADDED Viewed

	@@ -0,0 +1,19 @@

+.PHONY: quality style test
+# Check that source code meets quality standards
+# (black and isort run in check-only mode here, so no files are rewritten).
+# NOTE(review): black covers `comparisons` and `measurements`, but isort skips `comparisons` and
+# flake8 skips both `comparisons` and `measurements` -- confirm this is intentional.
+quality:
+	black --check --line-length 119 --target-version py36 tests src metrics comparisons measurements
+	isort --check-only tests src metrics measurements
+	flake8 tests src metrics
+# Format source code automatically
+# (same black/isort directory lists as `quality`, but applied in place).
+style:
+	black --line-length 119 --target-version py36 tests src metrics comparisons measurements
+	isort tests src metrics measurements
+# Run tests for the library
+# (everything under ./tests/ through pytest; `-n auto --dist=loadfile` are pytest-xdist options).
+test:
+	python -m pytest -n auto --dist=loadfile -s -v ./tests/

ST/evaluate/README.md ADDED Viewed

	@@ -0,0 +1,78 @@

+<p align="center">
+    <br>
+    <img src="https://huggingface.co/datasets/evaluate/media/resolve/main/evaluate-banner.png" width="400"/>
+    <br>
+</p>
+<p align="center">
+    <a href="https://github.com/huggingface/evaluate/actions/workflows/ci.yml?query=branch%3Amain">
+        <img alt="Build" src="https://github.com/huggingface/evaluate/actions/workflows/ci.yml/badge.svg?branch=main">
+    </a>
+    <a href="https://github.com/huggingface/evaluate/blob/master/LICENSE">
+        <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/evaluate.svg?color=blue">
+    </a>
+    <a href="https://huggingface.co/docs/evaluate/index">
+        <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/evaluate/index.svg?down_color=red&down_message=offline&up_message=online">
+    </a>
+    <a href="https://github.com/huggingface/evaluate/releases">
+        <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/evaluate.svg">
+    </a>
+    <a href="CODE_OF_CONDUCT.md">
+        <img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg">
+    </a>
+</p>
+🤗 Evaluate is a library that makes evaluating and comparing models and reporting their performance easier and more standardized.
+It currently contains:
+- **implementations of dozens of popular metrics**: the existing metrics cover a variety of tasks spanning from NLP to Computer Vision, and include dataset-specific metrics for datasets. With a simple command like `accuracy = load("accuracy")`, get any of these metrics ready to use for evaluating a ML model in any framework (Numpy/Pandas/PyTorch/TensorFlow/JAX).
+- **comparisons and measurements**: comparisons are used to measure the difference between models and measurements are tools to evaluate datasets.
+- **an easy way of adding new evaluation modules to the 🤗 Hub**: you can create new evaluation modules and push them to a dedicated Space in the 🤗 Hub with `evaluate-cli create [metric name]`, which allows you to easily compare different metrics and their outputs for the same sets of references and predictions.
+[🎓 **Documentation**](https://huggingface.co/docs/evaluate/)
+🔎 **Find a [metric](https://huggingface.co/evaluate-metric), [comparison](https://huggingface.co/evaluate-comparison), [measurement](https://huggingface.co/evaluate-measurement) on the Hub**
+[🌟 **Add a new evaluation module**](https://huggingface.co/docs/evaluate/)
+🤗 Evaluate also has lots of useful features like:
+- **Type checking**: the input types are checked to make sure that you are using the right input formats for each metric
+- **Metric cards**: each metric comes with a card that describes its values, limitations and ranges, as well as providing examples of its usage and usefulness.
+- **Community metrics:** Metrics live on the Hugging Face Hub and you can easily add your own metrics for your project or to collaborate with others.
+# Installation
+## With pip
+🤗 Evaluate can be installed from PyPi and has to be installed in a virtual environment (venv or conda for instance)
+```bash
+pip install evaluate
+```
+# Usage
+🤗 Evaluate's main methods are:
+- `evaluate.list_evaluation_modules()` to list the available metrics, comparisons and measurements
+- `evaluate.load(module_name, **kwargs)` to instantiate an evaluation module
+- `results = module.compute(**kwargs)` to compute the result of an evaluation module
+# Adding a new evaluation module
+First install the necessary dependencies to create a new metric with the following command:
+```bash
+pip install evaluate[template]
+```
+Then you can get started with the following command which will create a new folder for your metric and display the necessary steps:
+```bash
+evaluate-cli create "Awesome Metric"
+```
+See this [step-by-step guide](https://huggingface.co/docs/evaluate/creating_and_sharing) in the documentation for detailed instructions.
+## Credits
+Thanks to [@marella](https://github.com/marella) for letting us use the `evaluate` namespace on PyPi previously used by his [library](https://github.com/marella/evaluate).

ST/evaluate/additional-tests-requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+unbabel-comet>=1.0.0;python_version>'3.6'
+git+https://github.com/google-research/bleurt.git
+git+https://github.com/ns-moosavi/coval.git
+git+https://github.com/hendrycks/math.git
+git+https://github.com/google-research/rl-reliability-metrics
+gin-config

ST/evaluate/comparisons/exact_match/README.md ADDED Viewed

	@@ -0,0 +1,61 @@

+---
+title: Exact Match
+emoji: 🤗
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: 3.0.2
+app_file: app.py
+pinned: false
+tags:
+- evaluate
+- comparison
+description: >-
+  Returns the rate at which the predictions of one model exactly match those of another model.
+---
+# Comparison Card for Exact Match
+## Comparison description
+ Given two model predictions the exact match score is 1 if they are the exact same, and is 0 otherwise. The overall exact match score is the average.
+- **Example 1**: The exact match score is 1.0 if prediction 1 is [0, 1], given prediction 2 is [0, 1].
+- **Example 2**: The exact match score is 0.0 if prediction 1 is [0, 1], given prediction 2 is [1, 0].
+- **Example 3**: The exact match score is 0.5 if prediction 1 is [0, 1], given prediction 2 is [1, 1].
+## How to use
+At minimum, this comparison takes as input two lists of predictions, `predictions1` and `predictions2`:
+```python
+>>> exact_match = evaluate.load("exact_match", module_type="comparison")
+>>> results = exact_match.compute(predictions1=[0, 1, 1], predictions2=[1, 1, 1])
+>>> print(results)
+{'exact_match': 0.66}
+```
+## Output values
+Returns a float between 0.0 and 1.0 inclusive.
+## Examples
+```python
+>>> exact_match = evaluate.load("exact_match", module_type="comparison")
+>>> results = exact_match.compute(predictions1=[0, 0, 0], predictions2=[1, 1, 1])
+>>> print(results)
+{'exact_match': 0.0}
+```
+```python
+>>> exact_match = evaluate.load("exact_match", module_type="comparison")
+>>> results = exact_match.compute(predictions1=[0, 1, 1], predictions2=[1, 1, 1])
+>>> print(results)
+{'exact_match': 0.66}
+```
+## Limitations and bias
+## Citations

ST/evaluate/comparisons/exact_match/app.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Entry script for the Exact Match comparison demo (the module README names app.py as `app_file`).
+import evaluate
+from evaluate.utils import launch_gradio_widget
+# Load the "exact_match" module explicitly as a comparison.
+module = evaluate.load("exact_match", module_type="comparison")
+# Hand the loaded module to the widget launcher (presumably starts the Gradio UI -- see evaluate.utils).
+launch_gradio_widget(module)

ST/evaluate/comparisons/exact_match/exact_match.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# Copyright 2022 The HuggingFace Evaluate Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Exact match test for model comparison."""
+import datasets
+import numpy as np
+import evaluate
+# One-sentence summary of the comparison; passed to `add_start_docstrings` and to
+# `ComparisonInfo(description=...)` by the ExactMatch class in this file.
+_DESCRIPTION = """
+Returns the rate at which the predictions of one model exactly match those of another model.
+"""
+# Argument/return documentation with a usage example; passed to `add_start_docstrings` and to
+# `ComparisonInfo(inputs_description=...)`.
+_KWARGS_DESCRIPTION = """
+Args:
+    predictions1 (`list` of `int`): Predicted labels for model 1.
+    predictions2 (`list` of `int`): Predicted labels for model 2.
+Returns:
+    exact_match (`float`): Dictionary containing exact_match rate. Possible values are between 0.0 and 1.0, inclusive.
+Examples:
+    >>> exact_match = evaluate.load("exact_match", module_type="comparison")
+    >>> results = exact_match.compute(predictions1=[1, 1, 1], predictions2=[1, 1, 1])
+    >>> print(results)
+    {'exact_match': 1.0}
+"""
+# No citation is given for this comparison: the string contains only a newline.
+_CITATION = """
+"""
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class ExactMatch(evaluate.Comparison):
+    def _info(self):
+        return evaluate.ComparisonInfo(
+            module_type="comparison",
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "predictions1": datasets.Value("int64"),
+                    "predictions2": datasets.Value("int64"),
+                }
+            ),
+        )
+    def _compute(self, predictions1, predictions2):
+        score_list = [p1 == p2 for p1, p2 in zip(predictions1, predictions2)]
+        return {"exact_match": np.mean(score_list)}

ST/evaluate/comparisons/exact_match/requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2	+ scipy

ST/evaluate/comparisons/mcnemar/README.md ADDED Viewed

	@@ -0,0 +1,86 @@

+---
+title: McNemar
+emoji: 🤗
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: 3.0.2
+app_file: app.py
+pinned: false
+tags:
+- evaluate
+- comparison
+description: >-
+  McNemar's test is a diagnostic test over a contingency table resulting from the predictions of two classifiers. The test compares the sensitivity and specificity of the diagnostic tests on the same group reference labels. It can be computed with:
+  McNemar = (SE - SP)**2 / (SE + SP)
+  Where:
+  SE: Sensitivity (Test 1 positive; Test 2 negative)
+  SP: Specificity (Test 1 negative; Test 2 positive)
+---
+# Comparison Card for McNemar
+## Comparison description
+McNemar's test is a non-parametric diagnostic test over a contingency table resulting from the predictions of two classifiers. The test compares the sensitivity and specificity of the diagnostic tests on the same group reference labels. It can be computed with:
+McNemar = (SE - SP)**2 / (SE + SP)
+Where:
+* SE: Sensitivity (Test 1 positive; Test 2 negative)
+* SP: Specificity (Test 1 negative; Test 2 positive)
+In other words, SE and SP are the off-diagonal elements of the contingency table for the classifier predictions (`predictions1` and `predictions2`) with respect to the ground truth `references`.
+## How to use
+The McNemar comparison calculates the proportions of responses that exhibit disagreement between two classifiers. It is used to analyze paired nominal data.
+## Inputs
+Its arguments are:
+`predictions1`: a list of predictions from the first model.
+`predictions2`: a list of predictions from the second model.
+`references`: a list of the ground truth reference labels.
+## Output values
+The McNemar comparison outputs two things:
+`stat`: The McNemar statistic.
+`p`: The p value.
+## Examples
+Example comparison:
+```python
+mcnemar = evaluate.load("mcnemar")
+results = mcnemar.compute(references=[1, 0, 1], predictions1=[1, 1, 1], predictions2=[1, 0, 1])
+print(results)
+{'stat': 1.0, 'p': 0.31731050786291115}
+```
+## Limitations and bias
+The McNemar test is a non-parametric test, so it has relatively few assumptions (basically only that the observations are independent). It should be used to analyze paired nominal data only.
+## Citations
+```bibtex
+@article{mcnemar1947note,
+  title={Note on the sampling error of the difference between correlated proportions or percentages},
+  author={McNemar, Quinn},
+  journal={Psychometrika},
+  volume={12},
+  number={2},
+  pages={153--157},
+  year={1947},
+  publisher={Springer-Verlag}
+}
+```

ST/evaluate/comparisons/mcnemar/app.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Entry script for the McNemar comparison demo (the module README names app.py as `app_file`).
+import evaluate
+from evaluate.utils import launch_gradio_widget
+# Load the "mcnemar" module explicitly as a comparison.
+module = evaluate.load("mcnemar", module_type="comparison")
+# Hand the loaded module to the widget launcher (presumably starts the Gradio UI -- see evaluate.utils).
+launch_gradio_widget(module)

ST/evaluate/comparisons/mcnemar/mcnemar.py ADDED Viewed

	@@ -0,0 +1,98 @@

+# Copyright 2022 The HuggingFace Evaluate Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""McNemar test for model comparison."""
+import datasets
+from scipy.stats import chi2
+import evaluate
+_DESCRIPTION = """
+McNemar's test is a diagnostic test over a contingency table resulting from the predictions of two classifiers. The test compares the sensitivity and specificity of the diagnostic tests on the same group reference labels. It can be computed with:
+McNemar = (SE - SP)**2 / SE + SP
+ Where:
+SE: Sensitivity (Test 1 positive; Test 2 negative)
+SP: Specificity (Test 1 negative; Test 2 positive)
+"""
+_KWARGS_DESCRIPTION = """
+Args:
+    predictions1 (`list` of `int`): Predicted labels for model 1.
+    predictions2 (`list` of `int`): Predicted labels for model 2.
+    references (`list` of `int`): Ground truth labels.
+Returns:
+    stat (`float`): McNemar test score.
+    p (`float`): The p value. Minimum possible value is 0. Maximum possible value is 1.0. A lower p value means a more significant difference.
+Examples:
+    >>> mcnemar = evaluate.load("mcnemar")
+    >>> results = mcnemar.compute(references=[1, 0, 1], predictions1=[1, 1, 1], predictions2=[1, 0, 1])
+    >>> print(results)
+    {'stat': 1.0, 'p': 0.31731050786291115}
+"""
+_CITATION = """
+@article{mcnemar1947note,
+  title={Note on the sampling error of the difference between correlated proportions or percentages},
+  author={McNemar, Quinn},
+  journal={Psychometrika},
+  volume={12},
+  number={2},
+  pages={153--157},
+  year={1947},
+  publisher={Springer-Verlag}
+}
+"""
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class McNemar(evaluate.Comparison):
+    def _info(self):
+        return evaluate.ComparisonInfo(
+            module_type="comparison",
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "predictions1": datasets.Value("int64"),
+                    "predictions2": datasets.Value("int64"),
+                    "references": datasets.Value("int64"),
+                }
+            ),
+        )
+    def _compute(self, predictions1, predictions2, references):
+        # construct contingency table
+        tbl = [[0, 0], [0, 0]]
+        for gt, p1, p2 in zip(references, predictions1, predictions2):
+            if p1 == gt and p2 == gt:
+                tbl[0][0] += 1
+            elif p1 == gt:
+                tbl[0][1] += 1
+            elif p2 == gt:
+                tbl[1][0] += 1
+            else:
+                tbl[1][1] += 1
+        # compute statistic
+        b, c = tbl[0][1], tbl[1][0]
+        statistic = abs(b - c) ** 2 / (1.0 * (b + c))
+        df = 1
+        pvalue = chi2.sf(statistic, df)
+        return {"stat": statistic, "p": pvalue}

ST/evaluate/comparisons/mcnemar/requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2	+ scipy

ST/evaluate/comparisons/wilcoxon/README.md ADDED Viewed

	@@ -0,0 +1,70 @@

+---
+title: Wilcoxon
+emoji: 🤗
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: 3.0.2
+app_file: app.py
+pinned: false
+tags:
+- evaluate
+- comparison
+description: >-
+  Wilcoxon's test is a signed-rank test for comparing paired samples.
+---
+# Comparison Card for Wilcoxon
+## Comparison description
+Wilcoxon's test is a non-parametric signed-rank test that tests whether the distribution of the differences is symmetric about zero. It can be used to compare the predictions of two models.
+## How to use
+The Wilcoxon comparison is used to analyze paired ordinal data.
+## Inputs
+Its arguments are:
+`predictions1`: a list of predictions from the first model.
+`predictions2`: a list of predictions from the second model.
+## Output values
+The Wilcoxon comparison outputs two things:
+`stat`: The Wilcoxon statistic.
+`p`: The p value.
+## Examples
+Example comparison:
+```python
+wilcoxon = evaluate.load("wilcoxon")
+results = wilcoxon.compute(predictions1=[-7, 123.45, 43, 4.91, 5], predictions2=[1337.12, -9.74, 1, 2, 3.21])
+print(results)
+{'stat': 5.0, 'p': 0.625}
+```
+## Limitations and bias
+The Wilcoxon test is a non-parametric test, so it has relatively few assumptions (basically only that the observations are independent). It should be used to analyze paired ordinal data only.
+## Citations
+```bibtex
+@incollection{wilcoxon1992individual,
+  title={Individual comparisons by ranking methods},
+  author={Wilcoxon, Frank},
+  booktitle={Breakthroughs in statistics},
+  pages={196--202},
+  year={1992},
+  publisher={Springer}
+}
+```

ST/evaluate/comparisons/wilcoxon/app.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Entry script for the Wilcoxon comparison demo (the module README names app.py as `app_file`).
+import evaluate
+from evaluate.utils import launch_gradio_widget
+# Load the "wilcoxon" module explicitly as a comparison.
+module = evaluate.load("wilcoxon", module_type="comparison")
+# Hand the loaded module to the widget launcher (presumably starts the Gradio UI -- see evaluate.utils).
+launch_gradio_widget(module)

ST/evaluate/comparisons/wilcoxon/requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+git+https://github.com/huggingface/evaluate@a45df1eb9996eec64ec3282ebe554061cb366388
+datasets~=2.0
+scipy

ST/evaluate/comparisons/wilcoxon/wilcoxon.py ADDED Viewed

	@@ -0,0 +1,78 @@

+# Copyright 2022 The HuggingFace Evaluate Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Wilcoxon test for model comparison."""
+import datasets
+from scipy.stats import wilcoxon
+import evaluate
+# Summary of the test; passed to `add_start_docstrings` and to
+# `ComparisonInfo(description=...)` by the Wilcoxon class in this file.
+_DESCRIPTION = """
+Wilcoxon's test is a non-parametric signed-rank test that tests whether the distribution of the differences is symmetric about zero. It can be used to compare the predictions of two models.
+"""
+# Argument/return documentation with a usage example; passed to `add_start_docstrings` and to
+# `ComparisonInfo(inputs_description=...)`.
+_KWARGS_DESCRIPTION = """
+Args:
+    predictions1 (`list` of `float`): Predictions for model 1.
+    predictions2 (`list` of `float`): Predictions for model 2.
+Returns:
+    stat (`float`): Wilcoxon test score.
+    p (`float`): The p value. Minimum possible value is 0. Maximum possible value is 1.0. A lower p value means a more significant difference.
+Examples:
+    >>> wilcoxon = evaluate.load("wilcoxon")
+    >>> results = wilcoxon.compute(predictions1=[-7, 123.45, 43, 4.91, 5], predictions2=[1337.12, -9.74, 1, 2, 3.21])
+    >>> print(results)
+    {'stat': 5.0, 'p': 0.625}
+"""
+# BibTeX entry for the reprint of Wilcoxon's paper; passed to `ComparisonInfo(citation=...)`.
+_CITATION = """
+@incollection{wilcoxon1992individual,
+  title={Individual comparisons by ranking methods},
+  author={Wilcoxon, Frank},
+  booktitle={Breakthroughs in statistics},
+  pages={196--202},
+  year={1992},
+  publisher={Springer}
+}
+"""
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class Wilcoxon(evaluate.Comparison):
+    def _info(self):
+        return evaluate.ComparisonInfo(
+            module_type="comparison",
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "predictions1": datasets.Value("float"),
+                    "predictions2": datasets.Value("float"),
+                }
+            ),
+        )
+    def _compute(self, predictions1, predictions2):
+        # calculate difference
+        d = [p1 - p2 for (p1, p2) in zip(predictions1, predictions2)]
+        # compute statistic
+        res = wilcoxon(d)
+        return {"stat": res.statistic, "p": res.pvalue}

ST/evaluate/docs/README.md ADDED Viewed

	@@ -0,0 +1,285 @@

+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+# Generating the documentation
+To generate the documentation, you first have to build it. Several packages are necessary to build the doc,
+you can install them with the following command, at the root of the code repository:
+```bash
+pip install -e ".[docs]"
+```
+Then you need to install our special tool that builds the documentation:
+```bash
+pip install git+https://github.com/huggingface/doc-builder
+```
+---
+**NOTE**
+You only need to generate the documentation to inspect it locally (if you're planning changes and want to
+check what they look like before committing, for instance). You don't have to commit the built documentation.
+---
+## Building the documentation
+Once you have set up the `doc-builder` and additional packages, you can generate the documentation by typing the
+following command:
+```bash
+doc-builder build evaluate docs/source/ --build_dir ~/tmp/test-build
+```
+You can adapt the `--build_dir` to set any temporary folder that you prefer. This command will create it and generate
+the MDX files that will be rendered as the documentation on the main website. You can inspect them in your favorite
+Markdown editor.
+---
+**NOTE**
+It's not possible to see locally what the final documentation will look like for now. Once you have opened a PR, you
+will see a bot add a comment with a link to where the documentation with your changes lives.
+---
+## Adding a new element to the navigation bar
+Accepted files are Markdown (.md or .mdx).
+Create a file with its extension and put it in the source directory. You can then link it to the toc-tree by putting
+the filename without the extension in the [`_toctree.yml`](https://github.com/huggingface/transformers/blob/master/docs/source/_toctree.yml) file.
+## Renaming section headers and moving sections
+It helps to keep the old links working when renaming section headers and/or moving sections from one document to another. This is because the old links are likely to be used in Issues, Forums and Social media, and it would make for a much better user experience if users reading those months later could still easily navigate to the originally intended information.
+Therefore we simply keep a little map of moved sections at the end of the document where the original section was. The key is to preserve the original anchor.
+So if you renamed a section from: "Section A" to "Section B", then you can add at the end of the file:
+```
+Sections that were moved:
+[ <a href="#section-b">Section A</a><a id="section-a"></a> ]
+```
+and of course if you moved it to another file, then:
+```
+Sections that were moved:
+[ <a href="../new-file#section-b">Section A</a><a id="section-a"></a> ]
+```
+Use the relative style to link to the new file so that the versioned docs continue to work.
+For an example of a rich moved sections set please see the very end of [the Trainer doc](https://github.com/huggingface/transformers/blob/master/docs/source/main_classes/trainer.mdx).
+## Writing Documentation - Specification
+The `huggingface/transformers` documentation follows the
+[Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style for docstrings,
+although we can write them directly in Markdown.
+### Adding a new tutorial
+Adding a new tutorial or section is done in two steps:
+- Add a new file under `./source`. This file can either be ReStructuredText (.rst) or Markdown (.md).
+- Link that file in `./source/_toctree.yml` on the correct toc-tree.
+Make sure to put your new file under the proper section. It's unlikely to go in the first section (*Get Started*), so
+depending on the intended targets (beginners, more advanced users or researchers) it should go in section two, three or
+four.
+### Adding a new model
+When adding a new model:
+- Create a file `xxx.mdx` under `./source/model_doc` (don't hesitate to copy an existing file as template).
+- Link that file in `./source/_toctree.yml`.
+- Write a short overview of the model:
+    - Overview with paper & authors
+    - Paper abstract
+    - Tips and tricks and how to use it best
+- Add the classes that should be linked in the model. This generally includes the configuration, the tokenizer, and
+  every model of that class (the base model, alongside models with additional heads), both in PyTorch and TensorFlow.
+  The order is generally:
+    - Configuration,
+    - Tokenizer
+    - PyTorch base model
+    - PyTorch head models
+    - TensorFlow base model
+    - TensorFlow head models
+    - Flax base model
+    - Flax head models
+These classes should be added using our Markdown syntax. Usually as follows:
+```
+## XXXConfig
+[[autodoc]] XXXConfig
+```
+This will include every public method of the configuration that is documented. If for some reason you wish for a method
+not to be displayed in the documentation, you can do so by specifying which methods should be in the docs:
+```
+## XXXTokenizer
+[[autodoc]] XXXTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+```
+If you just want to add a method that is not documented (for instance magic methods like `__call__` are not documented
+by default) you can put the list of methods to add in a list that contains `all`:
+```
+## XXXTokenizer
+[[autodoc]] XXXTokenizer
+    - all
+    - __call__
+```
+### Writing source documentation
+Values that should be put in `code` should be surrounded by backticks: \`like so\`. Note that argument names
+and objects like True, None or any strings should usually be put in `code`.
+When mentioning a class, function or method, it is recommended to use our syntax for internal links so that our tool
+adds a link to its documentation with this syntax: \[\`XXXClass\`\] or \[\`function\`\]. This requires the class or
+function to be in the main package.
+If you want to create a link to some internal class or function, you need to
+provide its path. For instance: \[\`file_utils.ModelOutput\`\]. This will be converted into a link with
+`file_utils.ModelOutput` in the description. To get rid of the path and only keep the name of the object you are
+linking to in the description, add a ~: \[\`~file_utils.ModelOutput\`\] will generate a link with `ModelOutput` in the description.
+The same works for methods so you can either use \[\`XXXClass.method\`\] or \[\`~XXXClass.method\`\].
+#### Defining arguments in a method
+Arguments should be defined with the `Args:` (or `Arguments:` or `Parameters:`) prefix, followed by a line return and
+an indentation. The argument should be followed by its type, with its shape if it is a tensor, a colon and its
+description:
+```
+    Args:
+        n_layers (`int`): The number of layers of the model.
+```
+If the description is too long to fit in one line, another indentation is necessary before writing the description
+after the argument.
+Here's an example showcasing everything so far:
+```
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+            Indices can be obtained using [`AlbertTokenizer`]. See [`~PreTrainedTokenizer.encode`] and
+            [`~PreTrainedTokenizer.__call__`] for details.
+            [What are input IDs?](../glossary#input-ids)
+```
+For optional arguments or arguments with defaults we follow the following syntax: imagine we have a function with the
+following signature:
+```
+def my_function(x: str = None, a: float = 1):
+```
+then its documentation should look like this:
+```
+    Args:
+        x (`str`, *optional*):
+            This argument controls ...
+        a (`float`, *optional*, defaults to 1):
+            This argument is used to ...
+```
+Note that we always omit the "defaults to \`None\`" when None is the default for any argument. Also note that even
+if the first line describing your argument type and its default gets long, you can't break it on several lines. You can
+however write as many lines as you want in the indented description (see the example above with `input_ids`).
+#### Writing a multi-line code block
+Multi-line code blocks can be useful for displaying examples. They are done between two lines of three backticks as usual in Markdown:
+````
+```
+# first line of code
+# second line
+# etc
+```
+````
+We follow the [doctest](https://docs.python.org/3/library/doctest.html) syntax for the examples to automatically test
+the results stay consistent with the library.
+#### Writing a return block
+The return block should be introduced with the `Returns:` prefix, followed by a line return and an indentation.
+The first line should be the type of the return, followed by a line return. No need to indent further for the elements
+building the return.
+Here's an example for a single value return:
+```
+    Returns:
+        `List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
+```
+Here's an example for tuple return, comprising several objects:
+```
+    Returns:
+        `tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs:
+        - **loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` --
+          Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
+        - **prediction_scores** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) --
+          Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+```
+#### Adding an image
+Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos and other non-text files. We prefer to leverage a hf.co hosted `dataset` like
+the ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) in which to place these files and reference
+them by URL. We recommend putting them in the following dataset: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
+If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate your images
+to this dataset.
+## Styling the docstring
+We have an automatic script running with the `make style` command that will make sure that:
+- the docstrings fully take advantage of the line width
+- all code examples are formatted using black, like the code of the Transformers library
+This script may have some weird failures if you made a syntax mistake or if you uncover a bug. Therefore, it's
+recommended to commit your changes before running `make style`, so you can revert the changes done by that script
+easily.

ST/evaluate/docs/source/_toctree.yml ADDED Viewed

	@@ -0,0 +1,52 @@

+- sections:
+  - local: index
+    title: 🤗 Evaluate
+  title: Get started
+- sections:
+  - local: installation
+    title: Installation
+  - local: a_quick_tour
+    title: A quick tour
+  title: Tutorials
+- sections:
+  - local: choosing_a_metric
+    title: Choosing the right metric
+  - local: creating_and_sharing
+    title: Adding new evaluations
+  - local: base_evaluator
+    title: Using the evaluator
+  - local: custom_evaluator
+    title: Using the evaluator with custom pipelines
+  - local: evaluation_suite
+    title: Creating an EvaluationSuite
+  - sections:
+    - local: transformers_integrations
+      title: Transformers
+    - local: keras_integrations
+      title: Keras and Tensorflow
+    - local: sklearn_integrations
+      title: scikit-learn
+    title: Using 🤗 Evaluate with other ML frameworks
+  title: "How-to guides"
+- sections:
+    - local: types_of_evaluations
+      title: Types of evaluations
+    - local: considerations
+      title: Considerations for model evaluation
+  title: "Conceptual guides"
+- sections:
+  - local: package_reference/main_classes
+    title: Main classes
+  - local: package_reference/loading_methods
+    title: Loading methods
+  - local: package_reference/saving_methods
+    title: Saving methods
+  - local: package_reference/hub_methods
+    title: Hub methods
+  - local: package_reference/evaluator_classes
+    title: Evaluator classes
+  - local: package_reference/visualization_methods
+    title: Visualization methods
+  - local: package_reference/logging_methods
+    title: Logging methods
+  title: "Reference"

ST/evaluate/docs/source/a_quick_tour.mdx ADDED Viewed

	@@ -0,0 +1,380 @@

+# A quick tour
+🤗 Evaluate provides access to a wide range of evaluation tools. It covers a range of modalities such as text, computer vision, audio, etc. as well as tools to evaluate models or datasets. These tools are split into three categories.
+## Types of evaluations
+There are different aspects of a typical machine learning pipeline that can be evaluated and for each aspect 🤗 Evaluate provides a tool:
+- **Metric**: A metric is used to evaluate a model's performance and usually involves the model's predictions as well as some ground truth labels. You can find all integrated metrics at [evaluate-metric](https://huggingface.co/evaluate-metric).
+- **Comparison**: A comparison is used to compare two models. This can for example be done by comparing their predictions to ground truth labels and computing their agreement. You can find all integrated comparisons at [evaluate-comparison](https://huggingface.co/evaluate-comparison).
+- **Measurement**: The dataset is as important as the model trained on it. With measurements one can investigate a dataset's properties. You can find all integrated measurements at [evaluate-measurement](https://huggingface.co/evaluate-measurement).
+Each of these evaluation modules lives on the Hugging Face Hub as a Space. Each comes with an interactive widget and a documentation card documenting its use and limitations. For example [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy):
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/evaluate/media/resolve/main/metric-widget.png" width="400"/>
+</div>
+Each metric, comparison, and measurement is a separate Python module, but for using any of them, there is a single entry point: [`evaluate.load`]!
+## Load
+Any metric, comparison, or measurement is loaded with the `evaluate.load` function:
+```py
+>>> import evaluate
+>>> accuracy = evaluate.load("accuracy")
+```
+If you want to make sure you are loading the right type of evaluation (especially if there are name clashes) you can explicitly pass the type:
+```py
+>>> word_length = evaluate.load("word_length", module_type="measurement")
+```
+### Community modules
+Besides the modules implemented in 🤗 Evaluate you can also load any community module by specifying the repository ID of the metric implementation:
+```py
+>>> element_count = evaluate.load("lvwerra/element_count", module_type="measurement")
+```
+See the [Creating and Sharing Guide](/docs/evaluate/main/en/creating_and_sharing) for information about uploading custom metrics.
+### List available modules
+With [`list_evaluation_modules`] you can check what modules are available on the hub. You can also filter for specific modules and skip community metrics if you want. You can also see additional information such as likes:
+```python
+>>> evaluate.list_evaluation_modules(
+...   module_type="comparison",
+...   include_community=False,
+...   with_details=True)
+[{'name': 'mcnemar', 'type': 'comparison', 'community': False, 'likes': 1},
+ {'name': 'exact_match', 'type': 'comparison', 'community': False, 'likes': 0}]
+```
+## Module attributes
+All evaluation modules come with a range of useful attributes, stored in an [`EvaluationModuleInfo`] object, that help when using a module.
+|Attribute|Description|
+|---|---|
+|`description`|A short description of the evaluation module.|
+|`citation`|A BibTex string for citation when available.|
+|`features`|A `Features` object defining the input format.|
+|`inputs_description`|This is equivalent to the module's docstring.|
+|`homepage`|The homepage of the module.|
+|`license`|The license of the module.|
+|`codebase_urls`|Link to the code behind the module.|
+|`reference_urls`|Additional reference URLs.|
+Let's have a look at a few examples. First, let's look at the `description` attribute of the accuracy metric:
+```py
+>>> accuracy = evaluate.load("accuracy")
+>>> accuracy.description
+Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
+Accuracy = (TP + TN) / (TP + TN + FP + FN)
+ Where:
+TP: True positive
+TN: True negative
+FP: False positive
+FN: False negative
+```
+You can see that it describes how the metric works in theory. If you use this metric for your work, especially if it is an academic publication, you want to reference it properly. For that you can look at the `citation` attribute:
+```py
+>>> accuracy.citation
+@article{scikit-learn,
+  title={Scikit-learn: Machine Learning in {P}ython},
+  author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
+         and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
+         and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
+         Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
+  journal={Journal of Machine Learning Research},
+  volume={12},
+  pages={2825--2830},
+  year={2011}
+}
+```
+Before we can apply a metric or other evaluation module to a use-case, we need to know what the input format of the metric is:
+```py
+>>> accuracy.features
+{
+    'predictions': Value(dtype='int32', id=None),
+    'references': Value(dtype='int32', id=None)
+}
+```
+<Tip>
+Note that features always describe the type of a single input element. In general we will add lists of elements so you can always think of a list around the types in `features`. Evaluate accepts various input formats (Python lists, NumPy arrays, PyTorch tensors, etc.) and converts them to an appropriate format for storage and computation.
+</Tip>
+## Compute
+Now that we know how the evaluation module works and what should go in there we want to actually use it! When it comes to computing the actual score there are two main ways to do it:
+1. All-in-one
+2. Incremental
+In the incremental approach the necessary inputs are added to the module with [`EvaluationModule.add`] or [`EvaluationModule.add_batch`] and the score is calculated at the end with [`EvaluationModule.compute`]. Alternatively, one can pass all the inputs at once to `compute()`. Let's have a look at the two approaches.
+### How to compute
+The simplest way to calculate the score of an evaluation module is by calling `compute()` directly with the necessary inputs. Simply pass the inputs as seen in `features` to the `compute()` method.
+```py
+>>> accuracy.compute(references=[0,1,0,1], predictions=[1,0,0,1])
+{'accuracy': 0.5}
+```
+Evaluation modules return the results in a dictionary. However, in some instances you build up the predictions iteratively or in a distributed fashion in which case `add()` or `add_batch()` are useful.
+### Calculate a single metric or a batch of metrics
+In many evaluation pipelines you build the predictions iteratively such as in a for-loop. In that case you could store the predictions in a list and at the end pass them to `compute()`. With `add()` and `add_batch()` you can circumvent the step of storing the predictions separately. If you are only creating single predictions at a time you can use `add()`:
+```py
+>>> for ref, pred in zip([0,1,0,1], [1,0,0,1]):
+>>>     accuracy.add(references=ref, predictions=pred)
+>>> accuracy.compute()
+{'accuracy': 0.5}
+```
+Once you have gathered all predictions you can call `compute()` to compute the score based on all stored values. When getting predictions and references in batches you can use `add_batch()` which adds a list of elements for later processing. The rest works as with `add()`:
+```py
+>>> for refs, preds in zip([[0,1],[0,1]], [[1,0],[0,1]]):
+>>>     accuracy.add_batch(references=refs, predictions=preds)
+>>> accuracy.compute()
+{'accuracy': 0.5}
+```
+This is especially useful when you need to get the predictions from your model in batches:
+```py
+>>> for model_inputs, gold_standards in evaluation_dataset:
+>>>     predictions = model(model_inputs)
+>>>     metric.add_batch(references=gold_standards, predictions=predictions)
+>>> metric.compute()
+```
+### Distributed evaluation
+Computing metrics in a distributed environment can be tricky. Metric evaluation is executed in separate Python processes, or nodes, on different subsets of a dataset. Typically, when a metric score is additive (`f(AuB) = f(A) + f(B)`), you can use distributed reduce operations to gather the scores for each subset of the dataset. But when a metric is non-additive (`f(AuB) ≠ f(A) + f(B)`), it's not that simple. For example, you can't take the sum of the [F1](https://huggingface.co/spaces/evaluate-metric/f1) scores of each data subset as your **final metric**.
+A common way to overcome this issue is to fall back on single-process evaluation. The metrics are evaluated on a single GPU, which becomes inefficient.
+🤗 Evaluate solves this issue by only computing the final metric on the first node. The predictions and references are computed and provided to the metric separately for each node. These are temporarily stored in an Apache Arrow table, avoiding cluttering the GPU or CPU memory. When you are ready to `compute()` the final metric, the first node is able to access the predictions and references stored on all the other nodes. Once it has gathered all the predictions and references, `compute()` will perform the final metric evaluation.
+This solution allows 🤗 Evaluate to perform distributed predictions, which is important for evaluation speed in distributed settings. At the same time, you can also use complex non-additive metrics without wasting valuable GPU or CPU memory.
+## Combining several evaluations
+Often one wants to not only evaluate a single metric but a range of different metrics capturing different aspects of a model. E.g. for classification it is usually a good idea to compute F1-score, recall, and precision in addition to accuracy to get a better picture of model performance. Naturally, you can load a bunch of metrics and call them sequentially. However, a more convenient way is to use the [`~evaluate.combine`] function to bundle them together:
+```python
+>>> clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])
+```
+The `combine` function accepts both a list of metric names and instantiated modules. The `compute` call then computes each metric:
+```python
+>>> clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])
+{
+  'accuracy': 0.667,
+  'f1': 0.667,
+  'precision': 1.0,
+  'recall': 0.5
+}
+```
+## Save and push to the Hub
+Saving and sharing evaluation results is an important step. We provide the [`evaluate.save`] function to easily save metrics results. You can either pass a specific filename or a directory. In the latter case, the results are saved in a file with an automatically created file name. Besides the directory or file name, the function takes any key-value pairs as inputs and stores them in a JSON file.
+```py
+>>> result = accuracy.compute(references=[0,1,0,1], predictions=[1,0,0,1])
+>>> hyperparams = {"model": "bert-base-uncased"}
+>>> evaluate.save("./results/", experiment="run 42", **result, **hyperparams)
+PosixPath('results/result-2022_05_30-22_09_11.json')
+```
+The content of the JSON file looks like the following:
+```json
+{
+    "experiment": "run 42",
+    "accuracy": 0.5,
+    "model": "bert-base-uncased",
+    "_timestamp": "2022-05-30T22:09:11.959469",
+    "_git_commit_hash": "123456789abcdefghijkl",
+    "_evaluate_version": "0.1.0",
+    "_python_version": "3.9.12 (main, Mar 26 2022, 15:51:15) \n[Clang 13.1.6 (clang-1316.0.21.2)]",
+    "_interpreter_path": "/Users/leandro/git/evaluate/env/bin/python"
+}
+```
+In addition to the specified fields, it also contains useful system information for reproducing the results.
+Besides storing the results locally, you should report them on the model's repository on the Hub. With the [`evaluate.push_to_hub`] function, you can easily report evaluation results to the model's repository:
+```py
+evaluate.push_to_hub(
+  model_id="huggingface/gpt2-wikitext2",  # model repository on hub
+  metric_value=0.5,                       # metric value
+  metric_type="bleu",                     # metric name, e.g. accuracy.name
+  metric_name="BLEU",                     # pretty name which is displayed
+  dataset_type="wikitext",                # dataset name on the hub
+  dataset_name="WikiText",                # pretty name
+  dataset_split="test",                   # dataset split used
+  task_type="text-generation",            # task id, see https://github.com/huggingface/evaluate/blob/main/src/evaluate/config.py#L154-L192
+  task_name="Text Generation"             # pretty name for task
+)
+```
+## Evaluator
+The [`evaluate.evaluator`] provides automated evaluation and only requires a model, dataset, metric in contrast to the metrics in `EvaluationModule`s that require the model's predictions. As such it is easier to evaluate a model on a dataset with a given metric as the inference is handled internally. To make that possible it uses the [`~transformers.pipeline`] abstraction from `transformers`. However, you can use your own framework as long as it follows the `pipeline` interface.
+To make an evaluation with the `evaluator` let's load a `transformers` pipeline (but you can pass your own custom inference class for any framework as long as it follows the pipeline call API) with a model trained on IMDb, the IMDb test split and the accuracy metric.
+```python
+from transformers import pipeline
+from datasets import load_dataset
+from evaluate import evaluator
+import evaluate
+pipe = pipeline("text-classification", model="lvwerra/distilbert-imdb", device=0)
+data = load_dataset("imdb", split="test").shuffle().select(range(1000))
+metric = evaluate.load("accuracy")
+```
+Then you can create an evaluator for text classification and pass the three objects to the `compute()` method. With the label mapping `evaluate` provides a method to align the pipeline outputs with the label column in the dataset:
+```python
+>>> task_evaluator = evaluator("text-classification")
+>>> results = task_evaluator.compute(model_or_pipeline=pipe, data=data, metric=metric,
+...                        label_mapping={"NEGATIVE": 0, "POSITIVE": 1},)
+>>> print(results)
+{'accuracy': 0.934}
+```
+Calculating the value of the metric alone is often not enough to know if a model performs significantly better than another one. With _bootstrapping_ `evaluate` computes confidence intervals and the standard error which helps estimate how stable a score is:
+```python
+>>> results = task_evaluator.compute(model_or_pipeline=pipe, data=data, metric=metric,
+...                        label_mapping={"NEGATIVE": 0, "POSITIVE": 1},
+...                        strategy="bootstrap", n_resamples=200)
+>>> print(results)
+{'accuracy':
+    {
+      'confidence_interval': (0.906, 0.9406749892841922),
+      'standard_error': 0.00865213251082787,
+      'score': 0.923
+    }
+}
+```
+The evaluator expects a `"text"` and `"label"` column for the data input. If your dataset differs you can provide the columns with the keywords `input_column="text"` and `label_column="label"`. Currently only `"text-classification"` is supported with more tasks being added in the future.
+## Visualization
+When comparing several models, sometimes it's hard to spot the differences in their performance simply by looking at their scores. Also often there is not a single best model but there are trade-offs between e.g. latency and accuracy as larger models might have better performance but are also slower. We are gradually adding different visualization approaches, such as plots, to make choosing the best model for a use-case easier.
+For instance, if you have a list of results from multiple models (as dictionaries), you can feed them into the `radar_plot()` function:
+```python
+import evaluate
+from evaluate.visualization import radar_plot
+>>> data = [
+   {"accuracy": 0.99, "precision": 0.8, "f1": 0.95, "latency_in_seconds": 33.6},
+   {"accuracy": 0.98, "precision": 0.87, "f1": 0.91, "latency_in_seconds": 11.2},
+   {"accuracy": 0.98, "precision": 0.78, "f1": 0.88, "latency_in_seconds": 87.6},
+   {"accuracy": 0.88, "precision": 0.78, "f1": 0.81, "latency_in_seconds": 101.6}
+   ]
+>>> model_names = ["Model 1", "Model 2", "Model 3", "Model 4"]
+>>> plot = radar_plot(data=data, model_names=model_names)
+>>> plot.show()
+```
+Which lets you visually compare the 4 models and choose the optimal one for you, based on one or several metrics:
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/evaluate/media/resolve/main/example_viz.png" width="400"/>
+</div>
+## Running evaluation on a suite of tasks
+It can be useful to evaluate models on a variety of different tasks to understand their downstream performance. The [EvaluationSuite](evaluation_suite) enables evaluation of models on a collection of tasks. Tasks can be constructed as ([evaluator](base_evaluator), dataset, metric) tuples and passed to an [EvaluationSuite](evaluation_suite) stored on the Hugging Face Hub as a Space, or locally as a Python script. See the [evaluator documentation](base_evaluator) for a list of currently supported tasks.
+`EvaluationSuite` scripts can be defined as follows, and support Python code for data preprocessing.
+```python
+import evaluate
+from evaluate.evaluation_suite import SubTask
+class Suite(evaluate.EvaluationSuite):
+    def __init__(self, name):
+        super().__init__(name)
+        self.suite = [
+            SubTask(
+                task_type="text-classification",
+                data="imdb",
+                split="test[:1]",
+                args_for_task={
+                    "metric": "accuracy",
+                    "input_column": "text",
+                    "label_column": "label",
+                    "label_mapping": {
+                        "LABEL_0": 0.0,
+                        "LABEL_1": 1.0
+                    }
+                }
+            ),
+            SubTask(
+                task_type="text-classification",
+                data="sst2",
+                split="test[:1]",
+                args_for_task={
+                    "metric": "accuracy",
+                    "input_column": "sentence",
+                    "label_column": "label",
+                    "label_mapping": {
+                        "LABEL_0": 0.0,
+                        "LABEL_1": 1.0
+                    }
+                }
+            )
+        ]
+```
+Evaluation can be run by loading the `EvaluationSuite` and calling the `run()` method with a model or pipeline.
+```
+>>> from evaluate import EvaluationSuite
+>>> suite = EvaluationSuite.load('mathemakitten/sentiment-evaluation-suite')
+>>> results = suite.run("huggingface/prunebert-base-uncased-6-finepruned-w-distil-mnli")
+```
+|   accuracy |   total_time_in_seconds |   samples_per_second |   latency_in_seconds | task_name   |
+|------------:|---------------------:|--------------------------:|:----------------|:-----------|
+|        0.3 |                4.62804  |              2.16074 |            0.462804  | imdb        |
+|        0   |                0.686388 |             14.569   |            0.0686388 | sst2        |

ST/evaluate/docs/source/base_evaluator.mdx ADDED Viewed

	@@ -0,0 +1,294 @@

+# Using the `evaluator`
+The `Evaluator` classes allow you to evaluate a triplet of model, dataset, and metric. The model is wrapped in a pipeline, which is responsible for handling all preprocessing and post-processing. Out-of-the-box, `Evaluator`s support transformers pipelines for the supported tasks, but custom pipelines can be passed, as showcased in the section [Using the `evaluator` with custom pipelines](custom_evaluator).
+Currently supported tasks are:
+- `"text-classification"`: will use the [`TextClassificationEvaluator`].
+- `"token-classification"`: will use the [`TokenClassificationEvaluator`].
+- `"question-answering"`: will use the [`QuestionAnsweringEvaluator`].
+- `"image-classification"`: will use the [`ImageClassificationEvaluator`].
+- `"text-generation"`: will use the [`TextGenerationEvaluator`].
+- `"text2text-generation"`: will use the [`Text2TextGenerationEvaluator`].
+- `"summarization"`: will use the [`SummarizationEvaluator`].
+- `"translation"`: will use the [`TranslationEvaluator`].
+- `"automatic-speech-recognition"`: will use the [`AutomaticSpeechRecognitionEvaluator`].
+- `"audio-classification"`: will use the [`AudioClassificationEvaluator`].
+To run an `Evaluator` with several tasks in a single call, use the [EvaluationSuite](evaluation_suite), which runs evaluations on a collection of `SubTask`s.
+Each task has its own set of requirements for the dataset format and pipeline output, so make sure to check them out for your custom use case. Let's have a look at some of them and see how you can use the evaluator to evaluate one or multiple models, datasets, and metrics at the same time.
+## Text classification
+The text classification evaluator can be used to evaluate text models on classification datasets such as IMDb. Besides the model, data, and metric inputs it takes the following optional inputs:
+- `input_column="text"`: with this argument the column with the data for the pipeline can be specified.
+- `label_column="label"`: with this argument the column with the labels for the evaluation can be specified.
+- `label_mapping=None`: the label mapping aligns the labels in the pipeline output with the labels needed for evaluation. E.g. the labels in `label_column` can be integers (`0`/`1`) whereas the pipeline can produce label names such as `"positive"`/`"negative"`. With that dictionary the pipeline outputs are mapped to the labels.
+By default the `"accuracy"` metric is computed.
+### Evaluate models on the Hub
+There are several ways to pass a model to the evaluator: you can pass the name of a model on the Hub, you can load a `transformers` model and pass it to the evaluator or you can pass an initialized `transformers.Pipeline`. Alternatively you can pass any callable function that behaves like a `pipeline` call for the task in any framework.
+So any of the following works:
+```py
+from datasets import load_dataset
+from evaluate import evaluator
+from transformers import AutoModelForSequenceClassification, pipeline
+data = load_dataset("imdb", split="test").shuffle(seed=42).select(range(1000))
+task_evaluator = evaluator("text-classification")
+# 1. Pass a model name or path
+eval_results = task_evaluator.compute(
+    model_or_pipeline="lvwerra/distilbert-imdb",
+    data=data,
+    label_mapping={"NEGATIVE": 0, "POSITIVE": 1}
+)
+# 2. Pass an instantiated model
+model = AutoModelForSequenceClassification.from_pretrained("lvwerra/distilbert-imdb")
+eval_results = task_evaluator.compute(
+    model_or_pipeline=model,
+    data=data,
+    label_mapping={"NEGATIVE": 0, "POSITIVE": 1}
+)
+# 3. Pass an instantiated pipeline
+pipe = pipeline("text-classification", model="lvwerra/distilbert-imdb")
+eval_results = task_evaluator.compute(
+    model_or_pipeline=pipe,
+    data=data,
+    label_mapping={"NEGATIVE": 0, "POSITIVE": 1}
+)
+print(eval_results)
+```
+<Tip>
+Without specifying a device, the default for model inference will be the first GPU on the machine if one is available, and the CPU otherwise. If you want to use a specific device you can pass `device` to `compute`, where -1 will use the CPU and a non-negative integer (starting with 0) will use the associated CUDA device.
+</Tip>
+The results will look as follows:
+```python
+{
+    'accuracy': 0.918,
+    'latency_in_seconds': 0.013,
+    'samples_per_second': 78.887,
+    'total_time_in_seconds': 12.676
+}
+```
+Note that evaluation results include both the requested metric, and information about the time it took to obtain predictions through the pipeline.
+<Tip>
+The time performances can give a useful indication of model speed for inference but should be taken with a grain of salt: they include all the processing that goes on in the pipeline. This may include tokenization and post-processing, which may differ depending on the model. Furthermore, it depends a lot on the hardware you are running the evaluation on and you may be able to improve the performance by optimizing things like the batch size.
+</Tip>
+### Evaluate multiple metrics
+With the [`combine`] function one can bundle several metrics into an object that behaves like a single metric. We can use this to evaluate several metrics at once with the evaluator:
+```python
+import evaluate
+eval_results = task_evaluator.compute(
+    model_or_pipeline="lvwerra/distilbert-imdb",
+    data=data,
+    metric=evaluate.combine(["accuracy", "recall", "precision", "f1"]),
+    label_mapping={"NEGATIVE": 0, "POSITIVE": 1}
+)
+print(eval_results)
+```
+The results will look as follows:
+```python
+{
+    'accuracy': 0.918,
+    'f1': 0.916,
+    'precision': 0.9147,
+    'recall': 0.9187,
+    'latency_in_seconds': 0.013,
+    'samples_per_second': 78.887,
+    'total_time_in_seconds': 12.676
+}
+```
+Next let's have a look at token classification.
+## Token Classification
+With the token classification evaluator one can evaluate models for tasks such as NER or POS tagging. It has the following specific arguments:
+- `input_column="text"`: with this argument the column with the data for the pipeline can be specified.
+- `label_column="label"`: with this argument the column with the labels for the evaluation can be specified.
+- `label_mapping=None`: the label mapping aligns the labels in the pipeline output with the labels needed for evaluation. E.g. the labels in `label_column` can be integers (`0`/`1`) whereas the pipeline can produce label names such as `"positive"`/`"negative"`. With that dictionary the pipeline outputs are mapped to the labels.
+- `join_by=" "`: While most datasets are already tokenized the pipeline expects a string. Thus the tokens need to be joined before passing to the pipeline. By default they are joined with a whitespace.
+Let's have a look how we can use the evaluator to benchmark several models.
+### Benchmarking several models
+Here is an example where several models can be compared thanks to the `evaluator` in only a few lines of code, abstracting away the preprocessing, inference, postprocessing, metric computation:
+```python
+import pandas as pd
+from datasets import load_dataset
+from evaluate import evaluator
+from transformers import pipeline
+models = [
+    "xlm-roberta-large-finetuned-conll03-english",
+    "dbmdz/bert-large-cased-finetuned-conll03-english",
+    "elastic/distilbert-base-uncased-finetuned-conll03-english",
+    "dbmdz/electra-large-discriminator-finetuned-conll03-english",
+    "gunghio/distilbert-base-multilingual-cased-finetuned-conll2003-ner",
+    "philschmid/distilroberta-base-ner-conll2003",
+    "Jorgeutd/albert-base-v2-finetuned-ner",
+]
+data = load_dataset("conll2003", split="validation").shuffle().select(range(1000))
+task_evaluator = evaluator("token-classification")
+results = []
+for model in models:
+    results.append(
+        task_evaluator.compute(
+            model_or_pipeline=model, data=data, metric="seqeval"
+            )
+        )
+df = pd.DataFrame(results, index=models)
+df[["overall_f1", "overall_accuracy", "total_time_in_seconds", "samples_per_second", "latency_in_seconds"]]
+```
+The result is a table that looks like this:
+|   model                                                            |   overall_f1 |   overall_accuracy |   total_time_in_seconds |   samples_per_second |   latency_in_seconds |
+|:-------------------------------------------------------------------|-------------:|-------------------:|------------------------:|---------------------:|---------------------:|
+| Jorgeutd/albert-base-v2-finetuned-ner                              |        0.941 |              0.989 |                   4.515 |              221.468 |                0.005 |
+| dbmdz/bert-large-cased-finetuned-conll03-english                   |        0.962 |              0.881 |                  11.648 |               85.850 |                0.012 |
+| dbmdz/electra-large-discriminator-finetuned-conll03-english        |        0.965 |              0.881 |                  11.456 |               87.292 |                0.011 |
+| elastic/distilbert-base-uncased-finetuned-conll03-english          |        0.940 |              0.989 |                   2.318 |              431.378 |                0.002 |
+| gunghio/distilbert-base-multilingual-cased-finetuned-conll2003-ner |        0.947 |              0.991 |                   2.376 |              420.873 |                0.002 |
+| philschmid/distilroberta-base-ner-conll2003                        |        0.961 |              0.994 |                   2.436 |              410.579 |                0.002 |
+| xlm-roberta-large-finetuned-conll03-english                        |        0.969 |              0.882 |                  11.996 |               83.359 |                0.012 |
+### Visualizing results
+You can feed the `results` list above into the `radar_plot()` function to visualize different aspects of their performance and choose the model that is the best fit, depending on the metric(s) that are relevant to your use case:
+```python
+import evaluate
+from evaluate.visualization import radar_plot
+>>> plot = radar_plot(data=results, model_names=models, invert_range=["latency_in_seconds"])
+>>> plot.show()
+```
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/evaluate/media/resolve/main/viz.png" width="400"/>
+</div>
+Don't forget to specify `invert_range` for metrics for which smaller is better (such as the case for latency in seconds).
+If you want to save the plot locally, you can use the `plot.savefig()` function with the option `bbox_inches='tight'`, to make sure no part of the image gets cut off.
+## Question Answering
+With the question-answering evaluator one can evaluate models for QA without needing to worry about the complicated pre- and post-processing that's required for these models. It has the following specific arguments:
+- `question_column="question"`: the name of the column containing the question in the dataset
+- `context_column="context"`: the name of the column containing the context
+- `id_column="id"`: the name of the column containing the identification field of the question and answer pair
+- `label_column="answers"`: the name of the column containing the answers
+- `squad_v2_format=None`: whether the dataset follows the format of squad_v2 dataset where a question may have no answer in the context. If this parameter is not provided, the format will be automatically inferred.
+Let's have a look how we can evaluate QA models and compute confidence intervals at the same time.
+### Confidence intervals
+Every evaluator comes with the option to compute confidence intervals using [bootstrapping](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bootstrap.html). Simply pass `strategy="bootstrap"` and set the number of resamples with `n_resamples`.
+```python
+from datasets import load_dataset
+from evaluate import evaluator
+task_evaluator = evaluator("question-answering")
+data = load_dataset("squad", split="validation[:1000]")
+eval_results = task_evaluator.compute(
+    model_or_pipeline="distilbert-base-uncased-distilled-squad",
+    data=data,
+    metric="squad",
+    strategy="bootstrap",
+    n_resamples=30
+)
+```
+Results include confidence intervals as well as error estimates as follows:
+```python
+{
+    'exact_match':
+    {
+        'confidence_interval': (79.67, 84.54),
+        'score': 82.30,
+        'standard_error': 1.28
+    },
+    'f1':
+    {
+        'confidence_interval': (85.30, 88.88),
+        'score': 87.23,
+        'standard_error': 0.97
+    },
+    'latency_in_seconds': 0.0085,
+    'samples_per_second': 117.31,
+    'total_time_in_seconds': 8.52
+ }
+```
+## Image classification
+With the image classification evaluator we can evaluate any image classifier. It uses the same keyword arguments as the text classifier:
+- `input_column="image"`: the name of the column containing the images as PIL ImageFile
+- `label_column="label"`: the name of the column containing the labels
+- `label_mapping=None`: We want to map class labels defined by the model in the pipeline to values consistent with those defined in the `label_column`
+Let's have a look at how we can evaluate image classification models on large datasets.
+### Handling large datasets
+The evaluator can be used on large datasets! Below, an example shows how to use it on ImageNet-1k for image classification. Beware that this example requires downloading ~150 GB.
+```python
+data = load_dataset("imagenet-1k", split="validation", use_auth_token=True)
+pipe = pipeline(
+    task="image-classification",
+    model="facebook/deit-small-distilled-patch16-224"
+)
+task_evaluator = evaluator("image-classification")
+eval_results = task_evaluator.compute(
+    model_or_pipeline=pipe,
+    data=data,
+    metric="accuracy",
+    label_mapping=pipe.model.config.label2id
+)
+```
+Since we are using `datasets` to store data we make use of a technique called memory mappings. This means that the dataset is never fully loaded into memory which saves a lot of RAM. Running the above code only uses roughly 1.5 GB of RAM while the validation split is more than 30 GB big.

ST/evaluate/docs/source/choosing_a_metric.mdx ADDED Viewed

	@@ -0,0 +1,64 @@

+# Choosing a metric for your task
+**So you've trained your model and want to see how well it’s doing on a dataset of your choice. Where do you start?**
+There is no “one size fits all” approach to choosing an evaluation metric, but some good guidelines to keep in mind are:
+## Categories of metrics
+There are 3 high-level categories of metrics:
+1. *Generic metrics*, which can be applied to a variety of situations and datasets, such as precision and accuracy.
+2. *Task-specific metrics*, which are limited to a given task, such as Machine Translation (often evaluated using metrics [BLEU](https://huggingface.co/metrics/bleu) or [ROUGE](https://huggingface.co/metrics/rouge)) or Named Entity Recognition (often evaluated with [seqeval](https://huggingface.co/metrics/seqeval)).
+3. *Dataset-specific metrics*, which aim to measure model performance on specific benchmarks: for instance, the [GLUE benchmark](https://huggingface.co/datasets/glue) has a dedicated [evaluation metric](https://huggingface.co/metrics/glue).
+Let's look at each of these three cases:
+### Generic metrics
+Many of the metrics used in the Machine Learning community are quite generic and can be applied in a variety of tasks and datasets.
+This is the case for metrics like [accuracy](https://huggingface.co/metrics/accuracy) and [precision](https://huggingface.co/metrics/precision), which can be used for evaluating labeled (supervised) datasets, as well as [perplexity](https://huggingface.co/metrics/perplexity), which can be used for evaluating different kinds of (unsupervised) generative tasks.
+To see the input structure of a given metric, you can look at its metric card. For example, in the case of [precision](https://huggingface.co/metrics/precision), the format is:
+```
+>>> precision_metric = evaluate.load("precision")
+>>> results = precision_metric.compute(references=[0, 1], predictions=[0, 1])
+>>> print(results)
+{'precision': 1.0}
+```
+### Task-specific metrics
+Popular ML tasks like Machine Translation and Named Entity Recognition have specific metrics that can be used to compare models. For example, a series of different metrics have been proposed for text generation, ranging from [BLEU](https://huggingface.co/metrics/bleu) and its derivatives such as [GoogleBLEU](https://huggingface.co/metrics/google_bleu) and [GLEU](https://huggingface.co/metrics/gleu), but also [ROUGE](https://huggingface.co/metrics/rouge), [MAUVE](https://huggingface.co/metrics/mauve), etc.
+You can find the right metric for your task by:
+- **Looking at the [Task pages](https://huggingface.co/tasks)** to see what metrics can be used for evaluating models for a given task.
+- **Checking out leaderboards** on sites like [Papers With Code](https://paperswithcode.com/) (you can search by task and by dataset).
+- **Reading the metric cards** for the relevant metrics to see which ones are a good fit for your use case. For example, see the [BLEU metric card](https://github.com/huggingface/evaluate/tree/main/metrics/bleu) or [SQuAD metric card](https://github.com/huggingface/evaluate/tree/main/metrics/squad).
+- **Looking at papers and blog posts** published on the topic to see what metrics they report. This can change over time, so try to pick papers from the last couple of years!
+### Dataset-specific metrics
+Some datasets have specific metrics associated with them -- this is especially the case for popular benchmarks like [GLUE](https://huggingface.co/metrics/glue) and [SQuAD](https://huggingface.co/metrics/squad).
+<Tip warning={true}>
+💡
+GLUE is actually a collection of different subsets on different tasks, so first you need to choose the one that corresponds to the NLI task, such as mnli, which is described as “crowdsourced collection of sentence pairs with textual entailment annotations”
+</Tip>
+If you are evaluating your model on a benchmark dataset like the ones mentioned above, you can use its dedicated evaluation metric. Make sure you respect the format that they require. For example, to evaluate your model on the [SQuAD](https://huggingface.co/datasets/squad) dataset, you need to feed the `question` and `context` into your model and return the `prediction_text`, which should be compared with the `references` (based on matching the `id` of the question) :
+```
+>>> from evaluate import load
+>>> squad_metric = load("squad")
+>>> predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22'}]
+>>> references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}]
+>>> results = squad_metric.compute(predictions=predictions, references=references)
+>>> results
+{'exact_match': 100.0, 'f1': 100.0}
+```
+You can find examples of dataset structures by consulting the "Dataset Preview" function or the dataset card for a given dataset, and you can see how to use its dedicated evaluation function based on the metric card.

ST/evaluate/docs/source/considerations.mdx ADDED Viewed

	@@ -0,0 +1,88 @@

+# Considerations for model evaluation
+Developing an ML model is rarely a one-shot deal: it often involves multiple stages of defining the model architecture and tuning hyper-parameters before converging on a final set. Responsible model evaluation is a key part of this process, and 🤗 Evaluate is here to help!
+Here are some things to keep in mind when evaluating your model using the 🤗 Evaluate library:
+## Properly splitting your data
+Good evaluation generally requires three splits of your dataset:
+- **train**: this is used for training your model.
+- **validation**: this is used for validating the model hyperparameters.
+- **test**: this is used for evaluating your model.
+Many of the datasets on the 🤗 Hub are separated into 2 splits: `train` and `validation`; others are split into 3 splits (`train`, `validation` and `test`) -- make sure to use the right split for the right purpose!
+Some datasets on the 🤗 Hub are already separated into these three splits. However, there are also many that only have a train/validation or only train split.
+If the dataset you're using doesn't have a predefined train-test split, it is up to you to define which part of the dataset you want to use for training your model and  which you want to use for hyperparameter tuning or final evaluation.
+<Tip warning={true}>
+Training and evaluating on the same split can misrepresent your results! If you overfit on your training data the evaluation results on that split will look great but the model will perform poorly on new data.
+</Tip>
+Depending on the size of the dataset, you can keep anywhere from 10-30% for evaluation and the rest for training, while aiming to set up the test set to reflect the production data as close as possible. Check out [this thread](https://discuss.huggingface.co/t/how-to-split-main-dataset-into-train-dev-test-as-datasetdict/1090) for a more in-depth discussion of dataset splitting!
+## The impact of class imbalance
+While many academic datasets, such as the [IMDb dataset](https://huggingface.co/datasets/imdb) of movie reviews, are perfectly balanced, most real-world datasets are not. In machine learning a *balanced dataset* corresponds to a dataset where all labels are represented equally. In the case of the IMDb dataset this means that there are as many positive as negative reviews in the dataset. In an imbalanced dataset this is not the case: in fraud detection for example there are usually many more non-fraud cases than fraud cases in the dataset.
+Having an imbalanced dataset can skew the results of your metrics. Imagine a dataset with 99 "non-fraud" cases and 1 "fraud" case. A simple model that always predicts "non-fraud" would yield 99% accuracy, which might sound good at first until you realize that you will never catch a fraud case.
+Often, using more than one metric can help get a better idea of your model’s performance from different points of view. For instance, metrics like **[recall](https://huggingface.co/metrics/recall)** and **[precision](https://huggingface.co/metrics/precision)** can be used together, and the **[f1 score](https://huggingface.co/metrics/f1)** is actually the harmonic mean of the two.
+In cases where a dataset is balanced, using [accuracy](https://huggingface.co/metrics/accuracy) can reflect the overall model performance:
+![Balanced Labels](https://huggingface.co/datasets/evaluate/media/resolve/main/balanced-classes.png)
+In cases where there is an imbalance, using [F1 score](https://huggingface.co/metrics/f1) can be a better representation of performance, given that it encompasses both precision and recall.
+![Imbalanced Labels](https://huggingface.co/datasets/evaluate/media/resolve/main/imbalanced-classes.png)
+Using accuracy in an imbalanced setting is less ideal, since it is not sensitive to minority classes and will not faithfully reflect model performance on them.
+## Offline vs. online model evaluation
+There are multiple ways to evaluate models, and an important distinction is offline versus online evaluation:
+**Offline evaluation** is done before deploying a model or using insights generated from a model, using static datasets and metrics.
+**Online evaluation** means evaluating how a model is performing after deployment and during its use in production.
+These two types of evaluation can use different metrics and measure different aspects of model performance. For example, offline evaluation can compare a model to other models based on their performance on common benchmarks, whereas online evaluation will evaluate aspects such as latency and accuracy of the model based on production data (for example, the number of user queries that it was able to address).
+## Trade-offs in model evaluation
+When evaluating models in practice, there are often trade-offs that have to be made between different aspects of model performance: for instance, choosing a model that is slightly less accurate but that has a faster inference time, compared to a high-accuracy model that has a higher memory footprint and requires access to more GPUs.
+Here are other aspects of model performance to consider during evaluation:
+### Interpretability
+When evaluating models, **interpretability** (i.e. the ability to *interpret* results)  can be very important, especially when deploying models in production.
+For instance, metrics such as [exact match](https://huggingface.co/spaces/evaluate-metric/exact_match) have a set range (between 0 and 1, or 0% and 100%) and are easily understandable to users: for a pair of strings, the exact match score is 1 if the two strings are the exact same, and 0 otherwise.
+Other metrics, such as [BLEU](https://huggingface.co/spaces/evaluate-metric/bleu), are harder to interpret: while they also range between 0 and 1, they can vary greatly depending on which parameters are used to generate the scores, especially when different tokenization and normalization techniques are used (see the [metric card](https://huggingface.co/spaces/evaluate-metric/bleu/blob/main/README.md) for more information about BLEU limitations). This means that it is difficult to interpret a BLEU score without having more information about the procedure used for obtaining it.
+Interpretability can be more or less important depending on the evaluation use case, but it is a useful aspect of model evaluation to keep in mind, since communicating and comparing model evaluations is an important part of responsible machine learning.
+### Inference speed and memory footprint
+While recent years have seen increasingly large ML models achieve high performance on a large variety of tasks and benchmarks, deploying these multi-billion parameter models in practice can be a challenge in itself, and many organizations lack the resources for this. This is why considering the **inference speed** and **memory footprint** of models is important, especially when doing online model evaluation.
+Inference speed refers to the time that it takes for a model to make a prediction -- this will vary depending on the hardware used and the way in which models are queried, e.g. in real time via an API or in batch jobs that run once a day.
+Memory footprint refers to the size of the model weights and how much hardware memory they occupy. If a model is too large to fit on a single GPU or CPU, then it has to be split over multiple ones, which can be more or less difficult depending on the model architecture and the deployment method.
+When doing online model evaluation, there is often a trade-off to be made between inference speed and accuracy or precision, whereas this is less the case for offline evaluation.
+## Limitations and bias
+All models and all metrics have their limitations and biases, which depend on the way in which they were trained, the data that was used, and their intended uses. It is important to measure and communicate these limitations clearly to prevent misuse and unintended impacts, for instance via [model cards](https://huggingface.co/course/chapter4/4?fw=pt) which document the training and evaluation process.
+Measuring biases can be done by evaluating models on datasets such as [Wino Bias](https://huggingface.co/datasets/wino_bias) or [MD Gender Bias](https://huggingface.co/datasets/md_gender_bias), and by doing [Interactive Error Analysis](https://huggingface.co/spaces/nazneen/error-analysis) to try to identify which subsets of the evaluation dataset a model performs poorly on.
+We are currently working on additional measurements that can be used to quantify different dimensions of bias in both models and datasets -- stay tuned for more documentation on this topic!

ST/evaluate/docs/source/creating_and_sharing.mdx ADDED Viewed

	@@ -0,0 +1,113 @@

+# Creating and sharing a new evaluation
+## Setup
+Before you can create a new metric make sure you have all the necessary dependencies installed:
+```bash
+pip install evaluate[template]
+```
+Also make sure your Hugging Face token is registered so you can connect to the Hugging Face Hub:
+```bash
+huggingface-cli login
+```
+## Create
+All evaluation modules, be it metrics, comparisons, or measurements live on the 🤗 Hub in a [Space](https://huggingface.co/docs/hub/spaces) (see for example [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)). In principle, you could setup a new Space and add a new module following the same structure. However, we added a CLI that makes creating a new evaluation module much easier:
+```bash
+evaluate-cli create "My Metric" --module_type "metric"
+```
+This will create a new Space on the 🤗 Hub, clone it locally, and populate it with a template. Instructions on how to fill the template will be displayed in the terminal, but are also explained here in more detail.
+For more information about Spaces, see the [Spaces documentation](https://huggingface.co/docs/hub/spaces).
+## Module script
+The evaluation module script (the file with suffix `*.py`) is the core of the new module and includes all the code for computing the evaluation.
+### Attributes
+Start by adding some information about your evaluation module in [`EvaluationModule._info`]. The most important attributes you should specify are:
+1. [`EvaluationModuleInfo.description`] provides a brief description about your evaluation module.
+2. [`EvaluationModuleInfo.citation`] contains a BibTex citation for the evaluation module.
+3. [`EvaluationModuleInfo.inputs_description`] describes the expected inputs and outputs. It may also provide an example usage of the evaluation module.
+4. [`EvaluationModuleInfo.features`] defines the name and type of the predictions and references. This has to be either a single `datasets.Features` object or a list of `datasets.Features` objects if multiple input types are allowed.
+Then, we can move on to prepare everything before the actual computation.
+### Download
+Some evaluation modules require external data: for example, NLTK needs additional resources and the BLEURT metric needs model checkpoints. You can implement these downloads in [`EvaluationModule._download_and_prepare`], which downloads and caches the resources via the `dl_manager`. Here is a simplified example of how BLEURT downloads and loads a checkpoint:
+```py
+def _download_and_prepare(self, dl_manager):
+    model_path = dl_manager.download_and_extract(CHECKPOINT_URLS[self.config_name])
+    self.scorer = score.BleurtScorer(os.path.join(model_path, self.config_name))
+```
+Or if you need to download the NLTK `"punkt"` resources:
+```py
+def _download_and_prepare(self, dl_manager):
+    import nltk
+    nltk.download("punkt")
+```
+Next, we need to define how the computation of the evaluation module works.
+### Compute
+The computation is performed in the [`EvaluationModule._compute`] method. It takes the same arguments as `EvaluationModuleInfo.features` and should then return the result as a dictionary. Here is an example of an exact match metric:
+```py
+def _compute(self, references, predictions):
+    em = sum([r==p for r, p in zip(references, predictions)])/len(references)
+    return {"exact_match": em}
+```
+This method is used when you call `.compute()` later on.
+## Readme
+When you use the `evaluate-cli` to set up the evaluation module, the Readme structure and instructions are automatically created. The Readme should include a general description of the metric, information about its input/output format, examples, as well as information about its limitations or biases and references.
+## Requirements
+If your evaluation module has additional dependencies (e.g. `sklearn` or `nltk`), the `requirements.txt` file is the place to put them. The file follows the `pip` format and you can list all dependencies there.
+## App
+The `app.py` is where the Spaces widget lives. In general it looks like the following and does not require any changes:
+```py
+import evaluate
+from evaluate.utils import launch_gradio_widget
+module = evaluate.load("lvwerra/element_count")
+launch_gradio_widget(module)
+```
+If you want a custom widget you could add your gradio app here.
+## Push to Hub
+Finally, when you are done with all the above changes it is time to push your evaluation module to the hub. To do so navigate to the folder of your module and git add/commit/push the changes to the hub:
+```
+cd PATH_TO_MODULE
+git add .
+git commit -m "Add my new, shiny module."
+git push
+```
+Tada 🎉! Your evaluation module is now on the 🤗 Hub and ready to be used by everybody!

ST/evaluate/docs/source/custom_evaluator.mdx ADDED Viewed

	@@ -0,0 +1,114 @@

+# Using the `evaluator` with custom pipelines
+The evaluator is designed to work with `transformers` pipelines out-of-the-box. However, in many cases you might have a model or pipeline that's not part of the `transformers` ecosystem. You can still use `evaluator` to easily compute metrics for them. In this guide we show how to do this for a Scikit-Learn [pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline) and a Spacy [pipeline](https://spacy.io). Let's start with the Scikit-Learn case.
+## Scikit-Learn
+First we need to train a model. We'll train a simple text classifier on the [IMDb dataset](https://huggingface.co/datasets/imdb), so let's start by downloading the dataset:
+```py
+from datasets import load_dataset
+ds = load_dataset("imdb")
+```
+Then we can build a simple TF-IDF preprocessor and Naive Bayes classifier wrapped in a `Pipeline`:
+```py
+from sklearn.pipeline import Pipeline
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.feature_extraction.text import CountVectorizer
+text_clf = Pipeline([
+        ('vect', CountVectorizer()),
+        ('tfidf', TfidfTransformer()),
+        ('clf', MultinomialNB()),
+])
+text_clf.fit(ds["train"]["text"], ds["train"]["label"])
+```
+Following the convention in the `TextClassificationPipeline` of `transformers` our pipeline should be callable and return a list of dictionaries. In addition we use the `task` attribute to check if the pipeline is compatible with the `evaluator`. We can write a small wrapper class for that purpose:
+```py
+class ScikitEvalPipeline:
+    def __init__(self, pipeline):
+        self.pipeline = pipeline
+        self.task = "text-classification"
+    def __call__(self, input_texts, **kwargs):
+        return [{"label": p} for p in self.pipeline.predict(input_texts)]
+pipe = ScikitEvalPipeline(text_clf)
+```
+We can now pass this `pipeline` to the `evaluator`:
+```py
+from evaluate import evaluator
+task_evaluator = evaluator("text-classification")
+task_evaluator.compute(pipe, ds["test"], "accuracy")
+>>> {'accuracy': 0.82956}
+```
+Implementing that simple wrapper is all that's needed to use any model from any framework with the `evaluator`. In the `__call__` you can implement all logic necessary for efficient forward passes through your model.
+## Spacy
+We'll use the `polarity` feature of the `spacytextblob` project to get a simple sentiment analyzer. First you'll need to install the project and download the resources:
+```bash
+pip install spacytextblob
+python -m textblob.download_corpora
+python -m spacy download en_core_web_sm
+```
+Then we can simply load the `nlp` pipeline and add the `spacytextblob` pipeline:
+```py
+import spacy
+nlp = spacy.load('en_core_web_sm')
+nlp.add_pipe('spacytextblob')
+```
+This snippet shows how we can use the `polarity` feature added with `spacytextblob` to get the sentiment of a text:
+```py
+texts = ["This movie is horrible", "This movie is awesome"]
+results = nlp.pipe(texts)
+for txt, res in zip(texts, results):
+    print(f"{txt} | Polarity: {res._.blob.polarity}")
+```
+Now we can wrap it in a simple wrapper class like in the Scikit-Learn example before. It just has to return a list of dictionaries with the predicted labels. If the polarity is greater than or equal to 0 we'll predict positive sentiment and negative otherwise:
+```py
+class SpacyEvalPipeline:
+    def __init__(self, nlp):
+        self.nlp = nlp
+        self.task = "text-classification"
+    def __call__(self, input_texts, **kwargs):
+        results =[]
+        for p in self.nlp.pipe(input_texts):
+            if p._.blob.polarity>=0:
+                results.append({"label": 1})
+            else:
+                results.append({"label": 0})
+        return results
+pipe = SpacyEvalPipeline(nlp)
+```
+That class is compatible with the `evaluator` and we can use the same instance from the previous example along with the IMDb test set:
+```py
+task_evaluator.compute(pipe, ds["test"], "accuracy")
+>>> {'accuracy': 0.6914}
+```
+This will take a little longer than the Scikit-Learn example but after roughly 10-15min you will have the evaluation results!

ST/evaluate/docs/source/evaluation_suite.mdx ADDED Viewed

	@@ -0,0 +1,74 @@

+# Creating an EvaluationSuite
+It can be useful to evaluate models on a variety of different tasks to understand their downstream performance. Assessing the model on several types of tasks can reveal gaps in performance along some axis. For example, when training a language model, it is often useful to measure perplexity on an in-domain corpus, but also to concurrently evaluate on tasks which test for general language capabilities like natural language entailment or question-answering, or tasks designed to probe the model along fairness and bias dimensions.
+The `EvaluationSuite` provides a way to compose any number of ([evaluator](base_evaluator), dataset, metric) tuples as a SubTask to evaluate a model on a collection of several evaluation tasks. See the [evaluator documentation](base_evaluator) for a list of currently supported tasks.
+A new `EvaluationSuite` is made up of a list of `SubTask` classes, each defining an evaluation task. The Python file containing the definition can be uploaded to a Space on the Hugging Face Hub so it can be shared with the community or saved/loaded locally as a Python script.
+Some datasets require additional preprocessing before passing them to an `Evaluator`. You can set a `data_preprocessor` for each `SubTask` which is applied via a `map` operation using the `datasets` library. Keyword arguments for the `Evaluator` can be passed down through the `args_for_task` attribute.
+To create a new `EvaluationSuite`, create a [new Space](https://huggingface.co/new-space) with a .py file which matches the name of the Space, add the below template to a Python file, and fill in the attributes for a new task.
+The mandatory attributes for a new `SubTask` are `task_type` and `data`.
+1. [`task_type`] maps to the tasks currently supported by the Evaluator.
+2. [`data`] can be an instantiated Hugging Face dataset object or the name of a dataset.
+3. [`subset`] and [`split`] can be used to define which name and split of the dataset should be used for evaluation.
+4. [`args_for_task`] should be a dictionary with kwargs to be passed to the Evaluator.
+```python
+import evaluate
+from evaluate.evaluation_suite import SubTask
+class Suite(evaluate.EvaluationSuite):
+    def __init__(self, name):
+        super().__init__(name)
+        self.preprocessor = lambda x: {"text": x["text"].lower()}
+        self.suite = [
+            SubTask(
+                task_type="text-classification",
+                data="glue",
+                subset="sst2",
+                split="validation[:10]",
+                args_for_task={
+                    "metric": "accuracy",
+                    "input_column": "sentence",
+                    "label_column": "label",
+                    "label_mapping": {
+                        "LABEL_0": 0.0,
+                        "LABEL_1": 1.0
+                    }
+                }
+            ),
+            SubTask(
+                task_type="text-classification",
+                data="glue",
+                subset="rte",
+                split="validation[:10]",
+                args_for_task={
+                    "metric": "accuracy",
+                    "input_column": "sentence1",
+                    "second_input_column": "sentence2",
+                    "label_column": "label",
+                    "label_mapping": {
+                        "LABEL_0": 0,
+                        "LABEL_1": 1
+                    }
+                }
+            )
+        ]
+```
+An `EvaluationSuite` can be loaded by name from the Hugging Face Hub, or locally by providing a path, and run with the `run(model_or_pipeline)` method. The evaluation results are returned along with their task names and information about the time it took to obtain predictions through the pipeline. These can be easily displayed with a `pandas.DataFrame`:
+```
+>>> from evaluate import EvaluationSuite
+>>> suite = EvaluationSuite.load('mathemakitten/glue-evaluation-suite')
+>>> results = suite.run("gpt2")
+```
+|   accuracy |   total_time_in_seconds |   samples_per_second |   latency_in_seconds | task_name   |
+|-----------:|------------------------:|---------------------:|---------------------:|:------------|
+|        0.5 |                0.740811 |             13.4987  |            0.0740811 | glue/sst2   |
+|        0.4 |                1.67552  |              5.9683  |            0.167552  | glue/rte    |

ST/evaluate/docs/source/index.mdx ADDED Viewed

	@@ -0,0 +1,34 @@

+<p align="center">
+    <br>
+    <img src="https://huggingface.co/datasets/evaluate/media/resolve/main/evaluate-banner.png" width="400"/>
+    <br>
+</p>
+# 🤗 Evaluate
+A library for easily evaluating machine learning models and datasets.
+With a single line of code, you get access to dozens of evaluation methods for different domains (NLP, Computer Vision, Reinforcement Learning, and more!). Be it on your local machine or in a distributed training setup, you can evaluate your models in a consistent and reproducible way!
+Visit the 🤗 Evaluate [organization](https://huggingface.co/evaluate-metric) for a full list of available metrics. Each metric has a dedicated Space with an interactive demo for how to use the metric, and a documentation card detailing the metric's limitations and usage.
+<div class="mt-10">
+  <div class="w-full flex flex-col space-y-4 md:space-y-0 md:grid md:grid-cols-2 md:gap-y-4 md:gap-x-5">
+    <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./installation"
+      ><div class="w-full text-center bg-gradient-to-br from-blue-400 to-blue-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Tutorials</div>
+      <p class="text-gray-700">Learn the basics and become familiar with loading, computing, and saving with 🤗 Evaluate. Start here if you are using 🤗 Evaluate for the first time!</p>
+    </a>
+    <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./choosing_a_metric"
+      ><div class="w-full text-center bg-gradient-to-br from-indigo-400 to-indigo-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">How-to guides</div>
+      <p class="text-gray-700">Practical guides to help you achieve a specific goal. Take a look at these guides to learn how to use 🤗 Evaluate to solve real-world problems.</p>
+    </a>
+    <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./types_of_evaluations"
+      ><div class="w-full text-center bg-gradient-to-br from-pink-400 to-pink-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Conceptual guides</div>
+      <p class="text-gray-700">High-level explanations for building a better understanding of important topics such as considerations going into evaluating a model or dataset and the difference between metrics, measurements, and comparisons.</p>
+   </a>
+    <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./package_reference/main_classes"
+      ><div class="w-full text-center bg-gradient-to-br from-purple-400 to-purple-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Reference</div>
+      <p class="text-gray-700">Technical descriptions of how 🤗 Evaluate classes and methods work.</p>
+    </a>
+  </div>
+</div>

ST/evaluate/docs/source/installation.mdx ADDED Viewed

	@@ -0,0 +1,68 @@

+# Installation
+Before you start, you will need to set up your environment and install the appropriate packages. 🤗 Evaluate is tested on **Python 3.7+**.
+## Virtual environment
+You should install 🤗 Evaluate in a [virtual environment](https://docs.python.org/3/library/venv.html) to keep everything neat and tidy.
+1. Create and navigate to your project directory:
+   ```bash
+   mkdir ~/my-project
+   cd ~/my-project
+   ```
+2. Start a virtual environment inside the directory:
+   ```bash
+   python -m venv .env
+   ```
+3. Activate and deactivate the virtual environment with the following commands:
+   ```bash
+   # Activate the virtual environment
+   source .env/bin/activate
+   # Deactivate the virtual environment
+   deactivate
+   ```
+Once you have created your virtual environment, you can install 🤗 Evaluate in it.
+## pip
+The most straightforward way to install 🤗 Evaluate is with pip:
+```bash
+pip install evaluate
+```
+Run the following command to check if 🤗 Evaluate has been properly installed:
+```bash
+python -c "import evaluate; print(evaluate.load('exact_match').compute(references=['hello'], predictions=['hello']))"
+```
+This should return:
+```bash
+{'exact_match': 1.0}
+```
+## source
+Building 🤗 Evaluate from source lets you make changes to the code base. To install from source, clone the repository and install with the following commands:
+```bash
+git clone https://github.com/huggingface/evaluate.git
+cd evaluate
+pip install -e .
+```
+Again, you can check if 🤗 Evaluate has been properly installed with:
+```bash
+python -c "import evaluate; print(evaluate.load('exact_match').compute(references=['hello'], predictions=['hello']))"
+```

ST/evaluate/docs/source/keras_integrations.md ADDED Viewed

	@@ -0,0 +1,113 @@

+# Working with Keras and Tensorflow
+Evaluate can be easily integrated into your Keras and Tensorflow workflow. We'll demonstrate two ways of incorporating Evaluate into model training, using the Fashion MNIST example dataset. We'll train a standard classifier to predict two classes from this dataset, and show how to use a metric as a callback during training or afterwards for evaluation.
+```python
+import numpy as np
+from tensorflow import keras
+from tensorflow.keras import layers
+import evaluate
+# We pull example code from Keras.io's guide on classifying with MNIST
+# Located here: https://keras.io/examples/vision/mnist_convnet/
+# Model / data parameters
+input_shape = (28, 28, 1)
+# Load the data and split it between train and test sets
+(x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()
+# Only select tshirts/tops and trousers, classes 0 and 1
+def get_tshirts_tops_and_trouser(x_vals, y_vals):
+    mask = np.where((y_vals == 0) | (y_vals == 1))
+    return x_vals[mask], y_vals[mask]
+x_train, y_train = get_tshirts_tops_and_trouser(x_train, y_train)
+x_test, y_test = get_tshirts_tops_and_trouser(x_test, y_test)
+# Scale images to the [0, 1] range
+x_train = x_train.astype("float32") / 255
+x_test = x_test.astype("float32") / 255
+x_train = np.expand_dims(x_train, -1)
+x_test = np.expand_dims(x_test, -1)
+model = keras.Sequential(
+    [
+        keras.Input(shape=input_shape),
+        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
+        layers.MaxPooling2D(pool_size=(2, 2)),
+        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
+        layers.MaxPooling2D(pool_size=(2, 2)),
+        layers.Flatten(),
+        layers.Dropout(0.5),
+        layers.Dense(1, activation="sigmoid"),
+    ]
+)
+```
+## Callbacks
+Suppose we want to keep track of model metrics while a model is training. We can use a Callback in order to calculate this metric during training, after an epoch ends.
+We'll define a callback here that will take a metric name and our training data, and have it calculate a metric after the epoch ends.
+```python
+class MetricsCallback(keras.callbacks.Callback):
+    def __init__(self, metric_name, x_data, y_data) -> None:
+        super(MetricsCallback, self).__init__()
+        self.x_data = x_data
+        self.y_data = y_data
+        self.metric_name = metric_name
+        self.metric = evaluate.load(metric_name)
+    def on_epoch_end(self, epoch, logs=dict()):
+        m = self.model
+        # Ensure we get labels of "1" or "0"
+        training_preds = np.round(m.predict(self.x_data))
+        training_labels = self.y_data
+        # Compute score and save
+        score = self.metric.compute(predictions = training_preds, references = training_labels)
+        logs.update(score)
+```
+We can pass this class to the `callbacks` keyword-argument to use it during training:
+```python
+batch_size = 128
+epochs = 2
+model.compile(loss="binary_crossentropy", optimizer="adam")
+model_history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1,
+callbacks = [MetricsCallback(x_data = x_train, y_data = y_train, metric_name = "accuracy")])
+```
+## Using an Evaluate Metric for... Evaluation!
+We can also use the same metric after model training! Here, we show how to check accuracy of the model after training on the test set:
+```python
+acc = evaluate.load("accuracy")
+# Round the predictions to turn them into "0" or "1" labels
+test_preds = np.round(model.predict(x_test))
+test_labels = y_test
+```
+```python
+print("Test accuracy is : ", acc.compute(predictions = test_preds, references = test_labels))
+# Test accuracy is : 0.9855
+```

ST/evaluate/docs/source/package_reference/evaluator_classes.mdx ADDED Viewed

	@@ -0,0 +1,63 @@

+# Evaluator
+The evaluator classes for automatic evaluation.
+## Evaluator classes
+The main entry point for using the evaluator:
+[[autodoc]] evaluate.evaluator
+The base class for all evaluator classes:
+[[autodoc]] evaluate.Evaluator
+## The task specific evaluators
+### ImageClassificationEvaluator
+[[autodoc]] evaluate.ImageClassificationEvaluator
+### QuestionAnsweringEvaluator
+[[autodoc]] evaluate.QuestionAnsweringEvaluator
+    - compute
+### TextClassificationEvaluator
+[[autodoc]] evaluate.TextClassificationEvaluator
+### TokenClassificationEvaluator
+[[autodoc]] evaluate.TokenClassificationEvaluator
+    - compute
+### TextGenerationEvaluator
+[[autodoc]] evaluate.TextGenerationEvaluator
+    - compute
+### Text2TextGenerationEvaluator
+[[autodoc]] evaluate.Text2TextGenerationEvaluator
+    - compute
+### SummarizationEvaluator
+[[autodoc]] evaluate.SummarizationEvaluator
+    - compute
+### TranslationEvaluator
+[[autodoc]] evaluate.TranslationEvaluator
+    - compute
+### AutomaticSpeechRecognitionEvaluator
+[[autodoc]] evaluate.AutomaticSpeechRecognitionEvaluator
+    - compute
+### AudioClassificationEvaluator
+[[autodoc]] evaluate.AudioClassificationEvaluator
+    - compute

ST/evaluate/docs/source/package_reference/hub_methods.mdx ADDED Viewed

	@@ -0,0 +1,8 @@

+# Hub methods
+Methods for using the Hugging Face Hub:
+## Push to hub
+[[autodoc]] evaluate.push_to_hub

ST/evaluate/docs/source/package_reference/loading_methods.mdx ADDED Viewed

	@@ -0,0 +1,11 @@

+# Loading methods
+Methods for listing and loading evaluation modules:
+## List
+[[autodoc]] evaluate.list_evaluation_modules
+## Load
+[[autodoc]] evaluate.load

ST/evaluate/docs/source/package_reference/logging_methods.mdx ADDED Viewed

	@@ -0,0 +1,89 @@

+# Logging methods
+🤗 Evaluate strives to be transparent and explicit about how it works, but this can be quite verbose at times. We have included a series of logging methods which allow you to easily adjust the level of verbosity of the entire library. Currently the default verbosity of the library is set to `WARNING`.
+To change the level of verbosity, use one of the direct setters. For instance, here is how to change the verbosity to the `INFO` level:
+```py
+import evaluate
+evaluate.logging.set_verbosity_info()
+```
+You can also use the environment variable `EVALUATE_VERBOSITY` to override the default verbosity, and set it to one of the following: `debug`, `info`, `warning`, `error`, `critical`:
+```bash
+EVALUATE_VERBOSITY=error ./myprogram.py
+```
+All the methods of this logging module are documented below. The main ones are:
+- [`logging.get_verbosity`] to get the current level of verbosity in the logger
+- [`logging.set_verbosity`] to set the verbosity to the level of your choice
+In order from the least to the most verbose (with their corresponding `int` values):
+1. `logging.CRITICAL` or `logging.FATAL` (int value, 50): only report the most critical errors.
+2. `logging.ERROR` (int value, 40): only report errors.
+3. `logging.WARNING` or `logging.WARN` (int value, 30): only report errors and warnings. This is the default level used by the library.
+4. `logging.INFO` (int value, 20): report errors, warnings and basic information.
+5. `logging.DEBUG` (int value, 10): report all information.
+By default, `tqdm` progress bars will be displayed during evaluate download and processing. [`logging.disable_progress_bar`] and [`logging.enable_progress_bar`] can be used to suppress or unsuppress this behavior.
+## Functions
+[[autodoc]] evaluate.logging.get_verbosity
+[[autodoc]] evaluate.logging.set_verbosity
+[[autodoc]] evaluate.logging.set_verbosity_info
+[[autodoc]] evaluate.logging.set_verbosity_warning
+[[autodoc]] evaluate.logging.set_verbosity_debug
+[[autodoc]] evaluate.logging.set_verbosity_error
+[[autodoc]] evaluate.logging.disable_propagation
+[[autodoc]] evaluate.logging.enable_propagation
+[[autodoc]] evaluate.logging.get_logger
+[[autodoc]] evaluate.logging.enable_progress_bar
+[[autodoc]] evaluate.logging.disable_progress_bar
+## Levels
+### evaluate.logging.CRITICAL
+evaluate.logging.CRITICAL = 50
+### evaluate.logging.DEBUG
+evaluate.logging.DEBUG = 10
+### evaluate.logging.ERROR
+evaluate.logging.ERROR = 40
+### evaluate.logging.FATAL
+evaluate.logging.FATAL = 50
+### evaluate.logging.INFO
+evaluate.logging.INFO = 20
+### evaluate.logging.NOTSET
+evaluate.logging.NOTSET = 0
+### evaluate.logging.WARN
+evaluate.logging.WARN = 30
+### evaluate.logging.WARNING
+evaluate.logging.WARNING = 30

ST/evaluate/docs/source/package_reference/main_classes.mdx ADDED Viewed

	@@ -0,0 +1,33 @@

+# Main classes
+## EvaluationModuleInfo
+The base class `EvaluationModuleInfo` implements the logic for the subclasses `MetricInfo`, `ComparisonInfo`, and `MeasurementInfo`.
+[[autodoc]] evaluate.EvaluationModuleInfo
+[[autodoc]] evaluate.MetricInfo
+[[autodoc]] evaluate.ComparisonInfo
+[[autodoc]] evaluate.MeasurementInfo
+## EvaluationModule
+The base class `EvaluationModule` implements the logic for the subclasses `Metric`, `Comparison`, and `Measurement`.
+[[autodoc]] evaluate.EvaluationModule
+[[autodoc]] evaluate.Metric
+[[autodoc]] evaluate.Comparison
+[[autodoc]] evaluate.Measurement
+## CombinedEvaluations
+The `combine` function allows you to combine multiple `EvaluationModule`s into a single `CombinedEvaluations`.
+[[autodoc]] evaluate.combine
+[[autodoc]] CombinedEvaluations

ST/evaluate/docs/source/package_reference/saving_methods.mdx ADDED Viewed

	@@ -0,0 +1,8 @@

+# Saving methods
+Methods for saving evaluation results:
+## Save
+[[autodoc]] evaluate.save

ST/evaluate/docs/source/package_reference/visualization_methods.mdx ADDED Viewed

	@@ -0,0 +1,7 @@

+# Visualization methods
+Methods for visualizing evaluation results:
+## Radar Plot
+[[autodoc]] evaluate.visualization.radar_plot

ST/evaluate/docs/source/sklearn_integrations.mdx ADDED Viewed

	@@ -0,0 +1,88 @@

+# Scikit-Learn
+To run the scikit-learn examples make sure you have installed the following library:
+```bash
+pip install -U scikit-learn
+```
+The metrics in `evaluate` can be easily integrated with a Scikit-Learn estimator or [pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline).
+However, these metrics require that we generate the predictions from the model. The predictions and labels from the estimators can be passed to `evaluate` metrics to compute the required values.
+```python
+import numpy as np
+np.random.seed(0)
+import evaluate
+from sklearn.compose import ColumnTransformer
+from sklearn.datasets import fetch_openml
+from sklearn.pipeline import Pipeline
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+```
+Load data from https://www.openml.org/d/40945:
+```python
+X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
+```
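+Alternatively, if you fetch the dataset as a single object (`titanic = fetch_openml("titanic", version=1, as_frame=True)`), X and y can be obtained directly from its `frame` attribute: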
+```python
+X = titanic.frame.drop('survived', axis=1)
+y = titanic.frame['survived']
+```
+We create the preprocessing pipelines for both numeric and categorical data. Note that pclass could either be treated as a categorical or numeric feature.
+```python
+numeric_features = ["age", "fare"]
+numeric_transformer = Pipeline(
+    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
+)
+categorical_features = ["embarked", "sex", "pclass"]
+categorical_transformer = OneHotEncoder(handle_unknown="ignore")
+preprocessor = ColumnTransformer(
+    transformers=[
+        ("num", numeric_transformer, numeric_features),
+        ("cat", categorical_transformer, categorical_features),
+    ]
+)
+```
+Append classifier to preprocessing pipeline. Now we have a full prediction pipeline.
+```python
+clf = Pipeline(
+    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
+)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+clf.fit(X_train, y_train)
+y_pred = clf.predict(X_test)
+```
+As `Evaluate` metrics use lists as inputs for references and predictions, we need to convert them to Python lists.
+```python
+# Evaluate metrics accept lists as inputs for values of references and predictions
+y_test = y_test.tolist()
+y_pred = y_pred.tolist()
+# Accuracy
+accuracy_metric = evaluate.load("accuracy")
+accuracy = accuracy_metric.compute(references=y_test, predictions=y_pred)
+print("Accuracy:", accuracy)
+# Accuracy: 0.79
+```
+You can use any suitable `evaluate` metric with the estimators as long as they are compatible with the task and predictions.