kye
/

Andromeda

Model card Files Files and versions Community

kye committed on Aug 16, 2023

Commit

ca4fc4d

1 Parent(s): f47571d

Upload 73 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.github/ISSUE_TEMPLATE/---bug-report.md +36 -0
.github/ISSUE_TEMPLATE/---feature-request.md +25 -0
.github/ISSUE_TEMPLATE/---model-questions.md +17 -0
.github/mcp/mcp_pytest.py +139 -0
.github/workflows/FUNDING.md +13 -0
.github/workflows/code-quality.yaml +44 -0
.github/workflows/codeql-analysis.yml +70 -0
.github/workflows/coverage.yaml +32 -0
.github/workflows/docker.yaml +62 -0
.github/workflows/pr-cpu.yaml +43 -0
.github/workflows/pr-gpu.yaml +40 -0
.github/workflows/pytest-cpu.yaml +48 -0
.github/workflows/pytest-gpu.yaml +80 -0
.github/workflows/python-publish.yml +39 -0
.github/workflows/release.yaml +60 -0
.gitignore +2 -0
Andromeda/README.md +72 -167
Andromeda/__init__.py +3 -0
Andromeda/configs.py +128 -0
Andromeda/core/__init__.py +8 -0
Andromeda/core/attend.py +252 -0
Andromeda/core/autoregressive_wrapper.py +150 -0
Andromeda/core/flash.py +289 -0
Andromeda/core/transformer.py +1376 -0
Andromeda/dataset_prep/__init__.py +0 -0
Andromeda/dataset_prep/books.py +12 -0
Andromeda/inference.py +198 -0
Andromeda/model.py +118 -0
Andromeda/old/__init__.py +0 -0
Andromeda/old/sophia.py +200 -0
Andromeda/old/training.py +294 -0
Andromeda/old/training_1.py +350 -0
Andromeda/old/training_sophia.py +369 -0
Andromeda/train.py +27 -32
Andromeda/utils/__init__.py +0 -0
Andromeda/utils/decoupled_optimizer.py +147 -0
Andromeda/utils/helpers.py +17 -0
Andromeda/utils/rf_utils.py +186 -0
Andromeda/utils/stable_adamw.py +96 -0
DOCs/Corporation/MONETIZATION.md +51 -0
DOCs/Design/Dyson.md +26 -0
DOCs/Design/MODEL_ARCHITECTURE.md +57 -0
DOCs/Design/SPEED.md +11 -0
DOCs/Design/Specs.md +196 -0
DOCs/Docs/DOCUMENTATION.md +145 -0
DOCs/Docs/TRAINING.md +82 -0
DOCs/Docs/Training/DATASET_STRATEGY.md +100 -0
DOCs/Tests/BENCHMARKING.md +111 -0
FailureAnalysis/CPU_MEMORY.md +489 -0
FailureAnalysis/OptimizerDict.md +238 -0

.github/ISSUE_TEMPLATE/---bug-report.md ADDED Viewed

	@@ -0,0 +1,36 @@

+---
+name: "\U0001F41B Bug report"
+about: Submit a bug report to improve our library!
+title: ''
+labels: bug
+assignees: ''
+---
+<!-- Please check for related issues (both open and closed) before filing this issue. -->
+## Environment
+<!-- Please copy and paste the output of running `composer_collect_env` below. -->
+<!--
+    If you can't install composer for some reason, you can also use the PyTorch collect env script
+    wget https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py
+    # For security purposes, please check the contents of collect_env.py before running it.
+    python collect_env.py
+-->
+## To reproduce
+Steps to reproduce the behavior:
+1.
+2.
+3.
+## Expected behavior
+<!-- A clear and concise description of what you would expect to happen. -->
+## Additional context
+<!-- Please provide any additional context. -->

.github/ISSUE_TEMPLATE/---feature-request.md ADDED Viewed

	@@ -0,0 +1,25 @@

+---
+name: "\U0001F680 Feature request"
+about: Suggest an idea for this project
+title: ''
+labels: enhancement
+assignees: ''
+---
+<!-- Please check for related feature requests (both open and closed) before filing this request. -->
+## 🚀 Feature Request
+<!-- A clear and concise description of the feature proposal -->
+## Motivation
+<!-- Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too -->
+## [Optional] Implementation
+<!-- Optionally, sketch out an implementation or interface needed. -->
+## Additional context
+<!-- Add any other context or screenshots about the feature request here. -->

.github/ISSUE_TEMPLATE/---model-questions.md ADDED Viewed

	@@ -0,0 +1,17 @@

+---
+name: "\U00002753 Model-related question"
+about: Ask a question about using our released models
+title: ''
+labels: question
+assignees: ''
+---
+<!-- Please check for related questions (both open and closed) before filing this question. -->
+## ❓ Question
+<!-- A clear and concise description of the question -->
+## Additional context
+<!-- Add any other context or screenshots about the question here. -->

.github/mcp/mcp_pytest.py ADDED Viewed

	@@ -0,0 +1,139 @@

+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+"""Run pytest using MCP."""
+import argparse
+import time
+from mcli.sdk import (RunConfig, RunStatus, create_run, follow_run_logs,
+                      stop_run, wait_for_run_status)
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--name',
+                        type=str,
+                        default='mcp-pytest',
+                        help='Base name of run')
+    parser.add_argument('--cluster',
+                        type=str,
+                        default='r1z4',
+                        help='Cluster to use')
+    parser.add_argument('--gpu_type',
+                        type=str,
+                        default='a100_40gb',
+                        help='Type of GPU to use')
+    parser.add_argument('--gpu_num',
+                        type=int,
+                        default=2,
+                        help='Number of the GPU to use')
+    parser.add_argument('--image',
+                        type=str,
+                        default='mosaicml/pytorch:latest',
+                        help='Docker image to use')
+    parser.add_argument('--git_branch',
+                        type=str,
+                        help='Git branch to check out')
+    parser.add_argument(
+        '--git_commit',
+        type=str,
+        help='Git commit to check out. Overrides git_branch if specified')
+    parser.add_argument(
+        '--pr_number',
+        type=int,
+        help=
+        'PR number to check out. Overrides git_branch/git_commit if specified')
+    parser.add_argument('--pytest_markers',
+                        type=str,
+                        help='Markers to pass to pytest')
+    parser.add_argument('--pytest_command',
+                        type=str,
+                        help='Command to run pytest')
+    parser.add_argument('--timeout',
+                        type=int,
+                        default=1800,
+                        help='Timeout for run (in seconds)')
+    args = parser.parse_args()
+    name = args.name
+    git_integration = {
+        'integration_type': 'git_repo',
+        'git_repo': 'mosaicml/llm-foundry',
+        'ssh_clone': 'False',
+    }
+    if args.git_branch is not None and args.git_commit is None:
+        name += f'-branch-{args.git_branch}'
+        git_integration['git_branch'] = args.git_branch
+    if args.git_commit is not None:
+        name += f'-commit-{args.git_commit}'
+        git_integration['git_commit'] = args.git_commit
+    command = 'cd llm-foundry'
+    # Checkout a specific PR if specified
+    if args.pr_number is not None:
+        name += f'-pr-{args.pr_number}'
+        command += f'''
+        git fetch origin pull/{args.pr_number}/head:pr_branch
+        git checkout pr_branch
+        '''
+    # Shorten name if too long
+    if len(name) > 56:
+        name = name[:56]
+    command += f'''
+    pip install --upgrade --user .[all]
+    export COMMON_ARGS="-v --durations=20 -m '{args.pytest_markers}'"
+    make test PYTEST='{args.pytest_command}' EXTRA_ARGS="$COMMON_ARGS --codeblocks"
+    make test-dist PYTEST='{args.pytest_command}' EXTRA_ARGS="$COMMON_ARGS" WORLD_SIZE=2
+    python -m coverage combine
+    python -m coverage report
+    '''
+    config = RunConfig(
+        name=name,
+        cluster=args.cluster,
+        gpu_type=args.gpu_type,
+        gpu_num=args.gpu_num,
+        image=args.image,
+        integrations=[git_integration],
+        command=command,
+    )
+    # Create run
+    run = create_run(config)
+    print(f'[GHA] Run created: {run.name}')
+    # Wait until run starts before fetching logs
+    run = wait_for_run_status(run, status='running')
+    start_time = time.time()
+    print('[GHA] Run started. Following logs...')
+    # Print logs
+    for line in follow_run_logs(run):
+        print(line, end='')
+        # Check if args.timeout seconds have elapsed
+        if time.time() - start_time > args.timeout:
+            print(
+                f'[GHA] Run timed out and did not complete in {args.timeout/60} minutes.'
+            )
+            run = stop_run(run)
+            print('[GHA] Run stopped.')
+            break
+    print('[GHA] Run completed. Waiting for run to finish...')
+    run = wait_for_run_status(run, status='completed')
+    # Fail if command exited with non-zero exit code or timed out
+    assert run.status == RunStatus.COMPLETED

.github/workflows/FUNDING.md ADDED Viewed

	@@ -0,0 +1,13 @@

+# These are supported funding model platforms
+github: [kyegomez]
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: # Replace with a single Ko-fi username
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
+custom: #Nothing

.github/workflows/code-quality.yaml ADDED Viewed

	@@ -0,0 +1,44 @@

+name: Code Quality Checks
+on:
+  push:
+    branches:
+    - main
+    - release/**
+  pull_request:
+    branches:
+    - main
+    - release/**
+  workflow_call:
+  workflow_dispatch:
+# Cancel old runs when a new commit is pushed to the same branch, unless that branch is main
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+defaults:
+  run:
+    working-directory: .
+jobs:
+  code-quality:
+    runs-on: ubuntu-20.04
+    timeout-minutes: 10
+    strategy:
+      matrix:
+        python_version:
+        - '3.8'
+        - '3.9'
+        - '3.10'
+        pip_deps:
+        - '[dev]'
+    steps:
+    - uses: actions/checkout@v3
+    - uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python_version }}
+    - name: Setup
+      run: |
+        set -ex
+        python -m pip install --upgrade 'pip<23' wheel
+        python -m pip install --upgrade .${{ matrix.pip_deps }}
+    - name: Run checks
+      run: |
+        pre-commit run --all-files

.github/workflows/codeql-analysis.yml ADDED Viewed

	@@ -0,0 +1,70 @@

+# For most projects, this workflow file will not need changing; you simply need
+# to commit it to your repository.
+#
+# You may wish to alter this file to override the set of languages analyzed,
+# or to provide custom queries or build logic.
+#
+# ******** NOTE ********
+# We have attempted to detect the languages in your repository. Please check
+# the `language` matrix defined below to confirm you have the correct set of
+# supported CodeQL languages.
+#
+name: 'CodeQL'
+on:
+  push:
+    branches: [main]
+  pull_request:
+    # The branches below must be a subset of the branches above
+    branches: [main]
+  schedule:
+  - cron: '0 9 * * 1'  # Every Monday at 09:00 UTC
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+    strategy:
+      fail-fast: false
+      matrix:
+        language: ['python']
+        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
+        # Learn more about CodeQL language support at https://git.io/codeql-language-support
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v2
+    # Initializes the CodeQL tools for scanning.
+    - name: Initialize CodeQL
+      uses: github/codeql-action/init@v2
+      with:
+        languages: ${{ matrix.language }}
+        # If you wish to specify custom queries, you can do so here or in a config file.
+        # By default, queries listed here will override any specified in a config file.
+        # Prefix the list here with "+" to use these queries and those in the config file.
+        # queries: ./path/to/local/query, your-org/your-repo/queries@main
+    # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
+    # If this step fails, then you should remove it and run the build manually (see below)
+    - name: Autobuild
+      uses: github/codeql-action/autobuild@v2
+    # ℹ️ Command-line programs to run using the OS shell.
+    # 📚 https://git.io/JvXDl
+    # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
+    #    and modify them (or add more) to build your code if your project
+    #    uses a compiled language
+    # - run: |
+    #   make bootstrap
+    #   make release
+    - name: Perform CodeQL Analysis
+      uses: github/codeql-action/analyze@v2

.github/workflows/coverage.yaml ADDED Viewed

	@@ -0,0 +1,32 @@

+name: PyTest Coverage
+on:
+  workflow_call:
+    inputs:
+      download-path:
+        required: true
+        type: string
+jobs:
+  coverage:
+    timeout-minutes: 5
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Setup
+      run: |
+        set -ex
+        python -m pip install --upgrade 'pip<23' wheel
+        pip install coverage[toml]==6.5.0
+    - name: Download artifacts
+      uses: actions/download-artifact@v3
+      with:
+        path: ${{ inputs.download-path }}
+    - name: Generate coverage report
+      run: |
+        set -ex
+        # Flatten the coverage files
+        ls ${{ inputs.download-path }} | while read x; do mv ${{ inputs.download-path }}/$x/.coverage .coverage.$x; done
+        python -m coverage combine
+        python -m coverage report

.github/workflows/docker.yaml ADDED Viewed

	@@ -0,0 +1,62 @@

+name: Docker
+on:
+  push:
+    branches:
+    - main
+  workflow_dispatch: {}
+jobs:
+  docker-build:
+    runs-on: ubuntu-latest
+    if: github.repository_owner == 'mosaicml'
+    strategy:
+      matrix:
+        include:
+        - name: '1.13.1_cu117'
+          base_image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
+        - name: '2.0.1_cu118'
+          base_image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04
+    steps:
+    - name: Maximize Build Space on Worker
+      uses: easimon/maximize-build-space@v4
+      with:
+        overprovision-lvm: true
+        remove-dotnet: true
+        remove-android: true
+        remove-haskell: true
+    - name: Checkout
+      uses: actions/checkout@v3
+    - name: Setup QEMU
+      uses: docker/setup-qemu-action@v2
+    - name: Setup Docker Buildx
+      uses: docker/setup-buildx-action@v2
+    - name: Login to DockerHub
+      uses: docker/login-action@v2
+      with:
+        username: ${{ secrets.DOCKER_HUB_USERNAME }}
+        password: ${{ secrets.DOCKER_HUB_PASSWORD }}
+    - name: Calculate Docker Image Variables
+      run: |
+        set -euxo pipefail
+        ###################
+        # Calculate the tag
+        ###################
+        GIT_SHA=$(echo ${{ github.sha }} | cut -c1-7)
+        echo "IMAGE_TAG=${GIT_SHA}" >> ${GITHUB_ENV}
+    - name: Build and Push the Docker Image
+      uses: docker/build-push-action@v3
+      with:
+        context: .
+        tags: mosaicml/llm-foundry:${{ matrix.name }}-latest,
+          mosaicml/llm-foundry:${{ matrix.name }}-${{ env.IMAGE_TAG }}
+        push: true
+        cache-from: type=registry,ref=mosaicml/llm-foundry:${{ matrix.name }}-buildcache
+        cache-to: type=registry,ref=mosaicml/llm-foundry:${{ matrix.name }}-buildcache,mode=max
+        build-args: BASE_IMAGE=${{ matrix.base_image }}

.github/workflows/pr-cpu.yaml ADDED Viewed

	@@ -0,0 +1,43 @@

+name: PR CPU tests
+on:
+  push:
+    branches:
+    - main
+    - release/*
+  pull_request:
+    branches:
+    - main
+    - release/*
+  workflow_dispatch:
+# Cancel old runs when a new commit is pushed to the same branch, unless that branch is main
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+jobs:
+  pytest-cpu:
+    uses: ./.github/workflows/pytest-cpu.yaml
+    strategy:
+      matrix:
+        include:
+        - name: 'cpu-latest'
+          container: mosaicml/pytorch:latest_cpu  # mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04
+          markers: 'not gpu'
+          pytest_command: 'coverage run -m pytest'
+        - name: 'cpu-2.0.1'
+          container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04
+          markers: 'not gpu'
+          pytest_command: 'coverage run -m pytest'
+    name: ${{ matrix.name }}
+    if: github.repository_owner == 'mosaicml'
+    with:
+      container: ${{ matrix.container }}
+      name: ${{ matrix.name }}
+      pytest-command: ${{ matrix.pytest_command }}
+      pytest-markers: ${{ matrix.markers }}
+  coverage:
+    uses: ./.github/workflows/coverage.yaml
+    name: Coverage Results
+    if: github.repository_owner == 'mosaicml'
+    needs: [pytest-cpu]
+    with:
+      download-path: artifacts

.github/workflows/pr-gpu.yaml ADDED Viewed

	@@ -0,0 +1,40 @@

+name: PR GPU tests
+on:
+  push:
+    branches:
+    - main
+    - release/*
+  pull_request_target:
+    branches:
+    - main
+    - release/**
+  workflow_dispatch:
+# Cancel old runs when a new commit is pushed to the same branch, unless that branch is main
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+jobs:
+  pytest-gpu:
+    uses: ./.github/workflows/pytest-gpu.yaml
+    strategy:
+      matrix:
+        include:
+        - name: 'gpu-latest'
+          container: mosaicml/pytorch:latest  # mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04
+          markers: 'gpu'
+          pytest_command: 'coverage run -m pytest'
+        - name: 'gpu-2.0.1'
+          container: mosaicml/pytorch:2.0.1_cu117-python3.10-ubuntu20.04
+          markers: 'gpu'
+          pytest_command: 'coverage run -m pytest'
+    name: ${{ matrix.name }}
+    if: github.repository_owner == 'mosaicml'
+    with:
+      container: ${{ matrix.container }}
+      mcloud-timeout: 1200
+      name: ${{ matrix.name }}
+      pytest-command: ${{ matrix.pytest_command }}
+      pytest-markers: ${{ matrix.markers }}
+      python-version: 3.9
+    secrets:
+      mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }}

.github/workflows/pytest-cpu.yaml ADDED Viewed

	@@ -0,0 +1,48 @@

+name: Pytest CPU
+on:
+  workflow_call:
+    inputs:
+      container:
+        required: true
+        type: string
+      name:
+        required: true
+        type: string
+      pytest-command:
+        required: true
+        type: string
+      pytest-markers:
+        required: true
+        type: string
+jobs:
+  pytest-cpu:
+    timeout-minutes: 30
+    runs-on: ubuntu-latest
+    container: ${{ inputs.container }}
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Setup
+      run: |
+        set -ex
+        export PATH=/composer-python:$PATH
+        python -m pip install --upgrade 'pip<23' wheel
+        python -m pip install --upgrade .[dev]
+    - name: Run Tests
+      id: tests
+      run: |
+        set -ex
+        export PATH=/composer-python:$PATH
+        export COMMON_ARGS="-v --durations=20 -m '${{ inputs.pytest-markers }}'"
+        # Necessary to run git diff for doctests
+        git config --global --add safe.directory /__w/llm-foundry/llm-foundry
+        make test PYTEST='${{ inputs.pytest-command }}' EXTRA_ARGS="$COMMON_ARGS --codeblocks"
+        # make test-dist PYTEST='${{ inputs.pytest-command }}' EXTRA_ARGS="$COMMON_ARGS" WORLD_SIZE=2
+        python -m coverage combine
+    - uses: actions/upload-artifact@v3
+      with:
+        name: coverage-${{ github.sha }}-${{ inputs.name }}
+        path: .coverage

.github/workflows/pytest-gpu.yaml ADDED Viewed

	@@ -0,0 +1,80 @@

+name: Pytest GPU
+on:
+  workflow_call:
+    inputs:
+      container:
+        required: true
+        type: string
+      mcloud-timeout:
+        required: false
+        type: number
+        default: 1800
+      name:
+        required: true
+        type: string
+      pytest-command:
+        required: true
+        type: string
+      pytest-markers:
+        required: true
+        type: string
+      python-version:
+        required: false
+        type: string
+        default: 3.9
+    secrets:
+      mcloud-api-key:
+        required: true
+jobs:
+  pytest-gpu:
+    timeout-minutes: 60 # Hard-coded: using ${{ inputs.gha-timeout }} as an input did not work here
+    runs-on: ubuntu-latest
+    env:
+      MOSAICML_API_KEY: ${{ secrets.mcloud-api-key }}
+    steps:
+    - name: Checkout Repo
+      uses: actions/checkout@v3
+    - name: Setup Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ inputs.python-version }}
+    - name: Cache pip
+      uses: actions/cache@v3
+      with:
+        # This path is specific to Ubuntu
+        path: ~/.cache/pip
+        # Look to see if there is a cache hit for the corresponding requirements file
+        key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }}
+        restore-keys: |
+          ${{ runner.os }}-pip-
+          ${{ runner.os }}-
+    - name: Setup MCLI
+      run: |
+        set -ex
+        python -m pip install mosaicml-cli
+        mcli init --mcloud
+        mcli version
+    - name: Submit Run
+      id: tests
+      run: |
+        set -ex
+        PR_NUMBER="$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH")"
+        REF_ARGS=""
+        # Use the PR number if it exists, commit SHA for protected branches and the branch name otherwise
+        if [ -z "$PR_NUMBER" ] || [ "$PR_NUMBER" = "null" ]; then
+          if [[ "$GITHUB_REF" =~ "refs/heads/main" || "$GITHUB_REF" =~ "refs/heads/release" ]]; then
+            REF_ARGS="--git_commit $GITHUB_SHA"
+          else
+            REF_ARGS="--git_branch $GITHUB_REF_NAME"
+          fi
+        else
+          REF_ARGS="--pr_number $PR_NUMBER"
+        fi
+        python .github/mcp/mcp_pytest.py \
+              --image '${{ inputs.container }}' \
+              --pytest_markers '${{ inputs.pytest-markers }}' \
+              --pytest_command '${{ inputs.pytest-command }}' \
+              --timeout ${{ inputs.mcloud-timeout }} ${REF_ARGS}

.github/workflows/python-publish.yml ADDED Viewed

	@@ -0,0 +1,39 @@

+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+name: Upload Python Package
+on:
+  release:
+    types: [published]
+permissions:
+  contents: read
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build
+    - name: Build package
+      run: python -m build
+    - name: Publish package
+      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
+      with:
+        user: __token__
+        password: ${{ secrets.PYPI_API_TOKEN }}

.github/workflows/release.yaml ADDED Viewed

	@@ -0,0 +1,60 @@

+name: Release
+on:
+  push:
+    tags:
+    - 'v*'
+  workflow_dispatch:
+jobs:
+  code-quality:
+    uses: ./.github/workflows/code-quality.yaml
+  pypi-packaging:
+    name: Build and Publish llm-foundry PyPI Package
+    needs:
+    - code-quality
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout source
+      uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.9'
+    - name: Build source and wheel distributions
+      run: |
+        if [[ "${{ github.ref }}" =~ refs\/tags\/v ]]; then
+          PYPI_PACKAGE_NAME="llm-foundry"
+        else
+          PYPI_PACKAGE_NAME="llm-foundry-test-$(date +%Y%m%d%H%M%S)"
+        fi
+        # Remove the peft, xentropy-cuda-lib and triton-pre-mlir dependencies as PyPI does not
+        # support direct installs. The error message for importing PEFT, FusedCrossEntropy,
+        # and flash_attn_triton gives instructions on how to install if a user tries to use it
+        # without this dependency.
+        sed '/xentropy-cuda-lib@git+https:\/\/github.com\/HazyResearch\/flash-attention.git@.*/d' -i setup.py
+        sed '/triton-pre-mlir@git+https:\/\/github.com\/vchiley\/triton.git@.*/d' -i setup.py
+        sed '/peft@git+https:\/\/github.com\/huggingface\/peft.git.*/d' -i setup.py
+        python -m pip install --upgrade build twine
+        python -m build
+        twine check --strict dist/*
+    - name: Publish 📦 to PyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
+      if: contains(github.ref, 'refs/tags/v')
+      with:
+        user: __token__
+        password: ${{ secrets.PROD_PYPI_API_TOKEN }}
+    - name: Publish distribution 📦 to Test PyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
+      if: contains(github.ref, 'refs/heads/') || contains(github.ref, 'refs/pull/')
+      with:
+        user: __token__
+        password: ${{ secrets.TEST_PYPI_API_TOKEN }}
+        repository_url: https://test.pypi.org/legacy/

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ .DS_Store
2	+ dist

Andromeda/README.md CHANGED Viewed

@@ -1,216 +1,121 @@
-[![Multi-Modality](agorabanner.png)](https://discord.gg/qUtxnK2NMf)
-# Andromeda: Ultra-Fast and Ultra-Intelligent SOTA Language Model 🚀🌌
-![Andromeda Next Generation Open Source Language Model](images/andromeda-banner.png)
-<div align="center">
-[![Open Bounties](https://img.shields.io/endpoint?url=https%3A%2F%2Fconsole.algora.io%2Fapi%2Fshields%2Fkyegomez%2Fbounties%3Fstatus%3Dopen)](https://console.algora.io/org/kyegomez/bounties?status=open)
-[![Rewarded Bounties](https://img.shields.io/endpoint?url=https%3A%2F%2Fconsole.algora.io%2Fapi%2Fshields%2Fkyegomez%2Fbounties%3Fstatus%3Dcompleted)](https://console.algora.io/org/kyegomez/bounties?status=completed)
-[![GitHub issues](https://img.shields.io/github/issues/kyegomez/Andromeda)](https://github.com/kyegomez/Andromeda/issues)
-[![GitHub forks](https://img.shields.io/github/forks/kyegomez/Andromeda)](https://github.com/kyegomez/Andromeda/network)
-[![GitHub stars](https://img.shields.io/github/stars/kyegomez/Andromeda)](https://github.com/kyegomez/Andromeda/stargazers)
-[![GitHub license](https://img.shields.io/github/license/kyegomez/Andromeda)](https://github.com/kyegomez/Andromeda/blob/main/LICENSE)
-[![Share on Twitter](https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Share%20%40kyegomez/Andromeda)](https://twitter.com/intent/tweet?text=Check%20out%20this%20amazing%20AI%20project:%20Andromeda&url=https%3A%2F%2Fgithub.com%2Fkyegomez%2FAndromeda)
-[![Share on Facebook](https://img.shields.io/badge/Share-%20facebook-blue)](https://www.facebook.com/sharer/sharer.php?u=https%3A%2F%2Fgithub.com%2Fkyegomez%2FAndromeda)
-[![Share on LinkedIn](https://img.shields.io/badge/Share-%20linkedin-blue)](https://www.linkedin.com/shareArticle?mini=true&url=https%3A%2F%2Fgithub.com%2Fkyegomez%2FAndromeda&title=&summary=&source=)
-![Discord](https://img.shields.io/discord/999382051935506503)
-[![Share on Reddit](https://img.shields.io/badge/-Share%20on%20Reddit-orange)](https://www.reddit.com/submit?url=https%3A%2F%2Fgithub.com%2Fkyegomez%2FAndromeda&title=Andromeda%20-%20the%20next%20generation%20AI%20shields)
-[![Share on Hacker News](https://img.shields.io/badge/-Share%20on%20Hacker%20News-orange)](https://news.ycombinator.com/submitlink?u=https%3A%2F%2Fgithub.com%2Fkyegomez%2FAndromeda&t=Andromeda%20-%20the%20next%20generation%20AI%20shields)
-[![Share on Pinterest](https://img.shields.io/badge/-Share%20on%20Pinterest-red)](https://pinterest.com/pin/create/button/?url=https%3A%2F%2Fgithub.com%2Fkyegomez%2FAndromeda&media=https%3A%2F%2Fexample.com%2Fimage.jpg&description=Andromeda%20-%20the%20next%20generation%20AI%20shields)
-[![Share on WhatsApp](https://img.shields.io/badge/-Share%20on%20WhatsApp-green)](https://api.whatsapp.com/send?text=Check%20out%20Andromeda%20-%20the%20next%20generation%20AI%20shields%20%23Andromeda%20%23AI%0A%0Ahttps%3A%2F%2Fgithub.com%2Fkyegomez%2FAndromeda)
-</div>
-Welcome to Andromeda, The Fastest, Most Creative, and Reliable Language Model Ever Built, train your own verison, conduct inference, and finetune your own verison with simple plug in and play scripts get started in 10 seconds, and:
-- 💼 Handle Ultra Long Sequences (32,000-200,000+ context lengths)
-- ⚡ Ultra Fast Processing (32,000+ tokens in under 100ms)
-- 🎓 Superior Reasoning Capabilities
----
-## 🔄 Updates
-- [READY FOR TRAINING, help us with the strategy!](https://www.figma.com/file/pfaU8Nhyw0EdXuT6z4Hutw/Andromeda-Strategy?type=whiteboard&node-id=0%3A1&t=Tub1wIzaPAXt2i86-1)
-- [And, here is the WANDB link to watch Andromeda train live!](https://wandb.ai/apacai/Andromeda/overview?)
----
-## Appreciation
-* All the creators in Agora, [Join Agora](https://discord.gg/qUtxnK2NMf) the community of AI engineers changing the world with their creations.
-* LucidRains for inspiring me to devote myself to open source AI
------
-## Hiring
-We're hiring: Engineers, Researchers, Interns, And, Customer Success Professionals to work on democratizing Andromeda, email me at with your story `kye@apac.ai`
-----------
-## 💻 Usage
-There are two methods to use Andromeda
-1. `pip install TheBestLLMEver`
-2. `git clone https://github.com/kyegomez/Andromeda.git`
-For detailed instructions, refer to the [Training SOP](DOCs/TRAINING.md) and [Documentation](https://github.com/kyegomez/Andromeda/blob/master/DOCs/DOCUMENTATION.md).
-### Method 1
-To get started:
-1. Clone the repository and install the required packages:
-```bash
-git clone https://github.com/kyegomez/Andromeda
-cd Andromeda
-pip3 install -r requirements.txt
-cd Andromeda
-python3 train.py
 ```
-For further instructions, refer to the [Training SOP](DOCs/TRAINING.md).
----
-## 📚 Training
-1. Set the environment variables:
-   - `ENTITY_NAME`: Your wandb project name
-   - `OUTPUT_DIR`: Directory to save the weights (e.g., `./weights`)
-   - `MASTER_ADDR`: For distributed training
-   - `MASTER_PORT` For master port distributed training
-   - `RANK`- Number of nodes services
-   - `WORLD_SIZE` Number of gpus
-2. Configure the training:
-   - Accelerate Config
-   - Enable Deepspeed 3
-   - Accelerate launch train_distributed_accelerate.py
-For more information, refer to the [Training SOP](DOCs/TRAINING.md).
----
-## 🗃️ Dataset Building
-To preprocess a
- different dataset similar to the C4 or Falcon dataset used during training, use the `build_dataset.py` script. This script pre-tokenizes the data, chunks it into blocks of a specified sequence length, and uploads it to the Huggingface hub.
-Example command:
-```bash
-python3 Andromeda/build_dataset.py --seed 42 --seq_len 8192 --hf_account "HUGGINGFACE APIKEY" --tokenizer "EleutherAI/gpt-neox-20b" --dataset_name "EleutherAI/the_pile_deduplicated"
-```
----
-## 🚀 Why Andromeda?
-Andromeda offers several advantages:
-- Andromeda offers reliable processing of 100,000+ sequence lengths extremely fast under 300ms
-- Andromeda's dataset strategy was crafted with atomic precision and attention to detail for creativity and quantitative reasoning.
-- Andromeda is extremely intelligent with the ability to think like a poet or make API Calls to your favorite apps.
-For detailed information about the model architecture and methods, refer to the [Model Architecture](DOCs/MODEL_ARCHITECTURE.md) documentation.
----
-# 🎯 Andromeda Principles
-- **Efficiency**: Optimize with techniques like attention flashing, rotary position encodings, and deep normalization.
-- **Flexibility**: Adapt to various tasks and domains for wide applications.
-- **Scalability**: Designed to scale with resources and data sizes.
-- **Community-Driven**: Thrives on contributions from the open-source community.
----
-## 🚀 Get Involved
-We're just at the beginning of our journey. As we continue to develop and refine Andromeda, we invite you to join us. Whether you're a developer, researcher, or simply an enthusiast, your insights and contributions can help shape the future of Andromeda.
----
-# 🤝 Contributing to Andromeda
-We are thrilled to invite you to be a part of the Andromeda project. This is not just an open-source project but a community initiative, and we value your expertise and creativity. To show our appreciation, we have instituted a unique rewards system that directly compensates contributors from the revenue generated by the Andromeda API.
-## 🌟 Why Contribute
-Contributing to Andromeda not only enhances your skills and profile but also comes with financial rewards. When you contribute code, documentation, or any form of improvement to the Andromeda project, you are adding value. As such, we believe it's only fair that you share in the rewards.
-## 💰 Rewards Program
-Here's how the Andromeda Rewards Program works:
-1. **Submit a Pull Request:** This can be a code enhancement, bug fix, documentation update, new feature, or any improvement to the project.
-2. **Review and Approval:** Our team will review your contribution. If it gets approved and merged, you become eligible for the rewards program.
-3. **Revenue Share:** Once your pull request is merged, you will receive a percentage of the revenue generated by the Andromeda API. The percentage will be determined based on the significance and impact of your contribution.
-This means you're not just contributing to an open-source project; you're becoming a part of the Andromeda ecosystem. Your efforts can yield ongoing benefits as the Andromeda API grows and evolves.
-## 🚀 Becoming a Paid API
-As part of our growth strategy, we will be deploying Andromeda as a Paid API. The revenue generated from this API will not only sustain and further the project but also fund the rewards program. If you contribute anything to make Andromeda, you will receive recurring revenue from paid API requests!
-## 🚀 How to Start Contributing
-If you're ready to become a part of Andromeda and contribute to the future of multimodal embeddings, here's what you need to do:
-1. Fork the repository.
-2. Make your improvements or additions in your forked repository.
-3. Submit a pull request detailing the changes you've made.
-4. Our team will review your submission. If it's approved, it will be merged into the main repository, and you will become part of the Andromeda Rewards Program.
-Thank you for considering contributing to Andromeda. Your expertise and commitment to this project are what make it thrive. Let's build the future of multimodal embeddings together.
----
-## 🗺️ Roadmap
-1. **Training phase**: Train Andromeda on a large-scale dataset to achieve SOTA performance in various natural language processing tasks.
-2. **World-class inference infrastructure**: Establish a robust and efficient infrastructure that leverages techniques such as:
-   - Model quantization: Reduce memory and computational requirements without significant loss in performance.
-   - Distillation: Train smaller, faster models that retain the knowledge of the larger model.
-   - Optimized serving frameworks: Deploy Andromeda using efficient serving frameworks, such as NVIDIA Triton or TensorFlow Serving, for rapid inference.
-3. **Continuous improvement**: Continuously fine-tune Andromeda on diverse data sources and adapt it to new tasks and domains.
-4. **Community-driven development**: Encourage open-source contributions, including pre-processing improvements, advanced training techniques, and novel use cases.
----
-## 📈 Benchmarks
-### Speed
-- Andromeda utilizes one of the most reliable Attentions ever, flash attention 2.0 Triton. It consumes 50x less memory than GPT-3 and 10x less than LLAMA.
-![AndromedaBanner](images/andromeda_performance.png)
-- We can speed this up even more with dynamic sparse flash attention 2.0.
----
-# 🔮 Join the Journey
-We're just getting started, and we invite you to join the journey. Let's revolutionize the NLP landscape together! 🚀🌟
-- Join Agora and work with 2,000+ AI Engineers to implement all new features.
-- Provide compute and help train Andromeda.
-- Share the message on how we're liberating this superintelligent AI and seizing the power from the corrupt, providing it back to you.

+# Transformer Model Technical Research Analysis
+This document provides an analysis of the hyperparameters and configurations of the given Transformer model, focusing on dimensions, depth, and heads, as well as an architectural overview of their meanings and use cases.
+## Model Configuration
+```python
+model = Transformer(
+    num_tokens=20000,
+    max_seq_len=8192,
+    use_abs_pos_emb = False,
+    attn_layers = Decoder(
+        dim=512,
+        depth=6,
+        heads=8,
+        alibi_pos_bias=True,
+        alibi_num_heads=4,
+        rotary_xpos=True,
+        attn_flash = True,
+        deepnorm=True,
+        shift_tokens=1,
+        attn_one_kv_head = True,
+    )
+)
 ```
+### Hyperparameters
+1. **num_tokens**: The number of unique tokens in the input vocabulary. In this case, the model is configured to handle 20,000 unique tokens.
+2. **max_seq_len**: The maximum sequence length that the model can handle. The current configuration supports sequences of up to 8,192 tokens.
+3. **use_abs_pos_emb**: A boolean flag indicating whether to use absolute positional embeddings. The model is configured not to use absolute positional embeddings (`False`).
+4. **dim**: The dimensionality of the input embeddings and the internal representations within the Transformer layers. The model uses a dimensionality of 512.
+5. **depth**: The number of Transformer layers (or blocks) in the model. This model has a depth of 6, meaning it has 6 layers.
+6. **heads**: The number of attention heads in the multi-head self-attention mechanism. This model uses 8 attention heads.
+### Additional Configurations
+- **alibi_pos_bias**: A boolean flag indicating whether to use the ALiBi (Attention with Linear Biases) positional bias, which adds a distance-dependent penalty to the attention scores. The model is configured to use ALiBi (`True`).
+- **alibi_num_heads**: The number of attention heads that receive the ALiBi bias; the remaining heads are left unbiased. The model applies ALiBi to 4 of its 8 heads.
+- **rotary_xpos**: A boolean flag indicating whether to use rotary positional embeddings with xPos scaling, a variant of rotary encoding intended to extrapolate better to longer sequences. The model is configured to use it (`True`).
+- **attn_flash**: A boolean flag indicating whether to use the Flash attention mechanism. The model is configured to use Flash attention (`True`).
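+- **deepnorm**: A boolean flag indicating whether to use DeepNorm, the residual-scaling and initialization scheme proposed in the DeepNet paper to stabilize the training of deep Transformers. The model is configured to use DeepNorm (`True`).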
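+- **shift_tokens**: The number of positions by which a portion of each token's feature channels is shifted along the sequence dimension before each layer (token shifting). This is unrelated to the one-position offset between inputs and targets used for next-token training. The model is configured to shift by 1 position (`1`).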
+- **attn_one_kv_head**: A boolean flag indicating whether to use one key-value head for attention instead of multiple heads. The model is configured to use one key-value head (`True`).
+## Architectural Overview
+### Dimensions
+- **Input Embedding Dimension (dim)**: This hyperparameter defines the size of the input embeddings and the internal representations within the Transformer layers. A larger dimensionality can capture more complex relationships between tokens but may require more computational resources.
+### Depth
+- **Number of Transformer Layers (depth)**: This hyperparameter defines the number of Transformer layers (or blocks) in the model. Each layer consists of a multi-head self-attention mechanism followed by a position-wise feed-forward network. Increasing the depth allows the model to capture more complex and hierarchical relationships between tokens but may also increase the risk of overfitting and require more computational resources.
+### Heads
+- **Number of Attention Heads (heads)**: This hyperparameter defines the number of attention heads in the multi-head self-attention mechanism. Each head processes the input sequence independently and captures different aspects of the relationships between tokens. The outputs of all heads are then concatenated and transformed to produce the final output. Increasing the number of attention heads can help the model capture more diverse and fine-grained relationships between tokens but may also increase computational complexity and memory requirements.
+## Benefits and Consequences of Increasing Hyperparameters
+### Dimensions
+**Benefits:**
+- Better representation: Increasing the dimensionality of the input embeddings and internal representations allows the model to capture more complex relationships between tokens.
+- Improved model expressiveness: A higher dimensionality may enable the model to learn more expressive features, leading to better performance on complex tasks.
+**Consequences:**
+- Computational complexity: Increasing the dimensionality will increase the computational complexity of the model, which may lead to longer training and inference times.
+- Memory requirements: A higher dimensionality will increase the memory requirements of the model, potentially limiting its applicability on resource-constrained hardware.
+- Risk of overfitting: Models with a higher dimensionality may be more prone to overfitting, especially if the size of the training dataset is small.
+### Depth
+**Benefits:**
+- Hierarchical representation: Increasing the depth of the model allows it to capture more complex and hierarchical relationships between tokens, which can lead to improved performance on tasks that require understanding long-range dependencies.
+- Enhanced feature extraction: Deeper models can extract features at different levels of abstraction, potentially improving their ability to generalize to new data.
+**Consequences:**
+- Computational complexity: Increasing the depth will increase the computational complexity of the model, leading to longer training and inference times.
+- Memory requirements: A deeper model will require more memory, potentially limiting its applicability on resource-constrained hardware.
+- Risk of overfitting: Deeper models may be more prone to overfitting, especially if the size of the training dataset is small.
+- Vanishing/exploding gradients: Deeper models may suffer from vanishing or exploding gradients during training, making it harder to optimize the model. Techniques such as layer normalization or skip connections can help mitigate this issue.
+### Heads
+**Benefits:**
+- Diverse attention: Increasing the number of attention heads allows the model to capture more diverse and fine-grained relationships between tokens, which can improve its ability to understand the input data.
+- Robustness: Multi-head attention can make the model more robust, as each head can focus on different aspects of the input data.
+**Consequences:**
+- Computational complexity: Increasing the number of attention heads will increase the computational complexity of the model, leading to longer training and inference times.
+- Memory requirements: A model with more attention heads will require more memory, potentially limiting its applicability on resource-constrained hardware.
+- Diminishing returns: There may be diminishing returns when increasing the number of attention heads beyond a certain point, as the model may already be capturing most of the relevant information with fewer heads.

Andromeda/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+# from Andromeda.train import Train
+from Andromeda.model import AndromedaTokenizer, Andromeda
+from Andromeda.train import Train, train

Andromeda/configs.py ADDED Viewed

	@@ -0,0 +1,128 @@

+from Andromeda.model import AndromedaEmbedding, Andromeda
+# Preset Andromeda configurations, one module-level instance per target size.
+# NOTE(review): each preset is built by calling Andromeda(...) at import time, so
+# importing this module constructs all six objects - presumably allocating every
+# model's parameters; confirm against Andromeda.model.
+# NOTE(review): the parameter counts suggested by the names cannot be checked from this file.
+# Smallest preset: it uses a smaller vocabulary (25000) and context length (4192) than
+# the other presets, and it is the only one with shift_tokens left commented out.
+# NOTE(review): max_seq_len=4192 may be a typo for 4096 - confirm.
+Andromeda1Billion = Andromeda(
+    num_tokens=25000,
+    max_seq_len=4192,
+    dim=2048,
+    depth=16,
+    dim_head=128,
+    heads=8,
+    use_abs_pos_emb=False,
+    alibi_pos_bias=True,
+    alibi_num_heads=4,
+    rotary_xpos=True,
+    attn_flash=True,
+    # shift_tokens=1,
+    attn_one_kv_head=True,
+    qk_norm=True,
+    attn_qk_norm=True,
+    attn_qk_norm_dim_scale=True,
+    embedding_provider=AndromedaEmbedding()
+)
+# alibi_num_heads is half of heads here (6 of 12), as in the 1B (4 of 8) and 7B (8 of 16) presets.
+Andromeda3Billion = Andromeda(
+    num_tokens=50432,
+    max_seq_len=8192,
+    dim=3072,
+    depth=24,
+    dim_head=128,
+    heads=12,
+    use_abs_pos_emb=False,
+    alibi_pos_bias=True,
+    alibi_num_heads=6,
+    rotary_xpos=True,
+    attn_flash=True,
+    shift_tokens=1,
+    attn_one_kv_head=True,
+    qk_norm=True,
+    attn_qk_norm=True,
+    attn_qk_norm_dim_scale=True,
+    embedding_provider=AndromedaEmbedding()
+)
+# dim=4096, depth=32, heads=16.
+Andromeda7Billion = Andromeda(
+    num_tokens=50432,
+    max_seq_len=8192,
+    dim=4096,
+    depth=32,
+    dim_head=128,
+    heads=16,
+    use_abs_pos_emb=False,
+    alibi_pos_bias=True,
+    alibi_num_heads=8,
+    rotary_xpos=True,
+    attn_flash=True,
+    shift_tokens=1,
+    attn_one_kv_head=True,
+    qk_norm=True,
+    attn_qk_norm=True,
+    attn_qk_norm_dim_scale=True,
+    embedding_provider=AndromedaEmbedding()
+)
+# NOTE(review): from this preset on, alibi_num_heads stays at 4 while heads keeps
+# growing (20, 24, 28) - confirm this is intended.
+Andromeda10Billion = Andromeda(
+    num_tokens=50432,
+    max_seq_len=8192,
+    dim=5120,
+    depth=32,
+    dim_head=128,
+    heads=20,
+    use_abs_pos_emb=False,
+    alibi_pos_bias=True,
+    alibi_num_heads=4,
+    rotary_xpos=True,
+    attn_flash=True,
+    shift_tokens=1,
+    attn_one_kv_head=True,
+    qk_norm=True,
+    attn_qk_norm=True,
+    attn_qk_norm_dim_scale=True,
+    embedding_provider=AndromedaEmbedding()
+)
+# dim=6144, depth=40, heads=24.
+Andromeda15Billion = Andromeda(
+    num_tokens=50432,
+    max_seq_len=8192,
+    dim=6144,
+    depth=40,
+    dim_head=128,
+    heads=24,
+    use_abs_pos_emb=False,
+    alibi_pos_bias=True,
+    alibi_num_heads=4,
+    rotary_xpos=True,
+    attn_flash=True,
+    shift_tokens=1,
+    attn_one_kv_head=True,
+    qk_norm=True,
+    attn_qk_norm=True,
+    attn_qk_norm_dim_scale=True,
+    embedding_provider=AndromedaEmbedding()
+)
+# Largest preset: dim=7168, depth=48, heads=28.
+Andromeda20Billion = Andromeda(
+    num_tokens=50432,
+    max_seq_len=8192,
+    dim=7168,
+    depth=48,
+    dim_head=128,
+    heads=28,
+    use_abs_pos_emb=False,
+    alibi_pos_bias=True,
+    alibi_num_heads=4,
+    rotary_xpos=True,
+    attn_flash=True,
+    shift_tokens=1,
+    attn_one_kv_head=True,
+    qk_norm=True,
+    attn_qk_norm=True,
+    attn_qk_norm_dim_scale=True,
+    embedding_provider=AndromedaEmbedding()
+)
+# To reach a GPT-3-like scale (~175 billion parameters): dim 12288, depth 96, heads 96, attention dim_head 128.

Andromeda/core/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+import torch
+from packaging import version
+# On PyTorch 2.0 or newer, call einops' allow_ops_in_compiled_graph() once at package import.
+# NOTE(review): presumably this lets einops operations be traced by torch.compile - confirm against the einops docs.
+if version.parse(torch.__version__) >= version.parse('2.0.0'):
+    from einops._torch_specific import allow_ops_in_compiled_graph
+    allow_ops_in_compiled_graph()

Andromeda/core/attend.py ADDED Viewed

	@@ -0,0 +1,252 @@

+from functools import partial
+import torch
+from torch import nn, einsum, Tensor
+import torch.nn.functional as F
+from collections import namedtuple
+from functools import wraps
+from packaging import version
+from dataclasses import dataclass
+from einops import rearrange
+from Andromeda.core.flash import attention
+# from flash import FlashAttention
+# constants
+# Three boolean switches; Attend.flash_attn unpacks an instance as keyword arguments into torch.backends.cuda.sdp_kernel.
+EfficientAttentionConfig = namedtuple('EfficientAttentionConfig', ['enable_flash', 'enable_math', 'enable_mem_efficient'])
+@dataclass
+class Intermediates:
+    """Attention tensors captured by Attend.forward; every field stays None when Attend.flash_attn builds the result."""
+    # scaled query-key scores (plus prev_attn when given), cloned before talking heads, bias and masking
+    qk_similarities: Tensor = None
+    # scores after the optional talking-heads mix and attention bias, cloned before masking and softmax
+    pre_softmax_attn: Tensor = None
+    # attention weights after softmax, cloned before dropout
+    post_softmax_attn: Tensor = None
+# helpers
+def exists(val):
+    return val is not None
+def default(val, d):
+    return val if exists(val) else d
+def once(fn):
+    called = False
+    @wraps(fn)
+    def inner(x):
+        nonlocal called
+        if called:
+            return
+        called = True
+        return fn(x)
+    return inner
+print_once = once(print)
+# main class
+class Attend(nn.Module):
+    def __init__(
+        self,
+        *,
+        dropout = 0.,
+        causal = False,
+        heads = None,
+        talking_heads = False,
+        scale = None,
+        qk_norm = False,
+        flash = False,
+        triton = False,
+    ):
+        super().__init__()
+        self.scale = scale
+        self.qk_norm = qk_norm
+        self.causal = causal
+        self.attn_fn = partial(F.softmax, dtype = torch.float32) if not qk_norm else F.softmax
+        self.dropout = dropout
+        self.attn_dropout = nn.Dropout(dropout)
+        # talking heads
+        assert not (flash and talking_heads), 'talking heads not compatible with flash attention'
+        self.talking_heads = talking_heads
+        if talking_heads:
+            self.pre_softmax_talking_heads = nn.Conv2d(heads, heads, 1, bias = False)
+            self.post_softmax_talking_heads = nn.Conv2d(heads, heads, 1, bias = False)
+        # flash attention
+        self.flash = flash
+        assert not (flash and version.parse(torch.__version__) < version.parse('2.0.0')), 'in order to use flash attention, you must be using pytorch 2.0 or above'
+        # determine efficient attention configs for cuda and cpu
+        self.cpu_config = EfficientAttentionConfig(True, True, True)
+        self.cuda_config = None
+        if not torch.cuda.is_available() or not flash:
+            return
+        device_properties = torch.cuda.get_device_properties(torch.device('cuda'))
+        if device_properties.major == 8 and device_properties.minor == 0:
+            print_once('A100 GPU detected, using flash attention if input tensor is on cuda')
+            self.cuda_config = EfficientAttentionConfig(True, False, False)
+        else:
+            print_once('Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda')
+            self.cuda_config = EfficientAttentionConfig(False, True, True)
+    def flash_attn(
+        self,
+        q, k, v,
+        mask = None,
+        attn_bias = None
+    ):
+        batch, heads, q_len, _, k_len, is_cuda, device = *q.shape, k.shape[-2], q.is_cuda, q.device
+        # Recommended for multi-query single-key-value attention by Tri Dao
+        # kv shape torch.Size([1, 512, 64]) -> torch.Size([1, 8, 512, 64])
+        if k.ndim == 3:
+            k = rearrange(k, 'b ... -> b 1 ...').expand_as(q)
+        if v.ndim == 3:
+            v = rearrange(v, 'b ... -> b 1 ...').expand_as(q)
+        # handle scale - by default they scale by dim_head ** -0.5, but need to take care if using cosine sim attention
+        if self.qk_norm:
+            default_scale = q.shape[-1] ** -0.5
+            q = q * (default_scale / self.scale)
+        # Check if mask exists and expand to compatible shape
+        # The mask is B L, so it would have to be expanded to B H N L
+        causal = self.causal
+        if exists(mask):
+            assert mask.ndim == 4
+            mask = mask.expand(batch, heads, q_len, k_len)
+            # manually handle causal mask, if another mask was given
+            if causal:
+                causal_mask = torch.ones((q_len, k_len), dtype = torch.bool, device = device).triu(k_len - q_len + 1)
+                mask = mask | causal_mask
+                causal = False
+        # handle alibi positional bias
+        # convert from bool to float
+        if exists(attn_bias):
+            attn_bias = rearrange(attn_bias, 'h i j -> 1 h i j').expand(batch, -1, -1, -1)
+            # if mask given, the mask would already contain the causal mask from above logic
+            # otherwise, if no mask given but still causal, mask out alibi positional bias to a large negative number
+            mask_value = -torch.finfo(q.dtype).max
+            if exists(mask):
+                attn_bias = attn_bias.masked_fill(mask, mask_value // 2)
+            elif causal:
+                causal_mask = torch.ones((q_len, k_len), dtype = torch.bool, device = device).triu(k_len - q_len + 1)
+                attn_bias = attn_bias.masked_fill(causal_mask, mask_value // 2)
+                causal = False
+            # scaled_dot_product_attention handles attn_mask either as bool or additive bias
+            # make it an additive bias here
+            mask = attn_bias
+        # Check if there is a compatible device for flash attention
+        config = self.cuda_config if is_cuda else self.cpu_config
+        # pytorch 2.0 flash attn: q, k, v, mask, dropout, causal, softmax_scale
+        with torch.backends.cuda.sdp_kernel(**config._asdict()):
+            out = F.scaled_dot_product_attention(
+                q, k, v,
+                attn_mask = mask,
+                dropout_p = self.dropout if self.training else 0.,
+                is_causal = causal
+            )
+        return out, Intermediates()
+    def forward(
+        self,
+        q, k, v,
+        mask = None,
+        attn_bias = None,
+        prev_attn = None
+    ):
+        """
+        einstein notation
+        b - batch
+        h - heads
+        n, i, j - sequence length (base sequence length, source, target)
+        d - feature dimension
+        """
+        n, device = q.shape[-2], q.device
+        scale = default(self.scale, q.shape[-1] ** -0.5)
+        if self.flash:
+            assert not exists(prev_attn), 'residual attention not compatible with flash attention'
+            return self.flash_attn(q, k, v, mask = mask, attn_bias = attn_bias)
+            # return FlashAttention(q, k, v, mask=mask, attn_bias=attn_bias )
+        if self.triton:
+            return attention(q, k, v, self.casual, scale)
+        kv_einsum_eq = 'b j d' if k.ndim == 3 else 'b h j d'
+        dots = einsum(f'b h i d, {kv_einsum_eq} -> b h i j', q, k) * scale
+        if exists(prev_attn):
+            dots = dots + prev_attn
+        qk_similarities = dots.clone()
+        if self.talking_heads:
+            dots = self.pre_softmax_talking_heads(dots)
+        if exists(attn_bias):
+            dots = dots + attn_bias
+        dtype = dots.dtype
+        pre_softmax_attn = dots.clone()
+        mask_value = -torch.finfo(dots.dtype).max
+        if exists(mask):
+            dots = dots.masked_fill(mask, mask_value)
+        if self.causal:
+            i, j = dots.shape[-2:]
+            causal_mask = torch.ones((i, j), dtype = torch.bool, device = device).triu(j - i + 1)
+            dots = dots.masked_fill(causal_mask, mask_value)
+        attn = self.attn_fn(dots, dim = -1)
+        attn = attn.type(dtype)
+        post_softmax_attn = attn.clone()
+        attn = self.attn_dropout(attn)
+        if self.talking_heads:
+            attn = self.post_softmax_talking_heads(attn)
+        out = einsum(f'b h i j, {kv_einsum_eq} -> b h i d', attn, v)
+        intermediates = Intermediates(
+            qk_similarities = qk_similarities,
+            pre_softmax_attn = pre_softmax_attn,
+            post_softmax_attn = post_softmax_attn
+        )
+        return out, intermediates

Andromeda/core/autoregressive_wrapper.py ADDED Viewed

	@@ -0,0 +1,150 @@

+from math import ceil
+import torch
+from torch import nn
+import torch.nn.functional as F
+from einops import rearrange, pack, unpack
+def exists(val):
+    return val is not None
+def eval_decorator(fn):
+    def inner(self, *args, **kwargs):
+        was_training = self.training
+        self.eval()
+        out = fn(self, *args, **kwargs)
+        self.train(was_training)
+        return out
+    return inner
+# nucleus
+def top_p(logits, thres = 0.9):
+    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+    cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+    sorted_indices_to_remove = cum_probs > (1 - thres)
+    sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
+    sorted_indices_to_remove[:, 0] = 0
+    sorted_logits[sorted_indices_to_remove] = float('-inf')
+    return sorted_logits.scatter(1, sorted_indices, sorted_logits)
+# topk
+def top_k(logits, thres = 0.9):
+    k = ceil((1 - thres) * logits.shape[-1])
+    val, ind = torch.topk(logits, k)
+    probs = torch.full_like(logits, float('-inf'))
+    probs.scatter_(1, ind, val)
+    return probs
+# top_a
+def top_a(logits, min_p_pow=2.0, min_p_ratio=0.02):
+    probs = F.softmax(logits, dim=-1)
+    limit = torch.pow(torch.max(probs), min_p_pow) * min_p_ratio
+    logits[probs < limit] = float('-inf')
+    logits[probs >= limit] = 1
+    return logits
+# autoregressive wrapper class
+class AutoregressiveWrapper(nn.Module):
+    def __init__(
+        self,
+        net,
+        ignore_index = -100,
+        pad_value = 0,
+        mask_prob = 0.
+    ):
+        super().__init__()
+        self.pad_value = pad_value
+        self.ignore_index = ignore_index
+        self.net = net
+        self.max_seq_len = net.max_seq_len
+        # paper shows masking (MLM) in conjunction with autoregressive decoder-only training leads to big improvements https://arxiv.org/abs/2210.13432
+        assert mask_prob < 1.
+        self.mask_prob = mask_prob
+    @torch.no_grad()
+    @eval_decorator
+    def generate(
+        self,
+        start_tokens,
+        seq_len,
+        eos_token = None,
+        temperature = 1.,
+        filter_logits_fn = top_k,
+        filter_thres = 0.9,
+        min_p_pow = 2.0,
+        min_p_ratio = 0.02,
+        **kwargs
+    ):
+        start_tokens, ps = pack([start_tokens], '* n')
+        b, t = start_tokens.shape
+        out = start_tokens
+        for _ in range(seq_len):
+            x = out[:, -self.max_seq_len:]
+            logits = self.net(x, **kwargs)[:, -1]
+            if filter_logits_fn in {top_k, top_p}:
+                filtered_logits = filter_logits_fn(logits, thres = filter_thres)
+                probs = F.softmax(filtered_logits / temperature, dim=-1)
+            elif filter_logits_fn is top_a:
+                filtered_logits = filter_logits_fn(logits, min_p_pow = min_p_pow, min_p_ratio= min_p_ratio)
+                probs = F.softmax(filtered_logits / temperature, dim=-1)
+            sample = torch.multinomial(probs, 1)
+            out = torch.cat((out, sample), dim=-1)
+            if exists(eos_token):
+                is_eos_tokens = (out == eos_token)
+                if is_eos_tokens.any(dim = -1).all():
+                    # mask out everything after the eos tokens
+                    shifted_is_eos_tokens = F.pad(is_eos_tokens, (1, -1))
+                    mask = shifted_is_eos_tokens.float().cumsum(dim = -1) >= 1
+                    out = out.masked_fill(mask, self.pad_value)
+                    break
+        out = out[:, t:]
+        out, = unpack(out, ps, '* n')
+        return out
+    def forward(self, x, return_loss=True, **kwargs):
+        seq, ignore_index = x.shape[1], self.ignore_index
+        inp, target = x[:, :-1], x[:, 1:]
+        if self.mask_prob > 0.:
+            rand = torch.randn(inp.shape, device = x.device)
+            rand[:, 0] = -torch.finfo(rand.dtype).max # first token should not be masked out
+            num_mask = min(int(seq * self.mask_prob), seq - 1)
+            indices = rand.topk(num_mask, dim = -1).indices
+            mask = ~torch.zeros_like(inp).scatter(1, indices, 1.).bool()
+            kwargs.update(self_attn_context_mask = mask)
+        logits = self.net(inp, **kwargs)
+        loss = F.cross_entropy(
+            rearrange(logits, 'b n c -> b c n'),
+            target,
+            ignore_index = ignore_index
+        )
+        if return_loss:
+            return logits, loss
+        return logits

Andromeda/core/flash.py ADDED Viewed

	@@ -0,0 +1,289 @@

+import torch
+import triton
+import triton.language as tl
+# Triton-compiled helper: element-wise maximum of x and y via tl.math.max.
+@triton.jit
+def max_fn(x, y):
+    return tl.math.max(x, y)
+# Forward kernel of the fused attention: computes Out = softmax(Q K^T * sm_scale) V
+# block by block with a running (online) softmax, and stores per-row
+# log-sum-exp statistics in L for use by the backward pass.
+#
+# Launch grid (see _attention.forward): program_id(0) selects the block of
+# BLOCK_M query rows, program_id(1) is used as a combined batch*head index.
+# NOTE(review): the batch/head offset is computed as off_hz * stride_qh for
+# Q, K, V and Out alike, and no boundary_check is passed to the loads, so the
+# tensors are presumably expected to share one contiguous layout with N_CTX
+# divisible by the block sizes - confirm against callers.
+@triton.jit
+def _fwd_kernel(
+    Q, K, V, sm_scale,
+    L,
+    Out,
+    stride_qz, stride_qh, stride_qm, stride_qk,
+    stride_kz, stride_kh, stride_kn, stride_kk,
+    stride_vz, stride_vh, stride_vk, stride_vn,
+    stride_oz, stride_oh, stride_om, stride_on,
+    Z, H, N_CTX,
+    BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    IS_CAUSAL: tl.constexpr,
+):
+    start_m = tl.program_id(0)
+    off_hz = tl.program_id(1)
+    qvk_offset = off_hz * stride_qh
+    Q_block_ptr = tl.make_block_ptr(
+        base=Q + qvk_offset,
+        shape=(N_CTX, BLOCK_DMODEL),
+        strides=(stride_qm, stride_qk),
+        offsets=(start_m * BLOCK_M, 0),
+        block_shape=(BLOCK_M, BLOCK_DMODEL),
+        order=(1, 0)
+    )
+    # K is addressed as (BLOCK_DMODEL, N_CTX), i.e. already transposed for q @ k
+    K_block_ptr = tl.make_block_ptr(
+        base=K + qvk_offset,
+        shape=(BLOCK_DMODEL, N_CTX),
+        strides=(stride_kk, stride_kn),
+        offsets=(0, 0),
+        block_shape=(BLOCK_DMODEL, BLOCK_N),
+        order=(0, 1)
+    )
+    V_block_ptr = tl.make_block_ptr(
+        base=V + qvk_offset,
+        shape=(N_CTX, BLOCK_DMODEL),
+        strides=(stride_vk, stride_vn),
+        offsets=(0, 0),
+        block_shape=(BLOCK_N, BLOCK_DMODEL),
+        order=(1, 0)
+    )
+    # initialize offsets
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_n = tl.arange(0, BLOCK_N)
+    # initialize pointer to m and l
+    # m_i: running row maximum, l_i: running row sum of exponentials, acc: unnormalised output
+    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+    # scale sm_scale by log_2(e) and use
+    # 2^x instead of exp in the loop because CSE and LICM
+    # don't work as expected with `exp` in the loop
+    qk_scale = sm_scale * 1.44269504
+    # load q: it will stay in SRAM throughout
+    q = tl.load(Q_block_ptr)
+    q = (q * qk_scale).to(tl.float16)
+    # loop over k, v and update accumulator
+    lo = 0
+    # in the causal case, key blocks past this query block are never visited
+    hi = (start_m + 1) * BLOCK_M if IS_CAUSAL else N_CTX
+    for start_n in range(lo, hi, BLOCK_N):
+        # -- load k, v --
+        k = tl.load(K_block_ptr)
+        v = tl.load(V_block_ptr)
+        # -- compute qk ---
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        if IS_CAUSAL:
+            # positions where the key index exceeds the query index start at -inf
+            qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf"))
+        qk += tl.dot(q, k)
+        # -- compute scaling constant ---
+        m_i_new = tl.maximum(m_i, tl.max(qk, 1))
+        alpha = tl.math.exp2(m_i - m_i_new)
+        p = tl.math.exp2(qk - m_i_new[:, None])
+        # -- scale and update acc --
+        acc_scale = l_i * 0 + alpha  # workaround some compiler bug
+        acc *= acc_scale[:, None]
+        acc += tl.dot(p.to(tl.float16), v)
+        # -- update m_i and l_i --
+        l_i = l_i * alpha + tl.sum(p, 1)
+        m_i = m_i_new
+        # update pointers
+        K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
+        V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
+    # write back l and m
+    acc = acc / l_i[:, None]
+    l_ptrs = L + off_hz * N_CTX + offs_m
+    # stored value is m_i + log2(l_i): the base-2 log-sum-exp of the scaled scores
+    tl.store(l_ptrs, m_i + tl.math.log2(l_i))
+    # write back O
+    O_block_ptr = tl.make_block_ptr(
+        base=Out + qvk_offset,
+        shape=(N_CTX, BLOCK_DMODEL),
+        strides=(stride_om, stride_on),
+        offsets=(start_m * BLOCK_M, 0),
+        block_shape=(BLOCK_M, BLOCK_DMODEL),
+        order=(1, 0)
+    )
+    # the output is always written as float16
+    tl.store(O_block_ptr, acc.to(tl.float16))
+# Backward pre-pass: for each row computes Delta = sum(Out * DO) over the head
+# dimension. Each program handles BLOCK_M consecutive rows.
+# Rows are addressed as row * D_HEAD + column, i.e. Out and DO are read as
+# flat arrays of D_HEAD-wide rows (callers make `do` contiguous beforehand).
+@triton.jit
+def _bwd_preprocess(
+    Out, DO,
+    Delta,
+    BLOCK_M: tl.constexpr, D_HEAD: tl.constexpr,
+):
+    off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
+    off_n = tl.arange(0, D_HEAD)
+    # load
+    o = tl.load(Out + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32)
+    do = tl.load(DO + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32)
+    # compute
+    delta = tl.sum(o * do, axis=1)
+    # write-back
+    tl.store(Delta + off_m, delta)
+# Backward kernel of the fused attention. One program per combined batch*head
+# index (program_id(0)); it loops over key/value blocks (outer loop) and query
+# blocks (inner loop), recomputing the attention probabilities from the
+# statistics in L, and accumulates DQ, DK and DV.
+#
+# L: per-row statistics written by _fwd_kernel
+# D: per-row Delta values written by _bwd_preprocess
+# DQ is read-modified-written inside the inner loop, so it must start zeroed.
+# NOTE(review): K, V, DO, DQ, DK and DV are all offset with Q's batch/head
+# strides, and the V / DV row pointers use stride_qm / stride_qk, so every
+# tensor appears to be assumed to share Q's memory layout - confirm.
+@triton.jit
+def _bwd_kernel(
+    Q, K, V, sm_scale, Out, DO,
+    DQ, DK, DV,
+    L,
+    D,
+    stride_qz, stride_qh, stride_qm, stride_qk,
+    stride_kz, stride_kh, stride_kn, stride_kk,
+    stride_vz, stride_vh, stride_vk, stride_vn,
+    Z, H, N_CTX,
+    num_block,
+    BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    CAUSAL: tl.constexpr,
+):
+    off_hz = tl.program_id(0)
+    off_z = off_hz // H
+    off_h = off_hz % H
+    qk_scale = sm_scale * 1.44269504
+    # offset pointers for batch/head
+    Q += off_z * stride_qz + off_h * stride_qh
+    K += off_z * stride_qz + off_h * stride_qh
+    V += off_z * stride_qz + off_h * stride_qh
+    DO += off_z * stride_qz + off_h * stride_qh
+    DQ += off_z * stride_qz + off_h * stride_qh
+    DK += off_z * stride_qz + off_h * stride_qh
+    DV += off_z * stride_qz + off_h * stride_qh
+    for start_n in range(0, num_block):
+        # in the causal case, query rows before this key block contribute nothing
+        if CAUSAL:
+            lo = start_n * BLOCK_M
+        else:
+            lo = 0
+        # initialize row/col offsets
+        offs_qm = lo + tl.arange(0, BLOCK_M)
+        offs_n = start_n * BLOCK_M + tl.arange(0, BLOCK_M)
+        offs_m = tl.arange(0, BLOCK_N)
+        offs_k = tl.arange(0, BLOCK_DMODEL)
+        # initialize pointers to value-like data
+        q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
+        k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk)
+        v_ptrs = V + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk)
+        do_ptrs = DO + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
+        dq_ptrs = DQ + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
+        # pointer to row-wise quantities in value-like data
+        D_ptrs = D + off_hz * N_CTX
+        l_ptrs = L + off_hz * N_CTX
+        # initialize dv amd dk
+        dv = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+        dk = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+        # k and v stay in SRAM throughout
+        k = tl.load(k_ptrs)
+        v = tl.load(v_ptrs)
+        # loop over rows
+        for start_m in range(lo, num_block * BLOCK_M, BLOCK_M):
+            offs_m_curr = start_m + offs_m
+            # load q, k, v, do on-chip
+            q = tl.load(q_ptrs)
+            # recompute p = softmax(qk, dim=-1).T
+            if CAUSAL:
+                qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), float(0.), float("-inf"))
+            else:
+                qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+            qk += tl.dot(q, tl.trans(k))
+            qk *= qk_scale
+            l_i = tl.load(l_ptrs + offs_m_curr)
+            # subtracting the stored statistic before exp2 yields the probabilities
+            p = tl.math.exp2(qk - l_i[:, None])
+            # compute dv
+            do = tl.load(do_ptrs)
+            dv += tl.dot(tl.trans(p.to(Q.dtype.element_ty)), do)
+            # compute dp = dot(v, do)
+            Di = tl.load(D_ptrs + offs_m_curr)
+            dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None]
+            dp += tl.dot(do, tl.trans(v))
+            # compute ds = p * (dp - delta[:, None])
+            ds = p * dp * sm_scale
+            # compute dk = dot(ds.T, q)
+            dk += tl.dot(tl.trans(ds.to(Q.dtype.element_ty)), q)
+            # compute dq
+            dq = tl.load(dq_ptrs)
+            dq += tl.dot(ds.to(Q.dtype.element_ty), k)
+            tl.store(dq_ptrs, dq)
+            # increment pointers
+            dq_ptrs += BLOCK_M * stride_qm
+            q_ptrs += BLOCK_M * stride_qm
+            do_ptrs += BLOCK_M * stride_qm
+        # write-back
+        dv_ptrs = DV + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk)
+        dk_ptrs = DK + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk)
+        tl.store(dv_ptrs, dv)
+        tl.store(dk_ptrs, dk)
+# Module-level CUDA tensor allocated at import time.
+# NOTE(review): nothing in this file reads `empty`, and importing the module
+# on a machine without CUDA will fail on this line - confirm it is still needed.
+empty = torch.empty(128, device="cuda")
+class _attention(torch.autograd.Function):
+    # Autograd wrapper that launches the Triton forward and backward kernels
+    # defined in this file.
+    @staticmethod
+    def forward(ctx, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, causal, sm_scale):
+        # q, k, v are indexed as 4-D tensors; the last dim is the head dimension
+        # and dim 2 is passed to the kernel as N_CTX.
+        # causal:   compile-time flag enabling the causal mask in the kernel
+        # sm_scale: scale applied to q @ k^T before the softmax
+        # Returns a tensor shaped like q (the kernel writes float16 values).
+        # shape constraints
+        Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
+        assert Lq == Lk and Lk == Lv
+        assert Lk in {16, 32, 64, 128}
+        o = torch.empty_like(q)
+        BLOCK_M = 128
+        BLOCK_N = 64
+        # grid: (blocks of BLOCK_M rows along dim 2, shape[0] * shape[1], 1)
+        grid = (triton.cdiv(q.shape[2], BLOCK_M), q.shape[0] * q.shape[1], 1)
+        # per-row softmax statistics, consumed by the backward pass
+        L = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)
+        num_warps = 4 if Lk <= 64 else 8
+        _fwd_kernel[grid](
+            q, k, v, sm_scale,
+            L,
+            o,
+            q.stride(0), q.stride(1), q.stride(2), q.stride(3),
+            k.stride(0), k.stride(1), k.stride(2), k.stride(3),
+            v.stride(0), v.stride(1), v.stride(2), v.stride(3),
+            o.stride(0), o.stride(1), o.stride(2), o.stride(3),
+            q.shape[0], q.shape[1], q.shape[2],
+            BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_DMODEL=Lk,
+            IS_CAUSAL=causal,
+            num_warps=num_warps,
+            num_stages=4)
+        ctx.save_for_backward(q, k, v, o, L)
+        ctx.grid = grid
+        ctx.sm_scale = sm_scale
+        ctx.BLOCK_DMODEL = Lk
+        ctx.causal = causal
+        return o
+    @staticmethod
+    def backward(ctx, do):
+        # Returns gradients for (q, k, v, causal, sm_scale); the last two are None.
+        BLOCK = 128
+        q, k, v, o, L = ctx.saved_tensors
+        do = do.contiguous()
+        # dq is accumulated in place by the kernel, so it starts as float32 zeros
+        dq = torch.zeros_like(q, dtype=torch.float32)
+        dk = torch.empty_like(k)
+        dv = torch.empty_like(v)
+        delta = torch.empty_like(L)
+        # one program per (row block, batch*head) pair of the forward grid
+        _bwd_preprocess[(ctx.grid[0] * ctx.grid[1], )](
+            o, do,
+            delta,
+            BLOCK_M=BLOCK, D_HEAD=ctx.BLOCK_DMODEL,
+        )
+        # one program per batch*head; ctx.grid[0] is passed as num_block
+        _bwd_kernel[(ctx.grid[1],)](
+            q, k, v, ctx.sm_scale,
+            o, do,
+            dq, dk, dv,
+            L, delta,
+            q.stride(0), q.stride(1), q.stride(2), q.stride(3),
+            k.stride(0), k.stride(1), k.stride(2), k.stride(3),
+            v.stride(0), v.stride(1), v.stride(2), v.stride(3),
+            q.shape[0], q.shape[1], q.shape[2],
+            ctx.grid[0],
+            BLOCK_M=BLOCK, BLOCK_N=BLOCK,
+            BLOCK_DMODEL=ctx.BLOCK_DMODEL, num_warps=8,
+            CAUSAL=ctx.causal,
+            num_stages=1,
+        )
+        return dq, dk, dv, None, None
+# Functional entry point: attention(q, k, v, causal, sm_scale)
+attention = _attention.apply

Andromeda/core/transformer.py ADDED Viewed

	@@ -0,0 +1,1376 @@

+import math
+from random import random
+import torch
+from torch import nn, einsum, Tensor
+import torch.nn.functional as F
+from functools import partial, wraps
+from inspect import isfunction
+from dataclasses import dataclass
+from typing import List
+from einops import rearrange, repeat
+from Andromeda.core.attend import Attend, Intermediates
+from Andromeda.core.autoregressive_wrapper import AutoregressiveWrapper
+from abc import ABC, abstractmethod
+# import bitsandbytes as bnb
+# constants
+# default per-head feature dimension (used as the default `dim_head` of Attention)
+DEFAULT_DIM_HEAD = 64
+# Container for intermediate values; both fields default to None.
+@dataclass
+class LayerIntermediates:
+    # list of hidden-state tensors (None when not populated)
+    hiddens: List[Tensor] = None
+    # list of attention `Intermediates` objects (None when not populated)
+    attn_intermediates: List[Intermediates] = None
+# helpers
+def exists(val):
+    # True when `val` is not None (falsy values such as 0 or '' still count as existing)
+    return val is not None
+def default(val, d):
+    # Return `val` unless it is None; otherwise return the fallback `d`.
+    # When `d` is a plain Python function it is called lazily to produce the fallback.
+    if exists(val):
+        return val
+    return d() if isfunction(d) else d
+def cast_tuple(val, depth):
+    # Tuples pass through unchanged; any other value is repeated `depth` times in a tuple.
+    return val if isinstance(val, tuple) else (val,) * depth
+def maybe(fn):
+    # Decorator: the wrapped function returns its first argument untouched when
+    # that argument is None, and otherwise calls `fn` normally.
+    @wraps(fn)
+    def inner(x, *args, **kwargs):
+        if not exists(x):
+            return x
+        return fn(x, *args, **kwargs)
+    return inner
+class always():
+    # Callable that ignores its arguments and always returns the stored value.
+    def __init__(self, val):
+        self.val = val
+    def __call__(self, *args, **kwargs):
+        return self.val
+class not_equals():
+    # Callable predicate: returns `x != val` for the stored value; extra arguments are ignored.
+    def __init__(self, val):
+        self.val = val
+    def __call__(self, x, *args, **kwargs):
+        return x != self.val
+class equals():
+    # Callable predicate: returns `x == val` for the stored value; extra arguments are ignored.
+    def __init__(self, val):
+        self.val = val
+    def __call__(self, x, *args, **kwargs):
+        return x == self.val
+# tensor helpers
+def max_neg_value(tensor):
+    # Most negative finite value representable in the tensor's floating-point dtype.
+    return -torch.finfo(tensor.dtype).max
+def l2norm(t, groups = 1):
+    # L2-normalise the last dimension of `t`. With groups > 1 the last dimension
+    # is split into `groups` equal chunks that are normalised independently and
+    # then merged back, so the output shape equals the input shape.
+    t = rearrange(t, '... (g d) -> ... g d', g = groups)
+    t = F.normalize(t, p = 2, dim = -1)
+    return rearrange(t, '... g d -> ... (g d)')
+def pad_at_dim(t, pad, dim = -1, value = 0.):
+    # Pad a single dimension `dim` of `t` with `pad` = (before, after) entries of `value`.
+    # F.pad takes pad pairs starting from the last dimension, so (0, 0) pairs are
+    # emitted for every dimension to the right of `dim` first.
+    dims_from_right = (- dim - 1) if dim < 0 else (t.ndim - dim - 1)
+    zeros = ((0, 0) * dims_from_right)
+    return F.pad(t, (*zeros, *pad), value = value)
+def or_reduce(masks):
+    # Combine all entries of `masks` with the `|` operator, left to right.
+    # Requires at least one element (unpacking fails on an empty sequence).
+    head, *body = masks
+    for rest in body:
+        head = head | rest
+    return head
+# init helpers
+def init_zero_(layer):
+    # In-place: set the layer's weight, and its bias when present, to zero.
+    nn.init.constant_(layer.weight, 0.)
+    if exists(layer.bias):
+        nn.init.constant_(layer.bias, 0.)
+# keyword argument helpers
+def pick_and_pop(keys, d):
+    # Remove every key in `keys` from dict `d` (mutating it) and return the
+    # removed entries as a new dict. Raises KeyError if a key is missing.
+    values = list(map(lambda key: d.pop(key), keys))
+    return dict(zip(keys, values))
+def group_dict_by_key(cond, d):
+    # Split dict `d` into two dicts according to the predicate `cond(key)`.
+    # Returns a tuple (entries whose key matched, entries whose key did not).
+    return_val = [dict(),dict()]
+    for key in d.keys():
+        match = bool(cond(key))
+        # index 0 collects matches, index 1 collects the rest
+        ind = int(not match)
+        return_val[ind][key] = d[key]
+    return (*return_val,)
+def string_begins_with(prefix, str):
+    # True when `str` starts with `prefix`. The prefix comes first so the
+    # function can be partially applied with a fixed prefix.
+    return str.startswith(prefix)
+def group_by_key_prefix(prefix, d):
+    # Split dict `d` into (entries whose key starts with `prefix`, all other entries).
+    return group_dict_by_key(partial(string_begins_with, prefix), d)
+def groupby_prefix_and_trim(prefix, d):
+    # Like group_by_key_prefix, but the prefix is stripped from the keys of the
+    # matching group. Returns (prefixed entries with trimmed keys, remaining entries).
+    kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d)
+    kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix):], x[1]), tuple(kwargs_with_prefix.items())))
+    return kwargs_without_prefix, kwargs
+# initializations
+def deepnorm_init(
+    transformer,
+    beta,
+    module_name_match_list = ['.ff.', '.to_v', '.to_out']
+):
+    # Re-initialise every nn.Linear inside `transformer` in place with Xavier
+    # normal weights and zero bias.
+    # Modules whose qualified name contains any substring from
+    # `module_name_match_list` use gain `beta`; all others use gain 1.
+    # The default list is only read, never mutated.
+    for name, module in transformer.named_modules():
+        # exact type check: subclasses of nn.Linear are skipped
+        if type(module) != nn.Linear:
+            continue
+        needs_beta_gain = any(map(lambda substr: substr in name, module_name_match_list))
+        gain = beta if needs_beta_gain else 1
+        nn.init.xavier_normal_(module.weight.data, gain = gain)
+        if exists(module.bias):
+            nn.init.constant_(module.bias.data, 0)
+# structured dropout, more effective than traditional attention dropouts
+def dropout_seq(seq, mask, dropout):
+    # Randomly drop whole positions along dim 1 of `seq`, keeping
+    # max(1, int((1 - dropout) * n)) positions per row.
+    # seq:     tensor whose first two dims are (b, n)
+    # mask:    optional boolean (b, n) mask, or None; masked-out positions are
+    #          given the lowest score so they are selected last
+    # Returns (shortened seq, updated mask or None).
+    b, n, *_, device = *seq.shape, seq.device
+    # random scores; the top `num_keep` per row are kept
+    logits = torch.randn(b, n, device = device)
+    if exists(mask):
+        mask_value = max_neg_value(logits)
+        logits = logits.masked_fill(~mask, mask_value)
+    keep_prob = 1. - dropout
+    num_keep = max(1,  int(keep_prob * n))
+    keep_indices = logits.topk(num_keep, dim = 1).indices
+    batch_indices = torch.arange(b, device = device)
+    batch_indices = rearrange(batch_indices, 'b -> b 1')
+    seq = seq[batch_indices, keep_indices]
+    if exists(mask):
+        # per row, keep at most ceil(valid_count * keep_prob) of the selected positions
+        seq_counts = mask.sum(dim = -1)
+        seq_keep_counts = torch.ceil(seq_counts * keep_prob).int()
+        keep_mask = torch.arange(num_keep, device = device) < rearrange(seq_keep_counts, 'b -> b 1')
+        mask = mask[batch_indices, keep_indices] & keep_mask
+    return seq, mask
+# activations
+class ReluSquared(nn.Module):
+    # Activation: relu(x) squared, element-wise.
+    def forward(self, x):
+        return F.relu(x) ** 2
+#tokenization
+class BaseTokenizer(ABC):
+    # Abstract interface for tokenizers: subclasses implement `tokenize`.
+    @abstractmethod
+    def tokenize(self, text: str) -> List[int]:
+        pass
+class CustomTokenizer(BaseTokenizer):
+    # Placeholder tokenizer: no algorithm is implemented yet.
+    def tokenize(self, text: str) -> List[int]:
+        # Your custom tokenization algorithm
+        # NOTE: currently returns the Ellipsis object, not a list of ints
+        tokens = ...
+        return tokens
+# embedding
+class BaseEmbedding(ABC):
+    # Abstract factory for token-embedding modules: subclasses implement
+    # `get_embedding(num_tokens, dim)` and return an nn.Module.
+    @abstractmethod
+    def get_embedding(self, num_tokens: int, dim: int) -> nn.Module:
+        # Custom embedding function or model
+        # placeholder body; concrete subclasses override this method
+        embedding = ...
+        return embedding
+class AndromedaEmbedding(BaseEmbedding):
+    # Embedding provider backed by a plain nn.Embedding(num_tokens, dim).
+    def get_embedding(self, num_tokens: int, dim: int) -> nn.Module:
+        embedding = nn.Embedding(num_tokens, dim)
+        return embedding
+# class AndromedaBnBEmbedding(BaseEmbedding):
+#     def get_embedding(self, num_tokens: int, dim: int, padding_idx: int = 0) -> bnb.nn.modules:
+#         embedding = bnb.nn.modules.Embedding(num_tokens, dim, padding_idx)
+#         return embedding
+class TokenEmbedding(nn.Module):
+    # Token embedding layer whose underlying module is created by an
+    # `embedding_provider` (a BaseEmbedding).
+    # l2norm_embed: when True, looked-up embeddings are L2-normalised.
+    def __init__(self, dim, num_tokens, embedding_provider: BaseEmbedding, l2norm_embed = False):
+        super().__init__()
+        self.l2norm_embed = l2norm_embed
+        # note the argument order: the provider takes (num_tokens, dim)
+        self.emb = embedding_provider.get_embedding(num_tokens, dim)
+        # nn.Embedding(num_tokens, dim)
+    def forward(self, x):
+        token_emb = self.emb(x)
+        return l2norm(token_emb) if self.l2norm_embed else token_emb
+# positional embeddings
+class AbsolutePositionalEmbedding(nn.Module):
+    # Learned absolute positional embedding table with `max_seq_len` rows.
+    # Output is scaled by dim ** -0.5, or L2-normalised instead when
+    # l2norm_embed is True (the scale is then 1).
+    def __init__(self, dim, max_seq_len, l2norm_embed = False):
+        super().__init__()
+        self.scale = dim ** -0.5 if not l2norm_embed else 1.
+        self.max_seq_len = max_seq_len
+        self.l2norm_embed = l2norm_embed
+        self.emb = nn.Embedding(max_seq_len, dim)
+    def forward(self, x, pos = None):
+        # x is only used for its length along dim 1 and its device;
+        # `pos` optionally overrides the default positions 0..seq_len-1.
+        seq_len, device = x.shape[1], x.device
+        assert seq_len <= self.max_seq_len, f'you are passing in a sequence length of {seq_len} but your absolute positional embedding has a max sequence length of {self.max_seq_len}'
+        if not exists(pos):
+            pos = torch.arange(seq_len, device = device)
+        pos_emb = self.emb(pos)
+        pos_emb = pos_emb * self.scale
+        return l2norm(pos_emb) if self.l2norm_embed else pos_emb
+class ScaledSinusoidalEmbedding(nn.Module):
+    # Fixed sinusoidal positional embedding multiplied by a single learned
+    # scalar `scale` (initialised to dim ** -0.5). `dim` must be even.
+    def __init__(self, dim, theta = 10000):
+        super().__init__()
+        assert (dim % 2) == 0
+        self.scale = nn.Parameter(torch.ones(1) * dim ** -0.5)
+        half_dim = dim // 2
+        freq_seq = torch.arange(half_dim).float() / half_dim
+        inv_freq = theta ** -freq_seq
+        # non-persistent: not saved in the state dict
+        self.register_buffer('inv_freq', inv_freq, persistent = False)
+    def forward(self, x, pos = None):
+        # x is only used for its length along dim 1 and its device
+        seq_len, device = x.shape[1], x.device
+        if not exists(pos):
+            pos = torch.arange(seq_len, device = device)
+        # outer product of positions and inverse frequencies
+        emb = einsum('i, j -> i j', pos, self.inv_freq)
+        # first half sine, second half cosine
+        emb = torch.cat((emb.sin(), emb.cos()), dim = -1)
+        return emb * self.scale
+class RelativePositionBias(nn.Module):
+    # Learned relative-position attention bias: relative offsets are mapped to
+    # one of `num_buckets` buckets and each bucket has one learned value per head.
+    # forward(i, j) returns a (heads, i, j) bias multiplied by `scale`.
+    def __init__(self, scale, causal = False, num_buckets = 32, max_distance = 128, heads = 8):
+        super().__init__()
+        self.scale = scale
+        self.causal = causal
+        self.num_buckets = num_buckets
+        self.max_distance = max_distance
+        self.relative_attention_bias = nn.Embedding(num_buckets, heads)
+    @staticmethod
+    def _relative_position_bucket(relative_position, causal = True, num_buckets = 32, max_distance = 128):
+        # Map integer relative positions to bucket indices: offsets below
+        # num_buckets // 2 get their own bucket, larger offsets share
+        # logarithmically spaced buckets capped at num_buckets - 1.
+        ret = 0
+        n = -relative_position
+        if not causal:
+            # half of the buckets are reserved for each direction
+            num_buckets //= 2
+            ret += (n < 0).long() * num_buckets
+            n = torch.abs(n)
+        else:
+            # causal: negative n (keys after the query) is clamped to 0
+            n = torch.max(n, torch.zeros_like(n))
+        max_exact = num_buckets // 2
+        is_small = n < max_exact
+        val_if_large = max_exact + (
+            torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
+        ).long()
+        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
+        ret += torch.where(is_small, n, val_if_large)
+        return ret
+    @property
+    def device(self):
+        return next(self.parameters()).device
+    def forward(self, i, j):
+        # i: number of query positions, j: number of key positions;
+        # queries are taken to be the last i of the j positions
+        device = self.device
+        q_pos = torch.arange(j - i, j, dtype = torch.long, device = device)
+        k_pos = torch.arange(j, dtype = torch.long, device = device)
+        rel_pos = k_pos[None, :] - q_pos[:, None]
+        rp_bucket = self._relative_position_bucket(rel_pos, causal = self.causal, num_buckets = self.num_buckets, max_distance = self.max_distance)
+        values = self.relative_attention_bias(rp_bucket)
+        bias = rearrange(values, 'i j h -> h i j')
+        return bias * self.scale
+class DynamicPositionBias(nn.Module):
+    # Attention bias produced by a small MLP that maps a scalar relative
+    # distance to one value per head.
+    # dim:          hidden width of the MLP
+    # heads:        number of output values per distance
+    # depth:        number of hidden (Linear -> optional LayerNorm -> SiLU) layers, >= 1
+    # log_distance: feed sign(d) * log(|d| + 1) instead of the raw distance d
+    # norm:         insert LayerNorm after each hidden Linear
+    def __init__(self, dim, *, heads, depth, log_distance = False, norm = False):
+        super().__init__()
+        assert depth >= 1, 'depth for dynamic position bias MLP must be greater or equal to 1'
+        self.log_distance = log_distance
+        self.mlp = nn.ModuleList([])
+        self.mlp.append(nn.Sequential(
+            nn.Linear(1, dim),
+            nn.LayerNorm(dim) if norm else nn.Identity(),
+            nn.SiLU()
+        ))
+        for _ in range(depth - 1):
+            self.mlp.append(nn.Sequential(
+                nn.Linear(dim, dim),
+                nn.LayerNorm(dim) if norm else nn.Identity(),
+                nn.SiLU()
+            ))
+        self.mlp.append(nn.Linear(dim, heads))
+    @property
+    def device(self):
+        return next(self.parameters()).device
+    def forward(self, i, j):
+        # Only square attention maps are supported (i must equal j).
+        # Returns a (heads, n, n) bias.
+        assert i == j
+        n, device = j, self.device
+        # get the (n x n) matrix of distances
+        seq_arange = torch.arange(n, device = device)
+        context_arange = torch.arange(n, device = device)
+        indices = rearrange(seq_arange, 'i -> i 1') - rearrange(context_arange, 'j -> 1 j')
+        # shift offsets from [-(n-1), n-1] to [0, 2n-2] so they can index `pos`
+        indices += (n - 1)
+        # input to continuous positions MLP
+        pos = torch.arange(-n + 1, n, device = device).float()
+        pos = rearrange(pos, '... -> ... 1')
+        if self.log_distance:
+            pos = torch.sign(pos) * torch.log(pos.abs() + 1)  # log of distance is sign(rel_pos) * log(abs(rel_pos) + 1)
+        for layer in self.mlp:
+            pos = layer(pos)
+        # get position biases
+        bias = pos[indices]
+        bias = rearrange(bias, 'i j h -> h i j')
+        return bias
+class AlibiPositionalBias(nn.Module):
+    def __init__(self, heads, total_heads, **kwargs):
+        super().__init__()
+        self.heads = heads
+        self.total_heads = total_heads
+        slopes = Tensor(self._get_slopes(heads))
+        slopes = rearrange(slopes, 'h -> h 1 1')
+        self.register_buffer('slopes', slopes, persistent = False)
+        self.register_buffer('bias', None, persistent = False)
+    def get_bias(self, i, j, device):
+        i_arange = torch.arange(j - i, j, device = device)
+        j_arange = torch.arange(j, device = device)
+        bias = -torch.abs(rearrange(j_arange, 'j -> 1 1 j') - rearrange(i_arange, 'i -> 1 i 1'))
+        return bias
+    @staticmethod
+    def _get_slopes(heads):
+        def get_slopes_power_of_2(n):
+            start = (2**(-2**-(math.log2(n)-3)))
+            ratio = start
+            return [start*ratio**i for i in range(n)]
+        if math.log2(heads).is_integer():
+            return get_slopes_power_of_2(heads)
+        closest_power_of_2 = 2 ** math.floor(math.log2(heads))
+        return get_slopes_power_of_2(closest_power_of_2) + get_slopes_power_of_2(2 * closest_power_of_2)[0::2][:heads-closest_power_of_2]
+    @property
+    def device(self):
+        return next(self.buffers()).device
+    def forward(self, i, j):
+        h, device = self.total_heads, self.device
+        if exists(self.bias) and self.bias.shape[-1] >= j:
+            return self.bias[..., :i, :j]
+        bias = self.get_bias(i, j, device)
+        bias = bias * self.slopes
+        num_heads_unalibied = h - bias.shape[0]
+        bias = pad_at_dim(bias, (0, num_heads_unalibied), dim = 0)
+        self.register_buffer('bias', bias, persistent = False)
+        return self.bias
+class LearnedAlibiPositionalBias(AlibiPositionalBias):
+    def __init__(self, heads, total_heads):
+        super().__init__(heads, total_heads)
+        log_slopes = torch.log(self.slopes)
+        self.learned_logslopes = nn.Parameter(log_slopes)
+    def forward(self, i, j):
+        h, i, j, device = self.heads, self.device
+        def get_slopes(param):
+            return pad_at_dim(param.exp(), (0, h - param.shape[0]), dim = -2)
+        if exists(self.bias) and self.bias.shape[-1] >= j:
+            bias = self.bias[..., :i, :j]
+        else:
+            bias = self.get_bias(i, j, device)
+            self.register_buffer('bias', bias, persistent = False)
+        slopes = get_slopes(self.learned_logslopes)
+        bias = bias * slopes
+        return bias
+class RotaryEmbedding(nn.Module):
+    # Produces rotary position frequencies and, optionally, xpos-style scales.
+    # dim:                  rotary feature dimension
+    # use_xpos:             when True, forward also returns a per-position scale
+    # scale_base:           divisor applied to centred positions for the xpos scale
+    # interpolation_factor: accepted but not used anywhere in this class
+    # base:                 frequency base
+    # base_rescale_factor:  rescales `base` by factor ** (dim / (dim - 2))
+    def __init__(
+        self,
+        dim,
+        use_xpos = False,
+        scale_base = 512,
+        interpolation_factor=1.,
+        base=10000,
+        base_rescale_factor=1.
+    ):
+        super().__init__()
+        base *=  base_rescale_factor ** (dim / (dim - 2))
+        inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer('inv_freq', inv_freq)
+        if not use_xpos:
+            # without xpos there is no scale buffer and forward returns scale 1.
+            self.register_buffer('scale', None)
+            return
+        scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)
+        self.scale_base = scale_base
+        self.register_buffer('scale', scale)
+    def forward(self, seq_len, device):
+        # Returns (freqs, scale): freqs has shape (seq_len, dim); scale is 1.
+        # without xpos, otherwise a (seq_len, dim) tensor.
+        t = torch.arange(seq_len, device = device).type_as(self.inv_freq)
+        freqs = torch.einsum('i , j -> i j', t, self.inv_freq)
+        # duplicate so the frequencies cover both halves of the feature dim
+        freqs = torch.cat((freqs, freqs), dim = -1)
+        if not exists(self.scale):
+            return freqs, 1.
+        # positions are centred on the middle of the sequence before scaling
+        power = (torch.arange(seq_len, device = device) - (seq_len // 2)) / self.scale_base
+        scale = self.scale ** rearrange(power, 'n -> n 1')
+        scale = torch.cat((scale, scale), dim = -1)
+        return freqs, scale
+def rotate_half(x):
+    # Split the last dimension into two halves (x1, x2) and return (-x2, x1)
+    # concatenated along the last dimension.
+    x = rearrange(x, '... (j d) -> ... j d', j = 2)
+    x1, x2 = x.unbind(dim = -2)
+    return torch.cat((-x2, x1), dim = -1)
+def apply_rotary_pos_emb(t, freqs, scale = 1):
+    # Apply rotary position embedding to `t` using the angles in `freqs`.
+    # Only the last t.shape[-2] rows of `freqs` are used, so `t` is aligned to
+    # the end of the position range.
+    seq_len = t.shape[-2]
+    freqs = freqs[-seq_len:, :]
+    return (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
+# norms
+class Scale(nn.Module):
+    # Wraps a callable `fn` and multiplies its output by the constant `value`.
+    # When `fn` returns a tuple only the first element is scaled; the rest are
+    # passed through unchanged.
+    def __init__(self, value, fn):
+        super().__init__()
+        self.value = value
+        self.fn = fn
+    def forward(self, x, **kwargs):
+        out = self.fn(x, **kwargs)
+        def scale_fn(t):
+            return t * self.value
+        if not isinstance(out, tuple):
+            return scale_fn(out)
+        return (scale_fn(out[0]), *out[1:])
+class ScaleNorm(nn.Module):
+    # Divides x by its L2 norm over the last dimension (clamped below by `eps`)
+    # and multiplies by a single learned scalar `g`, initialised to dim ** -0.5.
+    def __init__(self, dim, eps = 1e-5):
+        super().__init__()
+        self.eps = eps
+        self.g = nn.Parameter(torch.ones(1) * (dim ** -0.5))
+    def forward(self, x):
+        norm = torch.norm(x, dim = -1, keepdim = True)
+        return x / norm.clamp(min = self.eps) * self.g
+class RMSNorm(nn.Module):
+    # Root-mean-square normalisation over the last dimension with a learned
+    # per-feature gain `g` (initialised to ones).
+    def __init__(self, dim, eps = 1e-8):
+        super().__init__()
+        self.scale = dim ** -0.5
+        self.eps = eps
+        self.g = nn.Parameter(torch.ones(dim))
+    def forward(self, x):
+        # L2 norm * dim ** -0.5 is the root mean square of the features
+        norm = torch.norm(x, dim = -1, keepdim = True) * self.scale
+        return x / norm.clamp(min = self.eps) * self.g
+# residual and residual gates
+class Residual(nn.Module):
+    # Residual connection: returns x + residual, with the residual optionally
+    # scaled first.
+    # scale_residual:          adds a learned per-feature scale (initialised to ones)
+    # scale_residual_constant: fixed multiplier applied when it differs from 1
+    def __init__(self, dim, scale_residual = False, scale_residual_constant = 1.):
+        super().__init__()
+        self.residual_scale = nn.Parameter(torch.ones(dim)) if scale_residual else None
+        self.scale_residual_constant = scale_residual_constant
+    def forward(self, x, residual):
+        if exists(self.residual_scale):
+            residual = residual * self.residual_scale
+        if self.scale_residual_constant != 1:
+            residual = residual * self.scale_residual_constant
+        return x + residual
+class GRUGating(nn.Module):
+    # Gated alternative to a residual connection: a GRU cell takes the branch
+    # output `x` as input and the (optionally scaled) residual as hidden state.
+    # Extra keyword arguments are accepted and ignored.
+    def __init__(self, dim, scale_residual = False, **kwargs):
+        super().__init__()
+        self.gru = nn.GRUCell(dim, dim)
+        self.residual_scale = nn.Parameter(torch.ones(dim)) if scale_residual else None
+    def forward(self, x, residual):
+        if exists(self.residual_scale):
+            residual = residual * self.residual_scale
+        # GRUCell works on 2-D input, so batch and sequence dims are flattened
+        gated_output = self.gru(
+            rearrange(x, 'b n d -> (b n) d'),
+            rearrange(residual, 'b n d -> (b n) d')
+        )
+        return gated_output.reshape_as(x)
+# token shifting
+def shift(t, amount, mask = None):
+    # Shift `t` by `amount` positions along its second-to-last dimension,
+    # filling vacated positions with zeros. amount == 0 returns `t` unchanged.
+    # When `mask` is given, masked-out positions are zeroed before shifting.
+    if amount == 0:
+        return t
+    else:
+        # never shift by more than the size of dim 1
+        amount = min(amount, t.shape[1])
+    if exists(mask):
+        t = t.masked_fill(~mask[..., None], 0.)
+    # pad `amount` in front and crop `amount` from the end (negative pad)
+    return pad_at_dim(t, (amount, -amount), dim = - 2, value = 0.)
+class ShiftTokens(nn.Module):
+    # Wraps `fn`: before calling it, the feature dimension of x is split into
+    # len(shifts) equally sized segments and segment k is shifted along the
+    # sequence by shifts[k]. Leftover features (when the feature size is not
+    # divisible by len(shifts)) are passed through unshifted.
+    def __init__(self, shifts, fn):
+        super().__init__()
+        self.fn = fn
+        self.shifts = tuple(shifts)
+    def forward(self, x, **kwargs):
+        # the optional 'mask' kwarg is used for shifting and still forwarded to fn
+        mask = kwargs.get('mask', None)
+        shifts = self.shifts
+        segments = len(shifts)
+        feats_per_shift = x.shape[-1] // segments
+        splitted = x.split(feats_per_shift, dim = -1)
+        segments_to_shift, rest = splitted[:segments], splitted[segments:]
+        segments_to_shift = list(map(lambda args: shift(*args, mask = mask), zip(segments_to_shift, shifts)))
+        x = torch.cat((*segments_to_shift, *rest), dim = -1)
+        return self.fn(x, **kwargs)
+# feedforward
+class GLU(nn.Module):
+    # Gated linear unit: one linear layer produces 2 * dim_out features, which
+    # are split into a value half and a gate half; output is value * activation(gate).
+    def __init__(self, dim_in, dim_out, activation):
+        super().__init__()
+        self.act = activation
+        self.proj = nn.Linear(dim_in, dim_out * 2)
+    def forward(self, x):
+        x, gate = self.proj(x).chunk(2, dim = -1)
+        return x * self.act(gate)
+class FeedForward(nn.Module):
+    # Position-wise feed-forward block:
+    #   project in (Linear + activation, or GLU) -> optional LayerNorm -> Dropout -> Linear out
+    # dim:              input feature size
+    # dim_out:          output feature size (defaults to `dim`)
+    # mult:             hidden size multiplier; inner_dim = int(dim * mult)
+    # glu:              use the gated GLU input projection
+    # swish / relu_squared: choose SiLU or ReluSquared instead of the default GELU
+    #                   (relu_squared takes precedence when both are set)
+    # post_act_ln:      LayerNorm after the activation
+    # dropout:          dropout probability before the output projection
+    # no_bias:          disable bias on the Linear layers built here
+    # zero_init_output: zero-initialise the output projection
+    def __init__(
+        self,
+        dim,
+        dim_out = None,
+        mult = 4,
+        glu = False,
+        swish = False,
+        relu_squared = False,
+        post_act_ln = False,
+        dropout = 0.,
+        no_bias = False,
+        zero_init_output = False
+    ):
+        super().__init__()
+        inner_dim = int(dim * mult)
+        dim_out = default(dim_out, dim)
+        if relu_squared:
+            activation = ReluSquared()
+        elif swish:
+            activation = nn.SiLU()
+        else:
+            activation = nn.GELU()
+        # NOTE: `no_bias` is not passed to GLU, so the GLU projection keeps its bias
+        project_in = nn.Sequential(
+            nn.Linear(dim, inner_dim, bias = not no_bias),
+            activation
+        ) if not glu else GLU(dim, inner_dim, activation)
+        self.ff = nn.Sequential(
+            project_in,
+            nn.LayerNorm(inner_dim) if post_act_ln else nn.Identity(),
+            nn.Dropout(dropout),
+            nn.Linear(inner_dim, dim_out, bias = not no_bias)
+        )
+        # init last linear layer to 0
+        if zero_init_output:
+            init_zero_(self.ff[-1])
+    def forward(self, x):
+        return self.ff(x)
+# attention. it is all we need
+class Attention(nn.Module):
+    """Multi-head attention layer (self- or cross-attention) built around the `Attend` core.
+    Projects the input to queries / keys / values, optionally applies qk normalisation,
+    rotary embeddings, learned memory key / values and several masking schemes, hands the
+    attention computation to `Attend`, and projects the merged heads back to `dim`.
+    Selected options:
+      dim_head / value_dim_head: per-head width of q,k / v (value falls back to dim_head).
+      one_kv_head: a single key / value head shared by all query heads.
+      shared_kv: reuse the key projection as the values (requires equal head widths).
+      tensor_product: add the relation projection `to_r` (arXiv 2208.06061).
+      gate_values: sigmoid gating of the merged output, computed from the layer input.
+      qk_norm*: l2-normalise q and k (optionally in groups, optionally with learned scales).
+      sparse_topk: keep only the top-k similarities per query.
+      num_mem_kv: number of learned memory key / value slots prepended to k and v.
+      max_attend_past: block keys further than this many positions behind the query.
+      on_attn: use a GLU output projection ("attention on attention").
+      zero_init_output: pass the output projection to `init_zero_`.
+    """
+    def __init__(
+        self,
+        dim,
+        dim_head = DEFAULT_DIM_HEAD,
+        heads = 8,
+        causal = False,
+        flash = False,
+        talking_heads = False,
+        head_scale = False,
+        sparse_topk = None,
+        num_mem_kv = 0,
+        dropout = 0.,
+        on_attn = False,
+        gate_values = False,
+        zero_init_output = False,
+        max_attend_past = None,
+        qk_norm = False,
+        qk_norm_groups = 1,
+        qk_norm_scale = 10,
+        qk_norm_dim_scale = False,
+        one_kv_head = False,
+        shared_kv = False,
+        value_dim_head = None,
+        tensor_product = False   # https://arxiv.org/abs/2208.06061
+    ):
+        super().__init__()
+        self.scale = dim_head ** -0.5
+        self.heads = heads
+        self.causal = causal
+        self.max_attend_past = max_attend_past
+        value_dim_head = default(value_dim_head, dim_head)
+        q_dim = k_dim = dim_head * heads
+        v_dim = out_dim = value_dim_head * heads
+        self.one_kv_head = one_kv_head
+        if one_kv_head:
+            k_dim = dim_head
+            v_dim = value_dim_head
+            out_dim = v_dim * heads
+        self.to_q = nn.Linear(dim, q_dim, bias = False)
+        self.to_k = nn.Linear(dim, k_dim, bias = False)
+        # shared key / values, for further memory savings during inference
+        assert not (shared_kv and value_dim_head != dim_head), 'key and value head dimensions must be equal for shared key / values'
+        self.to_v = nn.Linear(dim, v_dim, bias = False) if not shared_kv else None
+        # relations projection from tp-attention
+        self.to_r = nn.Linear(dim, v_dim, bias = False) if tensor_product else None
+        # add GLU gating for aggregated values, from alphafold2
+        self.to_v_gate = None
+        if gate_values:
+            self.to_v_gate = nn.Linear(dim, out_dim)
+            # zero weight and unit bias: every gate starts at sigmoid(1)
+            nn.init.constant_(self.to_v_gate.weight, 0)
+            nn.init.constant_(self.to_v_gate.bias, 1)
+        # cosine sim attention
+        self.qk_norm = qk_norm
+        self.qk_norm_groups = qk_norm_groups
+        self.qk_norm_scale = qk_norm_scale
+        # whether to use the rmsnorm (equivalent to cosine sim attention when scale is equal to 1) - https://arxiv.org/abs/2302.05442
+        self.qk_norm_dim_scale = qk_norm_dim_scale
+        self.qk_norm_q_scale = self.qk_norm_k_scale = 1
+        if qk_norm and qk_norm_dim_scale:
+            self.qk_norm_q_scale = nn.Parameter(torch.ones(dim_head))
+            self.qk_norm_k_scale = nn.Parameter(torch.ones(dim_head))
+        assert (not qk_norm) or (dim_head % qk_norm_groups) == 0, 'dimension per attention head must be divisible by the qk norm groups'
+        assert not (qk_norm and (dim_head // qk_norm_groups) <= 2), 'the group dimension may be too small (2 was too small in my tests, but 4 still works, surprisingly)'
+        # attend class - includes core attention algorithm + talking heads
+        self.attend = Attend(
+            heads = heads,
+            causal = causal,
+            talking_heads = talking_heads,
+            dropout = dropout,
+            qk_norm = qk_norm,
+            scale = qk_norm_scale if qk_norm else self.scale,
+            flash = flash
+        )
+        # head scaling
+        self.head_scale = head_scale
+        if head_scale:
+            self.head_scale_params = nn.Parameter(torch.ones(1, heads, 1, 1))
+        # explicit topk sparse attention
+        self.sparse_topk = sparse_topk
+        # add memory key / values
+        self.num_mem_kv = num_mem_kv
+        if num_mem_kv > 0:
+            self.mem_k = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head))
+            # memory values are concatenated with v, so they must use the value head width
+            # (previously dim_head, which only worked when both widths were equal)
+            self.mem_v = nn.Parameter(torch.randn(heads, num_mem_kv, value_dim_head))
+        # attention on attention
+        self.attn_on_attn = on_attn
+        self.to_out = nn.Sequential(nn.Linear(out_dim, dim * 2, bias = False), nn.GLU()) if on_attn else nn.Linear(out_dim, dim, bias = False)
+        # init output projection 0
+        if zero_init_output:
+            # with on_attn the projection is wrapped in a Sequential; zero its Linear
+            # NOTE(review): assumes init_zero_ expects a layer exposing .weight - confirm
+            init_zero_(self.to_out[0] if on_attn else self.to_out)
+    def forward(
+        self,
+        x,
+        context = None,
+        mask = None,
+        context_mask = None,
+        attn_mask = None,
+        rel_pos = None,
+        rotary_pos_emb = None,
+        prev_attn = None,
+        mem = None
+    ):
+        """Run attention over `x` (keys / values come from `context` when it is given).
+        mask: boolean mask for `x`; also zeroes the matching output positions.
+        context_mask: boolean mask for the key / value input (falls back to `mask`).
+        attn_mask: 2-, 3- or 4-dimensional boolean mask over (query, key) pairs.
+        rel_pos: callable `(i, j) -> bias` producing the attention bias.
+        rotary_pos_emb: `(freqs, xpos_scale)` pair, ignored when `context` is given.
+        prev_attn: forwarded to `Attend` unchanged.
+        mem: extra tokens prepended to the key / value input along dim -2.
+        Returns `(out, intermediates)` where `intermediates` comes from `Attend`.
+        """
+        b, n, _, h, head_scale, device, has_context = *x.shape, self.heads, self.head_scale, x.device, exists(context)
+        kv_input = default(context, x)
+        q_input = x
+        k_input = kv_input
+        v_input = kv_input
+        r_input = x
+        if exists(mem):
+            k_input = torch.cat((mem, k_input), dim = -2)
+            v_input = torch.cat((mem, v_input), dim = -2)
+        q = self.to_q(q_input)
+        k = self.to_k(k_input)
+        # with shared_kv there is no value projection and the keys double as values
+        v = self.to_v(v_input) if exists(self.to_v) else k
+        r = self.to_r(r_input) if exists(self.to_r) else None
+        q = rearrange(q, 'b n (h d) -> b h n d', h = h)
+        if not self.one_kv_head:
+            k, v, r = map(lambda t: maybe(rearrange)(t, 'b n (h d) -> b h n d', h = h), (k, v, r))
+        if self.qk_norm:
+            qk_l2norm = partial(l2norm, groups = self.qk_norm_groups)
+            q, k = map(qk_l2norm, (q, k))
+            q = q * self.qk_norm_q_scale
+            k = k * self.qk_norm_k_scale
+        if exists(rotary_pos_emb) and not has_context:
+            freqs, xpos_scale = rotary_pos_emb
+            l = freqs.shape[-1]
+            q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale ** -1.) if exists(xpos_scale) else (1., 1.)
+            # only the first `l` features are rotated; note the values are rotated as well
+            (ql, qr), (kl, kr), (vl, vr) = map(lambda t: (t[..., :l], t[..., l:]), (q, k, v))
+            ql, kl, vl = map(lambda arg: apply_rotary_pos_emb(arg[0], freqs, arg[1]), ((ql, q_xpos_scale), (kl, k_xpos_scale), (vl, k_xpos_scale)))
+            q, k, v = map(lambda t: torch.cat(t, dim = -1), ((ql, qr), (kl, kr), (vl, vr)))
+        input_mask = default(context_mask, mask)
+        if self.num_mem_kv > 0:
+            mem_k, mem_v = map(lambda t: repeat(t, 'h n d -> b h n d', b = b), (self.mem_k, self.mem_v))
+            if self.qk_norm:
+                mem_k = l2norm(mem_k)
+                mem_k = mem_k * self.qk_norm_k_scale
+            k = torch.cat((mem_k, k), dim = -2)
+            v = torch.cat((mem_v, v), dim = -2)
+            if exists(input_mask):
+                # memory slots are always attendable
+                input_mask = pad_at_dim(input_mask, (self.num_mem_kv, 0), dim = -1, value = True)
+        i, j = map(lambda t: t.shape[-2], (q, k))
+        # determine masking
+        # every entry collected in `masks` is True where attention should be blocked
+        # NOTE(review): the OR-reduced mask is handed to Attend as-is; confirm Attend expects this polarity
+        masks = []
+        final_attn_mask = None
+        if exists(input_mask):
+            input_mask = rearrange(input_mask, 'b j -> b 1 1 j')
+            masks.append(~input_mask)
+        if exists(attn_mask):
+            assert 2 <= attn_mask.ndim <= 4, 'attention mask must have greater than 2 dimensions but less than or equal to 4'
+            if attn_mask.ndim == 2:
+                attn_mask = rearrange(attn_mask, 'i j -> 1 1 i j')
+            elif attn_mask.ndim == 3:
+                attn_mask = rearrange(attn_mask, 'h i j -> 1 h i j')
+            masks.append(~attn_mask)
+        if exists(self.max_attend_past):
+            range_q = torch.arange(j - i, j, device = device)
+            range_k = torch.arange(j, device = device)
+            dist = rearrange(range_q, 'i -> 1 1 i 1') - rearrange(range_k, 'j -> 1 1 1 j')
+            max_attend_past_mask = dist > self.max_attend_past
+            masks.append(max_attend_past_mask)
+        if exists(self.sparse_topk) and self.sparse_topk < j:
+            # the similarity matrix is not available in this method (the former `dots`
+            # reference raised NameError), so rebuild the scaled q.k similarity here with
+            # the same scale that was given to Attend; bias / prev_attn are not included
+            sim_scale = self.qk_norm_scale if self.qk_norm else self.scale
+            kv_einsum_eq = 'b j d' if k.ndim == 3 else 'b h j d'
+            dots = torch.einsum(f'b h i d, {kv_einsum_eq} -> b h i j', q, k) * sim_scale
+            top, _ = dots.topk(self.sparse_topk, dim = -1)
+            vk = rearrange(top[..., -1], '... -> ... 1')
+            sparse_topk_mask = dots < vk
+            masks.append(sparse_topk_mask)
+        if len(masks) > 0:
+            final_attn_mask = or_reduce(masks)
+        # prepare relative positional bias, if needed
+        attn_bias = None
+        if exists(rel_pos):
+            attn_bias = rel_pos(i, j)
+        # attention is all we need
+        out, intermediates = self.attend(
+            q, k, v,
+            mask = final_attn_mask,
+            attn_bias = attn_bias,
+            prev_attn = prev_attn
+        )
+        # https://arxiv.org/abs/2208.06061 proposes to add a residual for better gradients
+        if exists(r):
+            out = out * r + out
+        # normformer scaling of heads
+        if head_scale:
+            out = out * self.head_scale_params
+        # merge heads
+        out = rearrange(out, 'b h n d -> b n (h d)')
+        # alphafold2 styled gating of the values
+        if exists(self.to_v_gate):
+            gates = self.to_v_gate(x)
+            out = out * gates.sigmoid()
+        # combine the heads
+        out = self.to_out(out)
+        if exists(mask):
+            # zero the outputs at positions masked out of the input
+            mask = rearrange(mask, 'b n -> b n 1')
+            out = out.masked_fill(~mask, 0.)
+        return out, intermediates
+class AttentionLayers(nn.Module):
+    """Stack of attention ('a'), cross-attention ('c') and feedforward ('f') layers.
+    Each entry of `self.layers` is `[norms, layer, residual]`, where `norms` holds the
+    optional pre-branch, post-branch and post-main norms. The layer order comes from
+    `custom_layers`, `par_ratio`, `sandwich_coef`, or a default block repeated `depth` times.
+    Keyword arguments prefixed with `ff_` / `attn_` are forwarded (prefix stripped) to
+    `FeedForward` / `Attention`.
+    `heads` falls back to 8 (the default of `Attention`) when left as None.
+    """
+    def __init__(
+        self,
+        dim,
+        depth,
+        heads = None,
+        causal = False,
+        cross_attend = False,
+        only_cross = False,
+        use_scalenorm = False,
+        use_rmsnorm = False,
+        alibi_pos_bias = False,
+        alibi_num_heads = None,
+        alibi_learned = False,
+        rel_pos_bias = False,
+        rel_pos_num_buckets = 32,
+        rel_pos_max_distance = 128,
+        dynamic_pos_bias = False,
+        dynamic_pos_bias_log_distance = False,
+        dynamic_pos_bias_mlp_depth = 2,
+        dynamic_pos_bias_norm = False,
+        rotary_pos_emb = False,
+        rotary_emb_dim = None,
+        rotary_xpos = False,
+        rotary_interpolation_factor=1.,
+        rotary_xpos_scale_base = 512,
+        rotary_base_rescale_factor=1.,
+        custom_layers = None,
+        sandwich_coef = None,
+        par_ratio = None,
+        residual_attn = False,
+        cross_residual_attn = False,
+        macaron = False,
+        pre_norm = True,
+        gate_residual = False,
+        scale_residual = False,
+        scale_residual_constant = 1.,
+        deepnorm = False,
+        shift_tokens = 0,
+        sandwich_norm = False,
+        resi_dual = False,
+        zero_init_branch_output = False,
+        layer_dropout = 0.,
+        cross_attn_tokens_dropout = 0.,
+        **kwargs
+    ):
+        super().__init__()
+        # heads = None used to be forwarded verbatim, overriding Attention's own default
+        # and failing on `dim_head * heads`; fall back to that default (8) instead
+        if not exists(heads):
+            heads = 8
+        rotary_pos_emb = rotary_pos_emb or rotary_xpos
+        ff_kwargs, kwargs = groupby_prefix_and_trim('ff_', kwargs)
+        attn_kwargs, kwargs = groupby_prefix_and_trim('attn_', kwargs)
+        dim_head = attn_kwargs.get('dim_head', DEFAULT_DIM_HEAD)
+        self.dim = dim
+        self.depth = depth
+        self.layers = nn.ModuleList([])
+        self.has_pos_emb = rel_pos_bias or rotary_pos_emb
+        rotary_emb_dim = max(default(rotary_emb_dim, dim_head // 2), 32)
+        assert not (rotary_xpos and not causal), 'rotary xpos is not compatible with bidirectional attention'
+        self.rotary_pos_emb = RotaryEmbedding(rotary_emb_dim, use_xpos = rotary_xpos, scale_base = rotary_xpos_scale_base, interpolation_factor=rotary_interpolation_factor, base_rescale_factor=rotary_base_rescale_factor) if rotary_pos_emb else None
+        assert not (alibi_pos_bias and rel_pos_bias), 'you can only choose Alibi positional bias or T5 relative positional bias, not both'
+        assert rel_pos_num_buckets <= rel_pos_max_distance, 'number of relative position buckets must be less than the relative position max distance'
+        # relative positional bias
+        flash_attn = attn_kwargs.get('flash', False)
+        assert (int(rel_pos_bias) + int(dynamic_pos_bias) + int(alibi_pos_bias)) <= 1, 'you can only choose up to one of t5, alibi, or dynamic positional bias'
+        self.rel_pos = None
+        if rel_pos_bias:
+            assert not flash_attn, 'flash attention not compatible with t5 relative positional bias'
+            self.rel_pos = RelativePositionBias(scale = dim_head ** 0.5, causal = causal, heads = heads, num_buckets = rel_pos_num_buckets, max_distance = rel_pos_max_distance)
+        elif dynamic_pos_bias:
+            assert not flash_attn, 'flash attention not compatible with dynamic positional bias'
+            self.rel_pos = DynamicPositionBias(dim = dim // 4, heads = heads, log_distance = dynamic_pos_bias_log_distance, depth = dynamic_pos_bias_mlp_depth, norm = dynamic_pos_bias_norm)
+        elif alibi_pos_bias:
+            alibi_num_heads = default(alibi_num_heads, heads)
+            assert alibi_num_heads <= heads, 'number of ALiBi heads must be less than the total number of heads'
+            alibi_pos_klass = LearnedAlibiPositionalBias if alibi_learned else AlibiPositionalBias
+            self.rel_pos = alibi_pos_klass(heads = alibi_num_heads, total_heads = heads)
+        # determine deepnorm and residual scale
+        if deepnorm:
+            assert scale_residual_constant == 1, 'scale residual constant is being overridden by deep norm settings'
+            pre_norm = sandwich_norm = resi_dual = False
+            scale_residual = True
+            scale_residual_constant = (2 * depth) ** 0.25
+        assert (int(sandwich_norm) + int(resi_dual)) <= 1, 'either sandwich norm or resiDual is selected, but not both'
+        assert not (not pre_norm and sandwich_norm), 'sandwich norm cannot be used when not using prenorm'
+        assert not (not pre_norm and resi_dual), 'resiDualcannot be used when not using prenorm'
+        self.pre_norm = pre_norm
+        self.sandwich_norm = sandwich_norm
+        self.resi_dual = resi_dual
+        self.residual_attn = residual_attn
+        self.cross_residual_attn = cross_residual_attn
+        self.cross_attend = cross_attend
+        # use_rmsnorm takes precedence over use_scalenorm
+        norm_class = ScaleNorm if use_scalenorm else nn.LayerNorm
+        norm_class = RMSNorm if use_rmsnorm else norm_class
+        norm_fn = partial(norm_class, dim)
+        if cross_attend and not only_cross:
+            default_block = ('a', 'c', 'f')
+        elif cross_attend and only_cross:
+            default_block = ('c', 'f')
+        else:
+            default_block = ('a', 'f')
+        if macaron:
+            default_block = ('f',) + default_block
+        # zero init
+        if zero_init_branch_output:
+            attn_kwargs = {**attn_kwargs, 'zero_init_output':  True}
+            ff_kwargs = {**ff_kwargs, 'zero_init_output':  True}
+        # calculate layer block order
+        if exists(custom_layers):
+            layer_types = custom_layers
+        elif exists(par_ratio):
+            par_depth = depth * len(default_block)
+            assert 1 < par_ratio <= par_depth, 'par ratio out of range'
+            default_block = tuple(filter(not_equals('f'), default_block))
+            par_attn  = par_depth // par_ratio
+            depth_cut = par_depth * 2 // 3  # 2 / 3 attention layer cutoff suggested by PAR paper
+            par_width = (depth_cut + depth_cut // par_attn) // par_attn
+            assert len(default_block) <= par_width, 'default block is too large for par_ratio'
+            par_block = default_block + ('f',) * (par_width - len(default_block))
+            par_head = par_block * par_attn
+            layer_types = par_head + ('f',) * (par_depth - len(par_head))
+        elif exists(sandwich_coef):
+            assert sandwich_coef > 0 and sandwich_coef <= depth, 'sandwich coefficient should be less than the depth'
+            layer_types = ('a',) * sandwich_coef + default_block * (depth - sandwich_coef) + ('f',) * sandwich_coef
+        else:
+            layer_types = default_block * depth
+        self.layer_types = layer_types
+        self.num_attn_layers = len(list(filter(equals('a'), layer_types)))
+        # stochastic depth
+        self.layer_dropouts = cast_tuple(layer_dropout, len(layer_types))
+        # structured dropout for cross attending
+        self.cross_attn_tokens_dropout = cross_attn_tokens_dropout
+        # calculate token shifting
+        shift_tokens = cast_tuple(shift_tokens, len(layer_types))
+        # iterate and construct layers
+        for ind, (layer_type, layer_shift_tokens) in enumerate(zip(self.layer_types, shift_tokens)):
+            is_last_layer = ind == (len(self.layer_types) - 1)
+            if layer_type == 'a':
+                layer = Attention(dim, heads = heads, causal = causal, **attn_kwargs)
+            elif layer_type == 'c':
+                # cross attention is never causal
+                layer = Attention(dim, heads = heads, **attn_kwargs)
+            elif layer_type == 'f':
+                layer = FeedForward(dim, **ff_kwargs)
+                layer = layer if not macaron else Scale(0.5, layer)
+            else:
+                raise Exception(f'invalid layer type {layer_type}')
+            if layer_shift_tokens > 0:
+                # causal stacks only use shifts 0..k, otherwise -k..k
+                shift_range_upper = layer_shift_tokens + 1
+                shift_range_lower = -layer_shift_tokens if not causal else 0
+                layer = ShiftTokens(range(shift_range_lower, shift_range_upper), layer)
+            residual_fn = GRUGating if gate_residual else Residual
+            residual = residual_fn(dim, scale_residual = scale_residual, scale_residual_constant = scale_residual_constant)
+            pre_branch_norm = norm_fn() if pre_norm else None
+            post_branch_norm = norm_fn() if sandwich_norm else None
+            post_main_norm = norm_fn() if (resi_dual or not pre_norm) and not is_last_layer else None
+            norms = nn.ModuleList([
+                pre_branch_norm,
+                post_branch_norm,
+                post_main_norm
+            ])
+            self.layers.append(nn.ModuleList([
+                norms,
+                layer,
+                residual
+            ]))
+            self.layers_length = len(self.layers) # It doesn't work if called after
+        if deepnorm:
+            init_gain = (8 * depth) ** -0.25
+            deepnorm_init(self, init_gain)
+    def forward(
+        self,
+        x,
+        context = None,
+        mask = None,
+        context_mask = None,
+        attn_mask = None,
+        self_attn_context_mask = None,
+        mems = None,
+        return_hiddens = False
+    ):
+        """Run `x` through every layer in order.
+        context / context_mask: input and mask for cross-attention layers; `context` must
+            be given exactly when the stack was built with cross_attend = True.
+        mask / attn_mask / self_attn_context_mask: forwarded to the self-attention layers.
+        mems: optional list with one memory entry per self-attention layer (consumed in order).
+        return_hiddens: also return a `LayerIntermediates` holding the input of every
+            self-attention layer and the attention intermediates.
+        Returns `x`, or `(x, intermediates)` when `return_hiddens` is set.
+        """
+        assert not (self.cross_attend ^ exists(context)), 'context must be passed in if cross_attend is set to True'
+        hiddens = []
+        intermediates = []
+        prev_attn = None
+        prev_cross_attn = None
+        # copy so that popping below does not mutate the caller's list
+        mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers
+        rotary_pos_emb = None
+        if exists(self.rotary_pos_emb):
+            # rotary table must cover the longest memory plus the current sequence
+            max_rotary_emb_length = max(list(map(lambda m: (m.shape[1] if exists(m) else 0) + x.shape[1], mems)))
+            rotary_pos_emb = self.rotary_pos_emb(max_rotary_emb_length, x.device)
+        outer_residual = x
+        for layer_type, (norm, block, residual_fn), layer_dropout in zip(self.layer_types, self.layers, self.layer_dropouts):
+            # stochastic depth: randomly skip the whole layer during training
+            if self.training and layer_dropout > 0. and random() < layer_dropout:
+                continue
+            if layer_type == 'a':
+                if return_hiddens:
+                    hiddens.append(x)
+                layer_mem = mems.pop(0) if mems else None
+            if layer_type == 'c':
+                if self.training and self.cross_attn_tokens_dropout > 0.:
+                    context, context_mask = dropout_seq(context, context_mask, self.cross_attn_tokens_dropout)
+            inner_residual = x
+            pre_norm, post_branch_norm, post_main_norm = norm
+            if exists(pre_norm) and not self.resi_dual:
+                x = pre_norm(x)
+            if layer_type == 'a':
+                out, inter = block(x, mask = mask, context_mask = self_attn_context_mask, attn_mask = attn_mask, rel_pos = self.rel_pos, rotary_pos_emb = rotary_pos_emb, prev_attn = prev_attn, mem = layer_mem)
+            elif layer_type == 'c':
+                out, inter = block(x, context = context, mask = mask, context_mask = context_mask, prev_attn = prev_cross_attn)
+            elif layer_type == 'f':
+                out = block(x)
+            if self.resi_dual:
+                outer_residual = residual_fn(out, outer_residual)
+            if exists(post_branch_norm):
+                out = post_branch_norm(out)
+            x = residual_fn(out, inner_residual)
+            if layer_type in ('a', 'c') and return_hiddens:
+                intermediates.append(inter)
+            if layer_type == 'a' and self.residual_attn:
+                prev_attn = inter.pre_softmax_attn
+            elif layer_type == 'c' and self.cross_residual_attn:
+                prev_cross_attn = inter.pre_softmax_attn
+            if exists(post_main_norm):
+                x = post_main_norm(x)
+        if self.resi_dual:
+            # NOTE(review): this reuses the pre-norm of the last executed layer as the
+            # final norm and needs at least one layer to have run - confirm this is intended
+            x = x + pre_norm(outer_residual)
+        if return_hiddens:
+            intermediates = LayerIntermediates(
+                hiddens = hiddens,
+                attn_intermediates = intermediates
+            )
+            return x, intermediates
+        return x
+class Decoder(AttentionLayers):
+    """`AttentionLayers` with causal attention forced on; `causal` may not be passed in."""
+    def __init__(self, **kwargs):
+        caller_set_causal = 'causal' in kwargs
+        assert not caller_set_causal, 'cannot set causality on decoder'
+        super().__init__(causal = True, **kwargs)
+class Transformer(nn.Module):
+    """Token-level transformer: embeddings -> attention layers -> final norm -> logits.
+    num_tokens: vocabulary size (also the default logits width).
+    max_seq_len: maximum sequence length, used by the absolute positional embedding.
+    attn_layers: an `AttentionLayers` instance; its `dim` is the model width.
+    embedding_provider: forwarded to `TokenEmbedding`.
+    emb_dim: embedding width (falls back to the model width; projected when different).
+    max_mem_len: number of trailing positions kept per layer when memories are returned.
+    shift_mem_down: rotate the incoming `mems` list by this many entries.
+    num_memory_tokens: learned tokens prepended to every sequence.
+    tie_embedding: compute logits with the transposed token-embedding weight.
+    emb_frac_gradient: fraction of the gradient allowed to reach the embeddings.
+    """
+    def __init__(
+        self,
+        *,
+        num_tokens,
+        max_seq_len,
+        attn_layers,
+        # tokenizer: BaseTokenizer,
+        embedding_provider: BaseEmbedding,
+        emb_dim = None,
+        max_mem_len = 0.,
+        shift_mem_down = 0,
+        emb_dropout = 0.,
+        post_emb_norm = False,
+        num_memory_tokens = None,
+        tie_embedding = False,
+        logits_dim = None,
+        use_abs_pos_emb = True,
+        scaled_sinu_pos_emb = False,
+        l2norm_embed = False,
+        emb_frac_gradient = 1. # GLM-130B and Cogview successfully used this, set at 0.1
+    ):
+        super().__init__()
+        assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder'
+        dim = attn_layers.dim
+        emb_dim = default(emb_dim, dim)
+        self.emb_dim = emb_dim
+        self.num_tokens = num_tokens
+        self.max_seq_len = max_seq_len
+        self.max_mem_len = max_mem_len
+        self.shift_mem_down = shift_mem_down
+        self.l2norm_embed = l2norm_embed
+        self.token_emb = TokenEmbedding(emb_dim, num_tokens, embedding_provider, l2norm_embed=l2norm_embed)
+        # no absolute positions when disabled or when the layers carry their own positional scheme
+        if not (use_abs_pos_emb and not attn_layers.has_pos_emb):
+            self.pos_emb = always(0)
+        elif scaled_sinu_pos_emb:
+            self.pos_emb = ScaledSinusoidalEmbedding(emb_dim)
+        else:
+            self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len, l2norm_embed = l2norm_embed)
+        self.emb_frac_gradient = emb_frac_gradient # fraction of the gradient that should go to the embedding, https://arxiv.org/abs/2105.13290
+        self.post_emb_norm = nn.LayerNorm(emb_dim) if post_emb_norm else nn.Identity()
+        self.emb_dropout = nn.Dropout(emb_dropout)
+        self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity()
+        self.attn_layers = attn_layers
+        self.norm = nn.LayerNorm(dim)
+        self.init_()
+        logits_dim = default(logits_dim, num_tokens)
+        # tied logits read the embedding matrix through `.emb.weight`, the same attribute
+        # path init_ uses (the former `self.token_emb.weight` did not match it)
+        self.to_logits = nn.Linear(dim, logits_dim) if not tie_embedding else lambda t: t @ self.token_emb.emb.weight.t()
+        # memory tokens (like [cls]) from Memory Transformers paper
+        num_memory_tokens = default(num_memory_tokens, 0)
+        self.num_memory_tokens = num_memory_tokens
+        if num_memory_tokens > 0:
+            self.memory_tokens = nn.Parameter(torch.randn(num_memory_tokens, dim))
+    def init_(self):
+        """Initialise the embedding weights (tiny normal init when l2norm_embed is set, kaiming otherwise)."""
+        if self.l2norm_embed:
+            nn.init.normal_(self.token_emb.emb.weight, std = 1e-5)
+            if not isinstance(self.pos_emb, always):
+                nn.init.normal_(self.pos_emb.emb.weight, std = 1e-5)
+            return
+        nn.init.kaiming_normal_(self.token_emb.emb.weight)
+    def forward(
+        self,
+        x,
+        return_embeddings = False,
+        return_logits_and_embeddings = False,
+        return_intermediates = False,
+        mask = None,
+        return_mems = False,
+        return_attn = False,
+        mems = None,
+        pos = None,
+        prepend_embeds = None,
+        sum_embeds = None,
+        **kwargs
+    ):
+        """Embed the token ids `x` (two-dimensional, unpacked as batch x sequence) and run the model.
+        return_embeddings: return the normalised hidden states instead of logits.
+        return_logits_and_embeddings: return `(logits, hidden states)`.
+        return_intermediates: return `(out, intermediates)`.
+        return_mems: return `(out, new_mems)`, one detached tensor per attention layer.
+        return_attn: return `(out, attn_maps)` with the post-softmax attention of every layer.
+        mems: per-layer memories forwarded to the attention layers.
+        pos: positions for the positional embedding; a non-long tensor is used directly
+            as the positional embedding.
+        prepend_embeds: embeddings concatenated in front of the token embeddings.
+        sum_embeds: embeddings added to the token embeddings.
+        Remaining keyword arguments are forwarded to the attention layers.
+        When several return flags are set, precedence is intermediates, mems, attn.
+        """
+        b, n, device, num_mem, emb_frac_gradient = *x.shape, x.device, self.num_memory_tokens, self.emb_frac_gradient
+        # return_intermediates also needs the layer intermediates; leaving it out left
+        # `intermediates` undefined when that flag was the only one set
+        return_hiddens = return_mems or return_attn or return_intermediates
+        # absolute positional embedding
+        external_pos_emb = exists(pos) and pos.dtype != torch.long
+        pos_emb = self.pos_emb(x, pos = pos) if not external_pos_emb else pos
+        x = self.token_emb(x) + pos_emb
+        # for summing embeddings passed externally - needs this for self-conditioning in non-autoregressive training
+        if exists(sum_embeds):
+            x = x + sum_embeds
+        # post embedding norm, purportedly leads to greater stabilization
+        x = self.post_emb_norm(x)
+        # whether to append embeds, as in PaLI, for image embeddings
+        if exists(prepend_embeds):
+            prepend_seq, prepend_dim = prepend_embeds.shape[1:]
+            assert prepend_dim == x.shape[-1], 'prepended embeddings need to have same dimensions as text model dimensions'
+            x = torch.cat((prepend_embeds, x), dim = -2)
+        # whether to reduce the gradient going to the embedding, from cogview paper, corroborated by GLM-130B model
+        if emb_frac_gradient < 1:
+            assert emb_frac_gradient > 0
+            # value is unchanged; only the given fraction of the gradient flows back
+            x = x * emb_frac_gradient + x.detach() * (1 - emb_frac_gradient)
+        # embedding dropout
+        x = self.emb_dropout(x)
+        x = self.project_emb(x)
+        if num_mem > 0:
+            mem = repeat(self.memory_tokens, 'n d -> b n d', b = b)
+            x = torch.cat((mem, x), dim = 1)
+            # auto-handle masking after appending memory tokens
+            if exists(mask):
+                mask = pad_at_dim(mask, (num_mem, 0), dim = -1, value = True)
+        if self.shift_mem_down and exists(mems):
+            mems_l, mems_r = mems[:self.shift_mem_down], mems[self.shift_mem_down:]
+            mems = [*mems_r, *mems_l]
+        if return_hiddens:
+            x, intermediates = self.attn_layers(x, mask = mask, mems = mems, return_hiddens = True, **kwargs)
+        else:
+            x = self.attn_layers(x, mask = mask, mems = mems, **kwargs)
+        x = self.norm(x)
+        # strip the memory tokens again
+        mem, x = x[:, :num_mem], x[:, num_mem:]
+        if return_logits_and_embeddings:
+            out = (self.to_logits(x), x)
+        elif return_embeddings:
+            out = x
+        else:
+            out = self.to_logits(x)
+        if return_intermediates:
+            return out, intermediates
+        if return_mems:
+            hiddens = intermediates.hiddens
+            new_mems = list(map(lambda pair: torch.cat(pair, dim = -2), zip(mems, hiddens))) if exists(mems) else hiddens
+            # slice bounds must be integers, but the constructor default is the float 0.
+            # note: a value of 0 yields the slice [-0:], which keeps every position
+            max_mem_len = int(self.max_mem_len)
+            new_mems = list(map(lambda t: t[..., -max_mem_len:, :].detach(), new_mems))
+            return out, new_mems
+        if return_attn:
+            attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates))
+            return out, attn_maps
+        return out

Andromeda/dataset_prep/__init__.py ADDED Viewed

File without changes

Andromeda/dataset_prep/books.py ADDED Viewed

	@@ -0,0 +1,12 @@

+# from Andromeda.dataset_builder import DatasetBuilder
+from build_dataset import DatasetBuilder
+# Configure a DatasetBuilder for the "the_pile_books3" dataset with 8192-token
+# sequences, the GPT-NeoX-20B tokenizer and 4 CPU workers.
+# NOTE(review): hf_account_repo is presumably the Hugging Face repo the result is
+# pushed to - confirm against DatasetBuilder.
+builder = DatasetBuilder(
+    dataset_name="the_pile_books3",
+    seq_len=8192,
+    num_cpu=4,
+    hf_account_repo="kye/the_pile_books3_GPTNeox-8192",
+    tokenizer="EleutherAI/gpt-neox-20b",
+)
+# Runs at import time: building starts as soon as this module is executed.
+dataset = builder.build_dataset()

Andromeda/inference.py ADDED Viewed

	@@ -0,0 +1,198 @@

+import torch
+from transformers import AutoTokenizer
+from einops._torch_specific import allow_ops_in_compiled_graph
+import argparse
+# class AndromedaEval:
+#     def __init__(self, path, seed=42, device=None):
+#         self.path = path
+#         self.seed = seed
+#         self.device = device
+#         if self.device is None:
+#             self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+#         set_seed(self.seed)
+#         #tokenizer
+#         self.tokenizer = AndromedaTokenizer
+#         #model
+#         self.model = Andromeda
+#         #checkpoint
+#         self.model.load_state_dict(torch.load(self.path))
+#         self.model.eval()
+#         #device
+#         self.model = self.model.to(self.device)
+#         #metrics
+#         self.metrics = {}
+#         self.reset_metrics()
+#     def reset_metrics(self):
+#         self.metrics = {
+#             "generation_steps": None,
+#             "time_forward": [],
+#             "time_forward_average": None,
+#             "memory_usages": [],
+#             "memory_usage_average": None,
+#             "time_end_to_end": None,
+#             "throughput": None
+#         }
+#     def get_num_params(self):
+#         num_params = sum(param.numel() for param in self.model.parameters() if param.requires_grad)
+#         return num_params
+#     def generate(self, prompt, generation_steps=32):
+#         #make sure all of the metrics reset at every generation
+#         self.reset_metrics()
+#         self.metrics["generation_steps"] = generation_steps
+#         tokens = self.tokenizer.encode(prompt)
+#         tokens_new = []
+#         time_end_to_end = time.time()
+#         #generation loop
+#         for _ in range(generation_steps):
+#             tokens_tensor = torch.tensor([tokens], device=self.device)
+#             #forward pass
+#             tracemalloc.start()
+#             time_forward_0 = time.time()
+#             logits = self.model(tokens_tensor, return_loss=False)[:, -1] # no loss takes the output of the last tokens
+#             time_forward_1 = time.time()
+#             _, memory_usage = tracemalloc.get_traced_memory()
+#             tracemalloc.stop()
+#             self.metrics["memory_usages"].append(memory_usage)
+#             time_forward = time_forward_1 - time_forward_0
+#             self.metrics["times_forward"].append(time_forward)
+#             next_token = torch.armax(logits).item()
+#             #save the newly generated token
+#             tokens.append(next_token)
+#             tokens_new.append(next_token)
+#         time_end_to_end_1 = time.time()
+#         time_end_to_end = time_end_to_end_1 - time_end_to_end_0
+#         self.metrics["time_end_to_end"] = time_end_to_end
+#         decoded = self.tokenizer.decode(tokens)
+#         self.metrics["time_forward_average"] = np.mean(self.metrics["times_forward"])
+#         self.metrics["memory_usage_average"] = np.mean(self.metrics["memory_usage"])
+#         self.metrics['throughput'] = generation_steps / np.sum(self.metrics["times_forward"])
+#         return tokens_new, decoded
+# def main():
+#     prompt = 'My name is'
+#     andromeda = EvalAndromeda(path='checkpoints/step_44927_6656/pytorch_model.bin')
+#     num_params = Andromeda.get_num_params()
+#     print(f'The model has {num_params} parameters')
+#     _, output = Andromeda.generate(prompt)
+#     for metric, value in Andromeda.metrics.items():
+#         print(f'{metric}: {value}\n')
+#     print('\n')
+#     print(output)
+def main():
+    allow_ops_in_compiled_graph()
+    torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
+    parser = argparse.ArgumentParser(description="Generate text using Andromeda model")
+    parser.add_argument("prompt", type=str, help="Text prompt to generate text")
+    parser.add_argument(
+        "--seq_len", type=int, default=256, help="Sequence length for generated text"
+    )
+    parser.add_argument(
+        "--temperature", type=float, default=0.8, help="Sampling temperature"
+    )
+    parser.add_argument(
+        "--filter_thres", type=float, default=0.9, help="Filter threshold for sampling"
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="andromeda-e-1",
+        help="Model to use for generation",
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="fp32",
+        help="Data type for the model: 'bf16', or 'fp32'",
+    )
+    args = parser.parse_args()
+    dtype = torch.float32
+    if args.dtype == 'bf16':
+        dtype = torch.bfloat16
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    #need to submit to torch hub
+    model = torch.hub.load("apacai/andromeda", args.model).to(device).to(dtype)
+    opt_model = torch.compile(model, backend="hidet")
+    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+    encoded_text = tokenizer(args.prompt, return_tensors="pt")
+    output_tensor = opt_model.generate(
+        seq_len=args.seq_len,
+        prompt=encoded_text["input_ids"].to(device),
+        temperature=args.temperature,
+        filter_thres=args.filter_thres,
+        pad_value=0.0,
+        eos_token=tokenizer.eos_token_id,
+        return_seq_without_prompt=False,
+        use_tqdm=True,
+    )
+    decoded_output = tokenizer.batch_decode(output_tensor, skip_special_tokens=True)
+    return decoded_output
+if __name__ == "__main__":
+    generated_text = main()
+    for text in generated_text:
+        print(f"{text}")

Andromeda/model.py ADDED Viewed

	@@ -0,0 +1,118 @@

+from torch.nn import Module
+from Andromeda.core.transformer import Transformer, AutoregressiveWrapper, AndromedaEmbedding, Decoder
+from transformers import AutoTokenizer
+class AndromedaTokenizer:
+    def __init__(self):
+        self.tokenizer= AutoTokenizer.from_pretrained(
+            "EleutherAI/gpt-neox-20b",
+            eos_token="<eos>",
+            pad_token="<pad>",
+            extra_ids=0,
+            model_max_length=8192
+        )
+    def tokenize_texts(self, texts):
+        return self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True).input_ids
+    def decode(self, texts):
+        return self.tokenizer.decode(texts)
+    def __len__(self):
+        num_tokens = len(self.tokenizer)
+        return num_tokens
+class Andromeda(Module):
+    """
+    Andromeda is a transformer-based model architecture. It initializes with
+    a Transformer and AutoregressiveWrapper with default or user-specified parameters.
+    """
+    def __init__(self,
+                 num_tokens=50432,
+                 max_seq_len=8192,
+                 dim=2560,
+                 depth=32,
+                 dim_head=128,
+                 heads=24,
+                 use_abs_pos_emb=False,
+                 alibi_pos_bias=True,
+                 alibi_num_heads=12,
+                 rotary_xpos=True,
+                 attn_flash=True,
+                #  shift_tokens=1,
+                 attn_one_kv_head=True,  # multiquery attention
+                 qk_norm=True,
+                 attn_qk_norm=True,
+                 attn_qk_norm_dim_scale=True,
+                 embedding_provider=AndromedaEmbedding()):
+        """
+        Initialize the model with specified or default parameters.
+        Args:
+        - num_tokens: Number of tokens in the vocabulary
+        - max_seq_len: Maximum sequence length
+        - dim: Dimension of the model
+        - depth: Depth of the model
+        - dim_head: Dimension of the model head
+        - heads: Number of heads
+        - use_abs_pos_emb: Whether to use absolute position embedding
+        - alibi_pos_bias: Alibi position bias
+        - alibi_num_heads: Number of alibi heads
+        - rotary_xpos: Rotary position
+        - attn_flash: Attention flash
+        - deepnorm: Deep normalization
+        - shift_tokens: Number of tokens to shift
+        - attn_one_kv_head: Attention one key/value head
+        - qk_norm: Query-key normalization
+        - attn_qk_norm: Attention query-key normalization
+        - attn_qk_norm_dim_scale: Attention query-key normalization dimension scale
+        - embedding_provider: Embedding provider module
+        """
+        super().__init__()
+        try:
+            self.Andromeda = Transformer(
+                num_tokens=num_tokens,
+                max_seq_len=max_seq_len,
+                use_abs_pos_emb=use_abs_pos_emb,
+                embedding_provider=embedding_provider,
+                attn_layers=Decoder(
+                    dim=dim,
+                    depth=depth,
+                    dim_head=dim_head,
+                    heads=heads,
+                    alibi_pos_bias=alibi_pos_bias,
+                    alibi_num_heads=alibi_num_heads,
+                    rotary_xpos=rotary_xpos,
+                    attn_flash=attn_flash,
+                    # deepnorm=deepnorm,
+                    # shift_tokens=shift_tokens,
+                    attn_one_kv_head=attn_one_kv_head,
+                    qk_norm=qk_norm,
+                    attn_qk_norm=attn_qk_norm,
+                    attn_qk_norm_dim_scale=attn_qk_norm_dim_scale
+                )
+            )
+            self.decoder = AutoregressiveWrapper(self.Andromeda)
+        except Exception as e:
+            print("Failed to initialize Andromeda: ", e)
+            raise
+    def forward(self, text_tokens, **kwargs):
+        """
+        Forward pass through the model. It expects the input text_tokens.
+        Args:
+        - text_tokens: Input tokens
+        - kwargs: Other arguments
+        Returns:
+        - output from the decoder
+        """
+        try:
+            model_input = self.decoder.forward(text_tokens)[0]
+            return self.decoder(model_input, padded_x=model_input[0])
+        except Exception as e:
+            print("Failed in forward method: ", e)
+            raise

Andromeda/old/__init__.py ADDED Viewed

File without changes

Andromeda/old/sophia.py ADDED Viewed

	@@ -0,0 +1,200 @@

+import torch
+from torch import Tensor
+from torch.optim.optimizer import Optimizer
+from typing import List
+class SophiaG(Optimizer):
+    def __init__(self, params, lr=1e-4, betas=(0.965, 0.99), rho = 0.04,
+         weight_decay=1e-1, *, maximize: bool = False,
+         capturable: bool = False):
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+        if not 0.0 <= rho:
+            raise ValueError("Invalid rho parameter at index 1: {}".format(rho))
+        if not 0.0 <= weight_decay:
+            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+        defaults = dict(lr=lr, betas=betas, rho=rho,
+                        weight_decay=weight_decay,
+                        maximize=maximize, capturable=capturable)
+        super(SophiaG, self).__init__(params, defaults)
+    def __setstate__(self, state):
+        super().__setstate__(state)
+        for group in self.param_groups:
+            group.setdefault('maximize', False)
+            group.setdefault('capturable', False)
+        state_values = list(self.state.values())
+        step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['step'])
+        if not step_is_tensor:
+            for s in state_values:
+                s['step'] = torch.tensor(float(s['step']))
+    @torch.no_grad()
+    def update_hessian(self):
+        for group in self.param_groups:
+            beta1, beta2 = group['betas']
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                state = self.state[p]
+                if len(state) == 0:
+                    state['step'] = torch.zeros((1,), dtype=torch.float, device=p.device) \
+                        if self.defaults['capturable'] else torch.tensor(0.)
+                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                if 'hessian' not in state.keys():
+                    state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                state['hessian'].mul_(beta2).addcmul_(p.grad, p.grad, value=1 - beta2)
+    @torch.no_grad()
+    def step(self, closure=None, bs=5120):
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+        for group in self.param_groups:
+            params_with_grad = []
+            grads = []
+            exp_avgs = []
+            state_steps = []
+            hessian = []
+            beta1, beta2 = group['betas']
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                params_with_grad.append(p)
+                if p.grad.is_sparse:
+                    raise RuntimeError('Hero does not support sparse gradients')
+                grads.append(p.grad)
+                state = self.state[p]
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = torch.zeros((1,), dtype=torch.float, device=p.device) \
+                        if self.defaults['capturable'] else torch.tensor(0.)
+                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                if 'hessian' not in state.keys():
+                    state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                exp_avgs.append(state['exp_avg'])
+                state_steps.append(state['step'])
+                hessian.append(state['hessian'])
+                if self.defaults['capturable']:
+                    bs = torch.ones((1,), dtype=torch.float, device=p.device) * bs
+            sophiag(params_with_grad,
+                  grads,
+                  exp_avgs,
+                  hessian,
+                  state_steps,
+                  bs=bs,
+                  beta1=beta1,
+                  beta2=beta2,
+                  rho=group['rho'],
+                  lr=group['lr'],
+                  weight_decay=group['weight_decay'],
+                  maximize=group['maximize'],
+                  capturable=group['capturable'])
+        return loss
+def sophiag(params: List[Tensor],
+          grads: List[Tensor],
+          exp_avgs: List[Tensor],
+          hessian: List[Tensor],
+          state_steps: List[Tensor],
+          capturable: bool = False,
+          *,
+          bs: int,
+          beta1: float,
+          beta2: float,
+          rho: float,
+          lr: float,
+          weight_decay: float,
+          maximize: bool):
+    if not all(isinstance(t, torch.Tensor) for t in state_steps):
+        raise RuntimeError("API has changed, `state_steps` argument must contain a list of singleton tensors")
+    func = _single_tensor_sophiag
+    func(params,
+         grads,
+         exp_avgs,
+         hessian,
+         state_steps,
+         bs=bs,
+         beta1=beta1,
+         beta2=beta2,
+         rho=rho,
+         lr=lr,
+         weight_decay=weight_decay,
+         maximize=maximize,
+         capturable=capturable)
+def _single_tensor_sophiag(params: List[Tensor],
+                         grads: List[Tensor],
+                         exp_avgs: List[Tensor],
+                         hessian: List[Tensor],
+                         state_steps: List[Tensor],
+                         *,
+                         bs: int,
+                         beta1: float,
+                         beta2: float,
+                         rho: float,
+                         lr: float,
+                         weight_decay: float,
+                         maximize: bool,
+                         capturable: bool):
+    for i, param in enumerate(params):
+        grad = grads[i] if not maximize else -grads[i]
+        exp_avg = exp_avgs[i]
+        hess = hessian[i]
+        step_t = state_steps[i]
+        if capturable:
+            assert param.is_cuda and step_t.is_cuda and bs.is_cuda
+        if torch.is_complex(param):
+            grad = torch.view_as_real(grad)
+            exp_avg = torch.view_as_real(exp_avg)
+            hess = torch.view_as_real(hess)
+            param = torch.view_as_real(param)
+        # update step
+        step_t += 1
+        # Perform stepweight decay
+        param.mul_(1 - lr * weight_decay)
+        # Decay the first and second moment running average coefficient
+        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+        if capturable:
+            step_size = lr
+            step_size_neg = step_size.neg()
+            ratio = (exp_avg.abs() / (rho * bs * hess + 1e-15)).clamp(None,1)
+            param.addcmul_(exp_avg.sign(), ratio, value=step_size_neg)
+        else:
+            step_t.item()
+            step_size_neg = - lr
+            ratio = (exp_avg.abs() / (rho * bs * hess + 1e-15)).clamp(None,1)
+            param.addcmul_(exp_avg.sign(), ratio, value=step_size_neg)

Andromeda/old/training.py ADDED Viewed

	@@ -0,0 +1,294 @@

+#quantization + paralleism
+import time
+import torch
+from accelerate.utils import set_seed
+from datasets import load_dataset
+from torch.nn import CrossEntropyLoss
+from torch.utils.data import DataLoader
+from transformers import default_data_collator, get_linear_schedule_with_warmup
+from accelerate import Accelerator
+from rich.progress import Progress
+from lion_pytorch import Lion
+# from x_transformers import Transformer, Decoder, AutoregressiveWrapper
+from optimus_prim import Transformer, Decoder, AutoregressiveWrapper
+from torch.nn.parallel import DataParallel, DistributedDataParallel
+import torch.distributed as dist
+from torch.distributed.fsdp import (
+    FullyShardedDataParallel,
+    CPUOffload,
+)
+from torch.distributed.fsdp.wrap import (
+    default_auto_wrap_policy,
+)
+from transformers import AutoTokenizer
+#logging
+import boto3
+#training
+import wandb
+from torch.utils.tensorboard import SummaryWriter
+class CustomGPTNeoXTokenizer:
+    def __init__(self):
+        self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+    def tokenize(self, text):
+        return self.tokenizer(text, return_tensors="pt", truncation=True, padding=True)
+custom_tokenizer = CustomGPTNeoXTokenizer()
+Andromeda = Transformer(
+    num_tokens=64007,
+    max_seq_len=8192,
+    use_abs_pos_emb = False,
+    tokenizer=custom_tokenizer,
+    attn_layers = Decoder(
+        dim=2048,
+        depth=6,
+        heads=16,
+        alibi_pos_bias=True,
+        alibi_num_heads=8,
+        rotary_xpos=True,
+        attn_flash = True,
+        deepnorm=True,
+        shift_tokens=1,
+        attn_one_kv_head = True,
+        qk_norm=True
+    )
+)
+Andromeda = AutoregressiveWrapper(Andromeda)
+AWS_ACCESS_KEY_ID=""
+AWS_SECRET_ACCESS_KEY="d"
+def save_model_to_s3(model, bucket_name, key_prefix, step):
+    s3 = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
+    model_path = f"checkpoint_at_step_{step}.pt"
+    torch.save(model.state_dict(), model_path)
+    s3.upload_file(model_path, bucket_name, f"{key_prefix}/{model_path}")
+def count_number_of_parameters(model, only_trainable: bool = True) -> int:
+    if only_trainable:
+        num_params: int = sum(p.numel()
+                              for p in model.parameters() if p.requires_grad)
+    else:
+        num_params: int = sum(p.numel() for p in model.parameters() if p)
+    return int(num_params)
+def prep_sample(sample):
+    title = sample["title"]
+    text = sample["text"]
+    return {
+        "title": title,
+        "text": text
+    }
+def train(args):
+    """Run the legacy Andromeda training loop.
+
+    Builds the (optionally distributed) model wrappers, a Lion optimizer with a
+    linear warmup schedule and a dataloader over ``the_pile_books3``, then
+    trains for ``args.max_steps`` steps while logging to wandb, TensorBoard
+    and accelerate, and periodically checkpointing locally and to S3.
+
+    Args:
+        args: Parsed command-line namespace. Reads ``use_ddp``, ``seed``,
+            ``learning_rate``, ``weight_decay``, ``warmup_steps``,
+            ``max_steps``, ``batch_size``, ``log_every``, ``save_every`` and
+            ``checkpoint_dir``.
+    """
+    if args.use_ddp:
+        dist.init_process_group(backend="nccl")
+    accelerator = Accelerator(
+        mixed_precision="fp16",
+        gradient_accumulation_steps=1,
+    )
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+    #v1
+    # NOTE(review): the module-level ``Andromeda`` is already an
+    # AutoregressiveWrapper instance, so this calls it with no arguments
+    # rather than constructing a model -- confirm the intended behavior.
+    model = Andromeda()
+    if args.use_ddp:
+        model = DistributedDataParallel(model)
+    else:
+        model = DataParallel(model)
+    # NOTE(review): ``model()`` invokes the wrapped model again instead of passing it -- confirm.
+    fsdp_model = FullyShardedDataParallel(
+        model(),
+        fsdp_auto_wrap_policy=default_auto_wrap_policy,
+        cpu_offload=CPUOffload(offload_params=True),
+    )
+    fsdp_model = fsdp_model.to(accelerator.device)
+    #device count
+    if torch.cuda.device_count() > 1:
+        # NOTE(review): the ``$`` is printed literally; f-strings only need ``{...}``.
+        print(f"Let's use ${torch.cuda.device_count()} GPUS")
+    # Lion is configured with a third of the requested learning rate and three times the weight decay.
+    optimizer = Lion(model.parameters(), lr=args.learning_rate / 3, weight_decay=args.weight_decay * 3)
+    # NOTE(review): ``lr_scheduler.step()`` is never called in this function,
+    # so the schedule does not advance during training -- confirm.
+    lr_scheduler = get_linear_schedule_with_warmup(
+        optimizer=optimizer,
+        num_warmup_steps=args.warmup_steps,
+        num_training_steps=args.max_steps,
+    )
+    # tokenizer = KosmosTokenizer()
+    #====================> load data #====================> load data #====================> load data
+    dataset = load_dataset("the_pile_books3")
+    # dataset = dataset.map(prep_sample, num_proc=8)
+    dataset = dataset.map(prep_sample, num_proc=8)
+    #new removed columns
+    remove_columns = ['title']
+    # NOTE(review): relies on ``Andromeda.decoder.tokenizer`` existing on the module-level wrapper -- verify.
+    dataset = dataset.map(Andromeda.decoder.tokenizer, batched=True,
+                          batch_size=128, remove_columns=remove_columns)
+    train_dataloader = DataLoader(
+        dataset, collate_fn=default_data_collator, batch_size=args.batch_size, pin_memory=True
+    )
+    #====================> load data #====================> load data #====================> load data #====================> load data
+    fsdp_model, train_dataloader, optimizer, lr_scheduler = accelerator.prepare(fsdp_model, train_dataloader, optimizer,
+                                                                           lr_scheduler)
+    fsdp_model.train()
+    accelerator.register_for_checkpointing(lr_scheduler)
+    # Both lines below count trainable parameters: ``only_trainable`` defaults to True.
+    accelerator.print(
+        f"Number of parameters: {count_number_of_parameters(model):,}")
+    accelerator.print(
+        f"Number of trainable parameters: {count_number_of_parameters(model, only_trainable=True):,}")
+    # Log model and optimizer parameters to wandb
+    accelerator.init_trackers(project_name="Andromeda")
+    #wandb
+    wandb.init(project="Andromeda", config=args)
+    #init tensorboard writer
+    tb_writer = SummaryWriter()
+    # A single iterator over the dataloader; ``next`` raises StopIteration if
+    # it is exhausted before ``args.max_steps`` batches.
+    train_loader = iter(train_dataloader)
+    # ``epoch_loss`` accumulates since the last checkpoint; ``total_loss`` holds earlier windows.
+    epoch_loss = 0
+    total_loss = 0
+    start_time = time.time()
+    with Progress() as progress:
+        task = progress.add_task("[red]Training...", total=args.max_steps)
+        for step in range(0, args.max_steps):
+            batch_start = time.time()
+            batch = next(train_loader)
+            outputs = fsdp_model(**batch, self_attn_padding_mask=batch["attention_mask"])
+            # Shift so that tokens < n predict n
+            # NOTE(review): keeps position 0 and positions 67 onward; the origin of 67 is not visible here.
+            outputs = torch.cat([outputs[:, :1], outputs[:, 67:]], dim=1).contiguous()
+            # shift_logits = outputs[..., :-1, :].contiguous()
+            # shift_labels = batch["labels"][..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            # NOTE(review): 32002 classes here versus num_tokens=64007 on the module-level model -- confirm.
+            one_hot_labels = torch.nn.functional.one_hot(batch["labels"][:, 1:], num_classes=32002).float()
+            loss = loss_fct(outputs[:,:-1], one_hot_labels)
+            epoch_loss += loss.detach().float()
+            accelerator.backward(loss)
+            optimizer.step()
+            optimizer.zero_grad()
+            batch_end = time.time()
+            logs = {
+                "loss": loss.item(),
+                "perplexity": torch.exp(loss).item(),
+                "lr": lr_scheduler.get_last_lr()[0],
+                "examples": args.batch_size * (step + 1),
+                "examples_per_second": args.batch_size / (batch_end - batch_start),
+            }
+            # Log on the last step of every ``log_every`` window.
+            if step % args.log_every == args.log_every - 1:
+                #log metrics to wandb
+                wandb.log(logs, step=step)
+                # Log metrics to TensorBoard
+                tb_writer.add_scalar("loss", logs["loss"], step)
+                tb_writer.add_scalar("perplexity", logs["perplexity"], step)
+                tb_writer.add_scalar("lr", logs["lr"], step)
+                tb_writer.add_scalar("examples", logs["examples"], step)
+                tb_writer.add_scalar("examples_per_second", logs["examples_per_second"], step)
+                #accelerator
+                accelerator.log(logs, step=step)
+                # NOTE(review): the means divide by ``step``, which is 0 on the
+                # first iteration when ``log_every`` is 1 -- confirm the divisor.
+                progress.update(task, advance=1, description=f"Step Loss: {loss.item():.5f} "
+                                                             f"| Mean Loss: {(total_loss + epoch_loss) / step:.5f} "
+                                                             f"| Mean PPL: {torch.exp((total_loss + epoch_loss) / step):.2f} "
+                                                             f"| Examples: {args.batch_size * (step + 1)} "
+                                                             f"| Examples/s: {args.batch_size / (batch_end - batch_start):.2f} "
+                                                             f"| Elapsed: {time.strftime('%H:%M:%S', time.gmtime(time.time() - start_time))}")
+            # Checkpoint on the last step of every ``save_every`` window.
+            if step % args.save_every == args.save_every - 1:
+                train_epoch_loss = epoch_loss / args.save_every
+                total_loss += epoch_loss
+                epoch_loss = 0
+                accelerator.log({
+                    "train_ppl": torch.exp(train_epoch_loss),
+                    "train_epoch_loss": train_epoch_loss,
+                }, step=step)
+                progress.print(f"Saving checkpoint at step {step}...")
+                accelerator.save_state(
+                    f"{args.checkpoint_dir}/checkpoint_at_step_{step}/")
+                #save the model weights to s3
+                save_model_to_s3(model, "kosmostraining", "kosmosv1/checkpoints", step)
+                # NOTE(review): this formats the function object itself, not an upload location.
+                print(f"Saved to s3: {save_model_to_s3} ")
+        #finish tensorboard writer
+        tb_writer.close()
+        #finish wandb run
+        wandb.finish()
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--checkpoint_dir", type=str, default="checkpoints")
+    parser.add_argument("--learning_rate", type=float, default=1e-5)
+    parser.add_argument("--weight_decay", type=float, default=0.01)
+    parser.add_argument("--warmup_steps", type=int, default=0)
+    parser.add_argument("--max_steps", type=int, default=100000)
+    parser.add_argument("--batch_size", type=int, default=4)
+    parser.add_argument("--log_every", type=int, default=1)
+    parser.add_argument("--save_every", type=int, default=100)
+    parser.add_argument("--seed", type=int, default=None)
+    parser.add_argument("--use_ddp", action="store_true", help="Use DistributedDataParallel")
+    args = parser.parse_args()
+    train(args)

Andromeda/old/training_1.py ADDED Viewed

	@@ -0,0 +1,350 @@

+import math
+import multiprocessing
+import os
+from datetime import timedelta
+from functools import partial
+from itertools import chain
+from accelerate import Accelerator
+from accelerate.utils import InitProcessGroupKwargs
+from datasets import concatenate_datasets, load_dataset
+from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+    CheckpointImpl, apply_activation_checkpointing, checkpoint_wrapper)
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import (AutoTokenizer, default_data_collator,
+                          get_cosine_schedule_with_warmup,
+                          get_linear_schedule_with_warmup, set_seed)
+# from stable_adamw import StableAdamWUnfused
+# sd
+from optimus_prime import Transformer, Decoder, AutoregressiveWrapper
+from optimus_prime import AndromedaEmbedding
+from lion_pytorch import Lion
+# constants
+class CFG:
+    # Training configuration constants, read as ``CFG.<NAME>`` by the functions below.
+    BATCH_SIZE: int = 3 # 3
+    # Number of batches accumulated per optimizer update.
+    GRADIENT_ACCUMULATE_EVERY: int = 1
+    SEED: int = 42
+    LEARNING_RATE: float = 1e-4
+    WEIGHT_DECAY: float = 1e-2
+    # Block size used when grouping tokenized text into fixed-length chunks.
+    SEQ_LEN: int = 8192 # 8192
+    # Worker-process count for ``datasets.map``; evaluated once at import time.
+    NUM_CPU: int = multiprocessing.cpu_count()
+    # When True, load the pre-tokenized C4 shards instead of tokenizing openwebtext.
+    USE_PRETOKENIZED: bool = True
+    USE_ACTIVATION_CHECKPOINTING: bool = True
+    # Checkpoint path to resume from; ``None`` starts from scratch.
+    RESUME_FROM_CHECKPOINT: str = None
+    CHECKPOINTING_STEPS: int = 1000
+    OUTPUT_DIR: str = "output"
+    ENTITY_NAME: str = "wanb" # Put your wandb username here
+# deepspeed_plugin = DeepSpeedPlugin(zero_stage=2, gradient_accumulation_steps=CFG.GRADIENT_ACCUMULATE_EVERY)
+# helpers
+def print_num_params(model, accelerator: Accelerator):
+    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    accelerator.print(f"Number of parameters in model: {n_params}")
+def fsdp_activation_checkpointing(
+    model, accelerator: Accelerator, offload_to_cpu=False
+):
+    accelerator.print("Using FSDP activation checkpointing")
+    # check_fn = lambda submodule: isinstance(submodule, ParallelTransformerBlock)
+    non_reentrant_wrapper = partial(
+        checkpoint_wrapper,
+        offload_to_cpu=offload_to_cpu,
+        checkpoint_impl=CheckpointImpl.NO_REENTRANT,
+    )
+    apply_activation_checkpointing(
+        model, checkpoint_wrapper_fn=non_reentrant_wrapper)
+def get_lr_scheduler_with_warmup(
+    optimizer, scheduler_type, num_warmup_steps, max_train_steps, grad_accumulate_every
+):
+    NUM_WARMUP_STEPS = num_warmup_steps
+    GRADIENT_ACCUMULATE_EVERY = grad_accumulate_every
+    if scheduler_type == "linear":
+        return get_linear_schedule_with_warmup(
+            optimizer=optimizer,
+            num_warmup_steps=NUM_WARMUP_STEPS * GRADIENT_ACCUMULATE_EVERY,
+            num_training_steps=max_train_steps * GRADIENT_ACCUMULATE_EVERY
+        )
+    elif scheduler_type == "cosine":
+        return get_cosine_schedule_with_warmup(
+            optimizer=optimizer,
+            num_warmup_steps=NUM_WARMUP_STEPS * GRADIENT_ACCUMULATE_EVERY,
+            num_training_steps=max_train_steps * GRADIENT_ACCUMULATE_EVERY
+        )
+    else:
+        raise ValueError(
+            "Invalid scheduler_type. Expected 'linear' or 'cosine', got: {}".format(
+                scheduler_type
+            )
+        )
+def build_dataloaders():
+    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+    dataset = load_dataset("openwebtext", split="train")
+    tokenized_dataset = dataset.map(
+        lambda example: tokenizer([t + tokenizer.eos_token for t in example["text"]]),
+        batched=True,
+        num_proc=CFG.NUM_CPU,
+        remove_columns=["text"],
+    )
+    block_size = CFG.SEQ_LEN
+    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
+    def group_texts(examples):
+        # Concatenate all texts.
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+        total_length = len(concatenated_examples[list(examples.keys())[0]])
+        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+        # customize this part to your needs.
+        if total_length >= block_size:
+            total_length = (total_length // block_size) * block_size
+        # Split by chunks of max_len.
+        result = {
+            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+            for k, t in concatenated_examples.items()
+        }
+        return result
+    train_dataset = tokenized_dataset.map(
+        group_texts, batched=True, num_proc=CFG.NUM_CPU,
+    )
+    return train_dataset
+# main
+def TrainAndromeda():
+    # accelerator
+    timeout = InitProcessGroupKwargs(timeout=timedelta(seconds=1_000_000))
+    accelerator = Accelerator(
+        gradient_accumulation_steps=CFG.GRADIENT_ACCUMULATE_EVERY,
+        mixed_precision="fp16",
+        log_with="wandb",
+        kwargs_handlers=[timeout],
+        deepspeed_plugin=deepspeed_plugin
+    )
+    accelerator.init_trackers(
+        project_name="andromeda",
+        config={
+            "batch_size": CFG.BATCH_SIZE,
+            "gradient_accumulate_every": CFG.GRADIENT_ACCUMULATE_EVERY,
+            "learning_rate": CFG.LEARNING_RATE,
+            "seq_len": CFG.SEQ_LEN,
+        },
+        init_kwargs={"wandb": {"entity": CFG.ENTITY_NAME}}
+    )
+    accelerator.print(f"Total GPUS: {accelerator.num_processes}")
+    # set seed
+    set_seed(CFG.SEED)
+    # Create the tokenizer
+    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+    # instantiate andromeda
+    model = Transformer(
+        num_tokens=64007,
+        max_seq_len=8192,
+        use_abs_pos_emb=False,
+        tokenizer=tokenizer, # !
+        embedding_provider=AndromedaEmbedding(),
+        attn_layers = Decoder(
+            dim=128, # 2048
+            depth=8, # 16
+            dim_head=128,
+            heads=8,
+            alibi_pos_bias=True,
+            alibi_num_heads=4,
+            rotary_xpos=True,
+            attn_flash = True,
+            deepnorm=True,
+            shift_tokens=1,
+            attn_one_kv_head = True,
+            qk_norm=True,
+            attn_qk_norm=True,
+            attn_qk_norm_dim_scale=True # set this to True, in addition to `attn_qk_norm = True`
+        )
+    ).to(accelerator.device)
+    model = AutoregressiveWrapper(model).to(accelerator.device)
+    optim = Lion(model.parameters(), lr=1e-4, weight_decay=1e-2, use_triton=True)
+    print_num_params(model, accelerator)
+    if CFG.USE_ACTIVATION_CHECKPOINTING:
+        fsdp_activation_checkpointing(model, accelerator)
+    # dataloaders
+    if CFG.USE_PRETOKENIZED:
+        d0 = load_dataset("conceptofmind/c4_0-to-20_neox_with_eos_8k", split="train")
+        d1 = load_dataset("conceptofmind/c4_21-to-40_neox_with_eos_8k", split="train")
+        d2 = load_dataset("conceptofmind/c4_41-to-60_neox_with_eos_8k", split="train")
+        d3 = load_dataset("conceptofmind/c4_61-to-80_neox_with_eos_8k", split="train")
+        d4 = load_dataset("conceptofmind/c4_81-to-100_neox_with_eos_8k", split="train")
+        train_dataset = concatenate_datasets([d0, d1, d2, d3, d4])
+    else:
+        train_dataset = build_dataloaders()
+    train_loader = DataLoader(
+        train_dataset, batch_size=CFG.BATCH_SIZE, collate_fn=default_data_collator,
+    )
+    max_train_steps = math.ceil(len(train_loader) / CFG.GRADIENT_ACCUMULATE_EVERY)
+    accelerator.print(f"Max train steps: {max_train_steps}")
+    # lr scheduler
+    # We cant decide on an actual number
+    NUM_WARMUP_STEPS = int(max_train_steps * 0.01)
+    accelerator.print(f"Num warmup steps: {NUM_WARMUP_STEPS}")
+    lr_scheduler = get_lr_scheduler_with_warmup(
+        optimizer=optim,
+        scheduler_type="cosine",
+        num_warmup_steps=NUM_WARMUP_STEPS,
+        max_train_steps=max_train_steps,
+        grad_accumulate_every=CFG.GRADIENT_ACCUMULATE_EVERY
+    )
+    # prepare
+    model, optim, train_loader, lr_scheduler = accelerator.prepare(
+        model, optim, train_loader, lr_scheduler
+    )
+    # checkpoint scheduler
+    accelerator.register_for_checkpointing(lr_scheduler)
+    # I do not know why Huggingface recommends recalculation of max_train_steps
+    max_train_steps = math.ceil(len(train_loader) / CFG.GRADIENT_ACCUMULATE_EVERY)
+    accelerator.print(f"Max train steps recalculated: {max_train_steps}")
+    # Total batch size for logging
+    total_batch_size = (
+        CFG.BATCH_SIZE * accelerator.num_processes * CFG.GRADIENT_ACCUMULATE_EVERY
+    )
+    accelerator.print(f"Total batch size: {total_batch_size}")
+    # resume training
+    progress_bar = tqdm(
+        range(max_train_steps), disable=not accelerator.is_local_main_process
+    )
+    completed_steps = 0
+    if CFG.RESUME_FROM_CHECKPOINT:
+        if CFG.RESUME_FROM_CHECKPOINT is not None or CFG.RESUME_FROM_CHECKPOINT != "":
+            accelerator.print(f"Resuming from checkpoint {CFG.RESUME_FROM_CHECKPOINT}")
+            accelerator.load_state(CFG.RESUME_FROM_CHECKPOINT)
+            path = os.path.basename(CFG.RESUME_FROM_CHECKPOINT)
+        training_difference = os.path.splitext(path)[0]
+        # need to multiply `gradient_accumulation_steps` to reflect real steps
+        resume_step = (
+            int(training_difference.replace("step_", ""))
+            * CFG.GRADIENT_ACCUMULATE_EVERY
+        )
+    if CFG.RESUME_FROM_CHECKPOINT and resume_step is not None:
+        train_loader = accelerator.skip_first_batches(train_loader, resume_step)
+        completed_steps += resume_step
+        progress_bar.update(resume_step)
+    # training
+    model.train()
+    for step, batch in enumerate(train_loader):
+        with accelerator.accumulate(model):
+            inputs = batch["input_ids"].to(accelerator.device)
+            _, loss = model(inputs, return_loss=True)
+            accelerator.backward(loss)
+            # print(loss.item())
+            accelerator.log({"loss": loss.item()}, step=step)
+            if accelerator.sync_gradients:
+                accelerator.clip_grad_norm_(model.parameters(), 0.5)
+            optim.step()
+            lr_scheduler.step()
+            optim.zero_grad()
+        if accelerator.sync_gradients:
+            progress_bar.update(1)
+            completed_steps += 1
+        if isinstance(CFG.CHECKPOINTING_STEPS, int):
+            if completed_steps % CFG.CHECKPOINTING_STEPS == 0:
+                output_dir = f"step_{completed_steps }"
+                if CFG.OUTPUT_DIR is not None:
+                    output_dir = os.path.join(CFG.OUTPUT_DIR, output_dir)
+                accelerator.save_state(output_dir)
+        if completed_steps >= max_train_steps:
+            break
+    # end training
+    accelerator.print("Training Finished")
+    accelerator.end_training()
+    # save final model
+    # accelerator.print(f"Saving model to {CFG.OUTPUT_DIR}")
+    if CFG.OUTPUT_DIR is not None:
+        base_path = f'{CFG.OUTPUT_DIR}/final'
+        if not os.path.exists(base_path):
+            os.makedirs(base_path)
+        accelerator.wait_for_everyone()
+        unwrapped_model = accelerator.unwrap_model(model)
+        with accelerator.main_process_first():
+            accelerator.save(
+                unwrapped_model.state_dict(), os.path.join(base_path, 'final_model.pt')
+            )
+# Script entry point: start a training run only when this file is executed directly.
+if __name__ == "__main__":
+    TrainAndromeda()

Andromeda/old/training_sophia.py ADDED Viewed

	@@ -0,0 +1,369 @@

+import math
+import multiprocessing
+import os
+from datetime import timedelta
+from functools import partial
+from itertools import chain
+from accelerate import Accelerator
+from accelerate.utils import InitProcessGroupKwargs
+from datasets import concatenate_datasets, load_dataset
+from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+    CheckpointImpl, apply_activation_checkpointing, checkpoint_wrapper)
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import (AutoTokenizer, default_data_collator,
+                          get_cosine_schedule_with_warmup,
+                          get_linear_schedule_with_warmup, set_seed)
+# from stable_adamw import StableAdamWUnfused
+# sd
+from optimus_prime import Transformer, Decoder, AutoregressiveWrapper
+from optimus_prime import AndromedaEmbedding
+from sophia import SophiaG
+# constants
+class CFG:
+    """Hard-coded hyperparameters and run settings read by this training script."""
+    # Samples per DataLoader batch (per process).
+    BATCH_SIZE: int = 3 # 3
+    # Passed to Accelerator as gradient_accumulation_steps and used to scale step counts.
+    GRADIENT_ACCUMULATE_EVERY: int = 1
+    # Seed handed to transformers.set_seed.
+    SEED: int = 42
+    # Logged to the tracker config only; the optimizer below is built with its own literal lr.
+    LEARNING_RATE: float = 1e-4
+    # Referenced only by commented-out optimizer code in this file.
+    WEIGHT_DECAY: float = 1e-2
+    # Block size used when packing tokens in build_dataloaders; also logged to the tracker.
+    SEQ_LEN: int = 8192 # 8192
+    # Worker count for datasets.map(num_proc=...).
+    NUM_CPU: int = multiprocessing.cpu_count()
+    # True: load the pre-tokenized C4 shards; False: tokenize via build_dataloaders().
+    USE_PRETOKENIZED: bool = True
+    # True: apply activation checkpointing to the model before training.
+    USE_ACTIVATION_CHECKPOINTING: bool = True
+    # Checkpoint path to resume from; None (or empty) starts fresh. The basename is
+    # parsed as "step_<N>" to recover the step count.
+    RESUME_FROM_CHECKPOINT: str = None
+    # Save accelerator state every this many completed steps.
+    CHECKPOINTING_STEPS: int = 1000
+    # Directory for step checkpoints and the final model.
+    OUTPUT_DIR: str = "output"
+    # wandb entity passed to init_trackers.
+    ENTITY_NAME: str = "nicolo" # Put your wandb username here
+# helpers
+def print_num_params(model, accelerator: Accelerator):
+    """Report the number of trainable parameters in `model` through `accelerator.print`."""
+    trainable_sizes = [param.numel() for param in model.parameters() if param.requires_grad]
+    n_params = sum(trainable_sizes)
+    accelerator.print(f"Number of parameters in model: {n_params}")
+def fsdp_activation_checkpointing(
+    model, accelerator: Accelerator, offload_to_cpu=False
+):
+    """Apply non-reentrant activation checkpointing to `model`.
+    No `check_fn` is passed, so the library's default choice of submodules applies.
+    Args:
+        model: Module handed to `apply_activation_checkpointing`.
+        accelerator: Used only to print a status message.
+        offload_to_cpu: Forwarded to `checkpoint_wrapper`.
+    """
+    accelerator.print("Using FSDP activation checkpointing")
+    # check_fn = lambda submodule: isinstance(submodule, ParallelTransformerBlock)
+    wrapper_options = dict(
+        offload_to_cpu=offload_to_cpu,
+        checkpoint_impl=CheckpointImpl.NO_REENTRANT,
+    )
+    apply_activation_checkpointing(
+        model,
+        checkpoint_wrapper_fn=partial(checkpoint_wrapper, **wrapper_options),
+    )
+def get_lr_scheduler_with_warmup(
+    optimizer, scheduler_type, num_warmup_steps, max_train_steps, grad_accumulate_every
+):
+    """Build a warmup learning-rate schedule of the requested type.
+    Args:
+        optimizer: Optimizer the schedule is attached to.
+        scheduler_type: Either "linear" or "cosine".
+        num_warmup_steps: Warmup length before scaling by `grad_accumulate_every`.
+        max_train_steps: Training length before scaling by `grad_accumulate_every`.
+        grad_accumulate_every: Factor both step counts are multiplied by.
+    Returns:
+        The scheduler produced by the matching `transformers` factory.
+    Raises:
+        ValueError: If `scheduler_type` is neither "linear" nor "cosine".
+    """
+    if scheduler_type not in ("linear", "cosine"):
+        raise ValueError(
+            "Invalid scheduler_type. Expected 'linear' or 'cosine', got: {}".format(
+                scheduler_type
+            )
+        )
+    warmup_steps = num_warmup_steps * grad_accumulate_every
+    total_steps = max_train_steps * grad_accumulate_every
+    # Resolve only the factory that is actually needed.
+    if scheduler_type == "linear":
+        build_schedule = get_linear_schedule_with_warmup
+    else:
+        build_schedule = get_cosine_schedule_with_warmup
+    return build_schedule(
+        optimizer=optimizer,
+        num_warmup_steps=warmup_steps,
+        num_training_steps=total_steps
+    )
+def build_dataloaders():
+    """Tokenize the sentiment140 text column and pack it into fixed-length blocks.
+    Returns:
+        The tokenized dataset regrouped into chunks of `CFG.SEQ_LEN` tokens.
+        NOTE(review): despite the name, this returns a dataset, not a DataLoader.
+    """
+    text_column = 'text'
+    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+    raw_dataset = load_dataset("sentiment140", split="train")
+    unused_columns = [name for name in raw_dataset.column_names if name != text_column]
+    raw_dataset = raw_dataset.remove_columns(unused_columns)
+    def tokenize_batch(batch):
+        # Append the EOS token to every text before tokenizing.
+        return tokenizer([text + tokenizer.eos_token for text in batch[text_column]])
+    tokenized_dataset = raw_dataset.map(
+        tokenize_batch,
+        batched=True,
+        num_proc=CFG.NUM_CPU,
+        remove_columns=[text_column]
+    )
+    block_size = CFG.SEQ_LEN
+    def group_texts(examples):
+        """Concatenate every column of the batch and slice it into `block_size` chunks."""
+        flattened = {key: list(chain(*examples[key])) for key in examples.keys()}
+        total_length = len(flattened[list(examples.keys())[0]])
+        # Drop the trailing remainder that does not fill a whole block.
+        if total_length >= block_size:
+            total_length = (total_length // block_size) * block_size
+        chunked = {}
+        for key, tokens in flattened.items():
+            chunked[key] = [
+                tokens[start : start + block_size]
+                for start in range(0, total_length, block_size)
+            ]
+        return chunked
+    return tokenized_dataset.map(
+        group_texts, batched=True, num_proc=CFG.NUM_CPU
+    )
+# main
+def TrainAndromeda():
+    """Train the Andromeda decoder with the SophiaG optimizer under Hugging Face Accelerate.
+    Sets up the accelerator and wandb tracker, builds the model, optimizer, dataset and
+    cosine warmup schedule, optionally resumes from `CFG.RESUME_FROM_CHECKPOINT`, runs one
+    pass over the training data with periodic `accelerator.save_state` checkpoints, and
+    finally writes the unwrapped model's state dict to `<CFG.OUTPUT_DIR>/final/final_model.pt`.
+    All settings come from `CFG`; nothing is returned.
+    """
+    # accelerator
+    timeout = InitProcessGroupKwargs(timeout=timedelta(seconds=1_000_000))
+    accelerator = Accelerator(
+        gradient_accumulation_steps=CFG.GRADIENT_ACCUMULATE_EVERY,
+        mixed_precision="fp16", # Switch to bf16
+        log_with="wandb",
+        kwargs_handlers=[timeout]
+    )
+    # NOTE(review): the logged learning_rate is CFG.LEARNING_RATE, but the optimizer
+    # below is created with a literal lr; confirm which value is intended.
+    accelerator.init_trackers(
+        project_name="andromeda",
+        config={
+            "batch_size": CFG.BATCH_SIZE,
+            "gradient_accumulate_every": CFG.GRADIENT_ACCUMULATE_EVERY,
+            "learning_rate": CFG.LEARNING_RATE,
+            "seq_len": CFG.SEQ_LEN,
+        },
+        init_kwargs={"wandb": {"entity": CFG.ENTITY_NAME}}
+    )
+    accelerator.print(f"Total GPUS: {accelerator.num_processes}")
+    # set seed
+    set_seed(CFG.SEED)
+    # Create the tokenizer
+    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+    # instantiate andromeda
+    model = Transformer(
+        num_tokens=64007,
+        max_seq_len=8192,
+        use_abs_pos_emb=False,
+        tokenizer=tokenizer, # !
+        embedding_provider=AndromedaEmbedding(),
+        attn_layers = Decoder(
+            dim=128, # 2048
+            depth=8, # 16
+            dim_head=128,
+            heads=8,
+            alibi_pos_bias=True,
+            alibi_num_heads=4,
+            rotary_xpos=True,
+            attn_flash = True,
+            # deepnorm=True,
+            shift_tokens=1,
+            attn_one_kv_head = True,
+            qk_norm=True,
+            attn_qk_norm=True,
+            attn_qk_norm_dim_scale=True # set this to True, in addition to `attn_qk_norm = True`
+        )
+    ).to(accelerator.device)
+    model = AutoregressiveWrapper(model).to(accelerator.device)
+    optim = SophiaG(model.parameters(), lr=1e-5, weight_decay=1e-1)
+    print_num_params(model, accelerator)
+    if CFG.USE_ACTIVATION_CHECKPOINTING:
+        fsdp_activation_checkpointing(model, accelerator)
+    # dataloaders
+    if CFG.USE_PRETOKENIZED:
+        d0 = load_dataset("conceptofmind/c4_0-to-20_neox_with_eos_8k", split="train")
+        d1 = load_dataset("conceptofmind/c4_21-to-40_neox_with_eos_8k", split="train")
+        d2 = load_dataset("conceptofmind/c4_41-to-60_neox_with_eos_8k", split="train")
+        d3 = load_dataset("conceptofmind/c4_61-to-80_neox_with_eos_8k", split="train")
+        d4 = load_dataset("conceptofmind/c4_81-to-100_neox_with_eos_8k", split="train")
+        train_dataset = concatenate_datasets([d0, d1, d2, d3, d4])
+    else:
+        train_dataset = build_dataloaders()
+    train_loader = DataLoader(
+        train_dataset, batch_size=CFG.BATCH_SIZE, collate_fn=default_data_collator,
+    )
+    # Determine number of training steps
+    max_train_steps = math.ceil(len(train_loader) / CFG.GRADIENT_ACCUMULATE_EVERY)
+    accelerator.print(f"Max train steps: {max_train_steps}")
+    # lr scheduler: warm up over the first 1% of the steps
+    NUM_WARMUP_STEPS = int(max_train_steps * 0.01)
+    accelerator.print(f"Num warmup steps: {NUM_WARMUP_STEPS}")
+    lr_scheduler = get_lr_scheduler_with_warmup(
+        optimizer=optim,
+        scheduler_type="cosine",
+        num_warmup_steps=NUM_WARMUP_STEPS,
+        max_train_steps=max_train_steps,
+        grad_accumulate_every=CFG.GRADIENT_ACCUMULATE_EVERY
+    )
+    # prepare
+    model, optim, train_loader, lr_scheduler = accelerator.prepare(
+        model, optim, train_loader, lr_scheduler
+    )
+    # checkpoint scheduler
+    accelerator.register_for_checkpointing(lr_scheduler)
+    # Recomputed after prepare(), as the Hugging Face examples do: the prepared
+    # loader's length may differ from the original one.
+    max_train_steps = math.ceil(len(train_loader) / CFG.GRADIENT_ACCUMULATE_EVERY)
+    accelerator.print(f"Max train steps recalculated: {max_train_steps}")
+    # Total batch size for logging
+    total_batch_size = (
+        CFG.BATCH_SIZE * accelerator.num_processes * CFG.GRADIENT_ACCUMULATE_EVERY
+    )
+    accelerator.print(f"Total batch size: {total_batch_size}")
+    # resume training
+    progress_bar = tqdm(
+        range(max_train_steps), disable=not accelerator.is_local_main_process
+    )
+    completed_steps = 0
+    if CFG.RESUME_FROM_CHECKPOINT:
+        accelerator.print(f"Resuming from checkpoint {CFG.RESUME_FROM_CHECKPOINT}")
+        accelerator.load_state(CFG.RESUME_FROM_CHECKPOINT)
+        # normpath drops a trailing separator so the basename is the "step_<N>" folder.
+        path = os.path.basename(os.path.normpath(CFG.RESUME_FROM_CHECKPOINT))
+        training_difference = os.path.splitext(path)[0]
+        # Checkpoints are named after completed optimizer steps.
+        checkpoint_step = int(training_difference.replace("step_", ""))
+        # need to multiply `gradient_accumulation_steps` to reflect real steps
+        resume_step = checkpoint_step * CFG.GRADIENT_ACCUMULATE_EVERY
+        train_loader = accelerator.skip_first_batches(train_loader, resume_step)
+        # completed_steps and the progress bar count optimizer steps, not micro-batches.
+        completed_steps += checkpoint_step
+        progress_bar.update(checkpoint_step)
+    # training
+    model.train()
+    for step, batch in enumerate(train_loader):
+        with accelerator.accumulate(model):
+            inputs = batch["input_ids"].to(accelerator.device)
+            _, loss = model(inputs, return_loss=True)
+            accelerator.backward(loss)
+            accelerator.log({"loss": loss.item()}, step=step)
+            if accelerator.sync_gradients:
+                accelerator.clip_grad_norm_(model.parameters(), 0.5)
+            optim.step()
+            lr_scheduler.step()
+            optim.zero_grad()
+        if accelerator.sync_gradients:
+            progress_bar.update(1)
+            completed_steps += 1
+            # Checkpoint only right after an optimizer step, so the same step is not
+            # saved once per micro-batch (or at step 0) while gradients accumulate.
+            if (
+                isinstance(CFG.CHECKPOINTING_STEPS, int)
+                and completed_steps % CFG.CHECKPOINTING_STEPS == 0
+            ):
+                output_dir = f"step_{completed_steps }"
+                if CFG.OUTPUT_DIR is not None:
+                    output_dir = os.path.join(CFG.OUTPUT_DIR, output_dir)
+                accelerator.save_state(output_dir)
+        if completed_steps >= max_train_steps:
+            break
+    # end training
+    accelerator.print("Training Finished")
+    accelerator.end_training()
+    # save final model
+    if CFG.OUTPUT_DIR is not None:
+        base_path = f'{CFG.OUTPUT_DIR}/final'
+        # exist_ok avoids a race when several processes create the directory at once.
+        os.makedirs(base_path, exist_ok=True)
+        accelerator.wait_for_everyone()
+        unwrapped_model = accelerator.unwrap_model(model)
+        with accelerator.main_process_first():
+            accelerator.save(
+                unwrapped_model.state_dict(), os.path.join(base_path, 'final_model.pt')
+            )
+# Script entry point: start a training run only when this file is executed directly.
+if __name__ == "__main__":
+    TrainAndromeda()

Andromeda/train.py CHANGED Viewed

@@ -6,50 +6,45 @@ from functools import partial
 from itertools import chain
 import torch
-# import bitsandbytes as bnb
-from torch.distributed.fsdp import (
-    FullyShardedDataParallel,
-    MixedPrecision,
-    BackwardPrefetch,
-    ShardingStrategy,
-)
 from accelerate import Accelerator
-from accelerate.utils import (DummyOptim, InitProcessGroupKwargs)
 from accelerate.logging import get_logger
 from datasets import load_dataset
 from lion_pytorch import Lion
-from torch.nn import LayerNorm
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
-    CheckpointImpl, apply_activation_checkpointing, checkpoint_wrapper)
-from torch.distributed.fsdp.wrap import (
-    transformer_auto_wrap_policy
 )
 from torch.optim import AdamW
 from torch.utils.data import DataLoader
 from tqdm import tqdm
-from transformers import (AutoTokenizer, default_data_collator,
-                          get_cosine_schedule_with_warmup,
-                          get_linear_schedule_with_warmup, set_seed)
-from Andromeda.utils.stable_adamw import StableAdamWUnfused
-from Andromeda.core.transformer import Transformer, AndromedaEmbedding
 # from Andromeda.model import Andromeda
-from Andromeda.model import AndromedaEmbedding #, Andromeda
 from Andromeda.configs import Andromeda1Billion
-########### SETUP CONFIG
-import torch.distributed as dist
-from accelerate.state import AcceleratorState
 # state = AcceleratorState()
@@ -686,7 +681,7 @@ def Train():
             )
-def main():
     os.environ['MASTER_ADDR'] #'localhost'
     os.environ['MASTER_PORT'] #= '9994'
@@ -702,4 +697,4 @@ def main():
     Train()
 if __name__ == '__main__':
-    main()

 from itertools import chain
 import torch
+########### SETUP CONFIG
+import torch.distributed as dist
 from accelerate import Accelerator
 from accelerate.logging import get_logger
+from accelerate.state import AcceleratorState
+from accelerate.utils import DummyOptim, InitProcessGroupKwargs
 from datasets import load_dataset
 from lion_pytorch import Lion
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+    CheckpointImpl,
+    apply_activation_checkpointing,
+    checkpoint_wrapper,
 )
+# import bitsandbytes as bnb
+from torch.distributed.fsdp import (
+    BackwardPrefetch,
+    FullyShardedDataParallel,
+    MixedPrecision,
+    ShardingStrategy,
+)
+from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+from torch.nn import LayerNorm
 from torch.optim import AdamW
 from torch.utils.data import DataLoader
 from tqdm import tqdm
+from transformers import (
+    AutoTokenizer,
+    default_data_collator,
+    get_cosine_schedule_with_warmup,
+    get_linear_schedule_with_warmup,
+    set_seed,
+)
 # from Andromeda.model import Andromeda
 from Andromeda.configs import Andromeda1Billion
+from Andromeda.core.transformer import Transformer
+from Andromeda.utils.stable_adamw import StableAdamWUnfused
 # state = AcceleratorState()
             )
+def train():
     os.environ['MASTER_ADDR'] #'localhost'
     os.environ['MASTER_PORT'] #= '9994'
     Train()
 if __name__ == '__main__':
+    train()

Andromeda/utils/__init__.py ADDED Viewed

File without changes

Andromeda/utils/decoupled_optimizer.py ADDED Viewed

	@@ -0,0 +1,147 @@

+import torch
+# from palm_rlhf_pytorch.palm import LayerNorm
+from torch.nn import LayerNorm
+from torch.optim import AdamW
+# from palm.utils import print_main
+from Andromeda.utils.helpers import print_main
+from Andromeda.utils.stable_adamw import StableAdamWUnfused
+# optimizers
+def decoupled_optimizer(
+    model: torch.nn.Module,
+    learning_rate: float,
+    weight_decay: float = 0.1,
+    beta_1: float = 0.90,
+    beta_2: float = 0.95,
+    optimizer_type: str = "adamw",
+    use_fsdp: bool = True,
+) -> torch.optim.Optimizer:
+    """
+    Build an optimizer with separate weight-decay and no-weight-decay parameter groups.
+    Weights of `torch.nn.Linear` modules are decayed. Parameters owned directly by
+    `torch.nn.LayerNorm` and `torch.nn.Embedding` modules are not. The output projection
+    `to_logits.weight` is left out of the decay group.
+    NOTE(review): parameters matching neither rule (for example Linear biases, or an
+    untied `to_logits.weight`) are in no group and are therefore not optimized; confirm
+    this is intended for the model being trained.
+    Args:
+        model (Module): The model whose parameters are optimized.
+        learning_rate (float): The learning rate for the optimizer.
+        weight_decay (float): The weight decay applied to the decay group.
+        beta_1 (float): The exponential decay rate for the 1st moment estimates.
+        beta_2 (float): The exponential decay rate for the 2nd moment estimates.
+        optimizer_type (str): The type of the optimizer: 'adamw' or 'stable_adamw'.
+        use_fsdp (bool, optional): If True, `to_logits.weight` is looked up under the
+            `_fsdp_wrapped_module.` name prefix. Defaults to True.
+    Returns:
+        Optimizer: The initialized optimizer.
+    Raises:
+        KeyError: If a collected parameter name is missing from `model.named_parameters()`.
+        ValueError: If the optimizer type is not 'adamw' or 'stable_adamw'.
+    """
+    print_main(f"Using {optimizer_type} optimizer")
+    # Map every parameter name to its tensor so groups can be built from names.
+    param_dict = {}
+    for param_name, param in model.named_parameters():
+        print_main(param_name)
+        param_dict[param_name] = param
+    if use_fsdp:
+        exclude_param = "_fsdp_wrapped_module.to_logits.weight"
+    else:
+        exclude_param = "to_logits.weight"
+    no_decay = []
+    decay = []
+    for module_name, module in model.named_modules():
+        if isinstance(module, (LayerNorm, torch.nn.Embedding)):
+            # Use the names the module really registers (torch's LayerNorm exposes
+            # `weight`/`bias`, not `gamma`), so the lookup below cannot miss.
+            for local_name, _ in module.named_parameters(recurse=False):
+                full_name = f"{module_name}.{local_name}" if module_name else local_name
+                no_decay.append(full_name)
+        elif isinstance(module, torch.nn.Linear):
+            decay.append(f"{module_name}.weight" if module_name else "weight")
+    # The output projection is kept out of the decay group.
+    decay_param = [param_dict[name] for name in decay if name != exclude_param]
+    no_decay_param = [param_dict[name] for name in no_decay]
+    grouped_params = [
+        {"params": decay_param, "weight_decay": weight_decay},
+        {"params": no_decay_param, "weight_decay": 0.0},
+    ]
+    if optimizer_type == "adamw":
+        optimizer = AdamW(
+            grouped_params,
+            lr=learning_rate,
+            betas=(beta_1, beta_2),
+        )
+    elif optimizer_type == "stable_adamw":
+        optimizer = StableAdamWUnfused(
+            grouped_params,
+            lr=learning_rate,
+            betas=(beta_1, beta_2),
+        )
+    else:
+        raise ValueError(
+            "Invalid optimizer_type. Expected 'adamw' or 'stable_adamw', got: {}".format(
+                optimizer_type
+            )
+        )
+    return optimizer

Andromeda/utils/helpers.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import torch.distributed as dist  # Add this line
+def print_num_params(model):
+    """Print the number of trainable parameters in `model`.
+    When a torch.distributed process group is initialised, only rank 0 prints;
+    otherwise (single process, or distributed not set up yet) the message is always printed.
+    """
+    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    # get_rank() raises without an initialised default process group, so it is only
+    # consulted once is_initialized() confirms one exists.
+    if dist.is_available() and dist.is_initialized() and dist.get_rank() != 0:
+        return
+    print(f"Number of parameters in model: {n_params}")
+def print_main(msg):
+    """Print `msg` once per job.
+    When a torch.distributed process group is initialised, only rank 0 prints;
+    otherwise (single process, or distributed not set up yet) the message is always printed.
+    """
+    # get_rank() raises without an initialised default process group, so it is only
+    # consulted once is_initialized() confirms one exists.
+    if dist.is_available() and dist.is_initialized() and dist.get_rank() != 0:
+        return
+    print(msg)

Andromeda/utils/rf_utils.py ADDED Viewed

	@@ -0,0 +1,186 @@

+import math
+import torch
+from torch import einsum, _nnpack_available
+import torch.nn.functional as F
+from torch import nn
+from einops import rearrange
+import copy
+from pathlib import PurePath
+from tqdm import tqdm_gui
+from beartype import beartype
+from beartype.typing import Tuple, Optional
+from einops import rearrange, repeat, reduce, unpack
+from einops.layers.torch import Rearrange, Reduce
+#helpers
+def exists(val):
+    """Return True when `val` is anything other than None."""
+    if val is None:
+        return False
+    return True
+#decorators
+def eval_decorator(fn):
+    """Decorate a module method so it runs in eval mode and then restores the prior mode.
+    The wrapped method's first argument must provide `training`, `eval()` and
+    `train(mode)`. The previous mode is restored even if `fn` raises.
+    """
+    from functools import wraps
+    @wraps(fn)
+    def inner(self, *args, **kwargs):
+        was_training = self.training
+        self.eval()
+        try:
+            return fn(self, *args, **kwargs)
+        finally:
+            # Runs on both the normal and the exceptional path, so a failure inside
+            # `fn` cannot leave the module stuck in eval mode.
+            self.train(was_training)
+    return inner
+def defaults(val, d):
+    """Return `val` unless it is None, in which case fall back to `d`."""
+    if exists(val):
+        return val
+    return d
+#tensor helpers
+def log(t, eps=1e-20):
+    """Elementwise natural log of `t` after clamping its values to at least `eps`."""
+    floored = t.clamp(min = eps)
+    return floored.log()
+def masked_mean(seq, mask=None, dim=1, keepdim=True):
+    """Mean of `seq` along `dim`, counting only positions where `mask` is True.
+    With no mask this is a plain mean. Rows whose mask is entirely False yield 0.
+    For a 3-D `seq` the mask is given a trailing singleton axis before use.
+    """
+    if not exists(mask):
+        # NOTE(review): this branch ignores `keepdim`, so with the default
+        # keepdim=True its output shape differs from the masked branch; confirm intended.
+        return seq.mean(dim=dim)
+    if seq.ndim == 3:
+        mask = rearrange(mask, 'b n -> b n 1')
+    kept_sum = seq.masked_fill(~mask, 0.).sum(dim=dim, keepdim=keepdim)
+    kept_count = mask.sum(dim=dim, keepdim=keepdim)
+    # The clamp guards the division; fully masked rows are zeroed just below.
+    mean = kept_sum / kept_count.clamp(min = 1e-3)
+    return mean.masked_fill(kept_count == 0, 0.)
+#sampling helpers
+def gumbel_noise(t):
+    """Return Gumbel noise with the same shape, dtype and device as `t`.
+    Draws uniform samples and maps them through -log(-log(u)), using the clamped
+    `log` helper so the mapping stays finite.
+    """
+    # `uniform_` is the in-place sampler; tensors have no `uniform` method.
+    noise = torch.zeros_like(t).uniform_(0, 1)
+    return -log(-log(noise))
+def gumbel_sample(t, temperature = 1., dim=-1):
+    """Pick indices along `dim` by argmax of temperature-scaled `t` plus Gumbel noise.
+    The temperature is floored at 1e-10 so the division stays defined.
+    """
+    safe_temperature = max(temperature, 1e-10)
+    perturbed = (t / safe_temperature) + gumbel_noise(t)
+    return perturbed.argmax(dim=dim)
+def top_p(logits, thres=0.9):
+    """Nucleus-style filter: keep the most likely logits of each row, set the rest to -inf.
+    Logits are sorted in descending order. Once the cumulative softmax probability of a
+    row exceeds `1 - thres`, every later logit is removed; the top logit is always kept.
+    Args:
+        logits: 2-D tensor (rows are filtered independently along the last axis).
+        thres: Threshold; removal starts after cumulative probability passes `1 - thres`.
+    Returns:
+        Tensor shaped like `logits`, in the original column order, with removed entries -inf.
+    """
+    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+    # Running total of probabilities in sorted order (a cumulative sum, not an einsum).
+    cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+    sorted_indices_to_remove = cum_probs > (1 - thres)
+    # Shift right by one so the logit that crosses the threshold is itself kept.
+    sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
+    sorted_indices_to_remove[:, 0] = 0
+    sorted_logits[sorted_indices_to_remove] = float("-inf")
+    # Scatter the filtered values back to their original positions.
+    return sorted_logits.scatter(1, sorted_indices, sorted_logits)
+def top_k(logits, thres=0.9):
+    """Keep the ceil((1 - thres) * width) largest logits of each row; set the rest to -inf.
+    `width` is the size of the last axis; values are scattered back along dim 1.
+    """
+    keep_fraction = 1 - thres
+    num_kept = math.ceil(keep_fraction * logits.shape[-1])
+    top_values, top_indices = torch.topk(logits, num_kept)
+    filtered = torch.full_like(logits, float('-inf'))
+    filtered.scatter_(1, top_indices, top_values)
+    return filtered
+class LoRA(nn.Module):
+    """Low-rank adapter parameters: `A` of shape (dim, r) and `B` of shape (r, dim_out).
+    `A` is drawn from a standard normal and `B` starts at zero. `scale` is alpha / r,
+    where alpha defaults to r. Only the parameters are defined in this class.
+    """
+    def __init__(
+        self,
+        dim,
+        dim_out,
+        r=8,
+        alpha=None
+    ):
+        super().__init__()
+        rank_alpha = defaults(alpha, r)
+        self.scale = rank_alpha / r
+        down_init = torch.randn(dim, r)
+        up_init = torch.zeros(r, dim_out)
+        self.A = nn.Parameter(down_init)
+        self.B = nn.Parameter(up_init)
+#reward model
+@beartype
+class RewardModel(nn.Module):
+    """Reward head on top of a private copy of a language model.
+    The wrapped model must provide `dim`, `set_dropout`, `add_finetune_params`,
+    `finetune_parameters` and `parameters`. With `use_lora`, fine-tuning parameters are
+    added under `reward_lora_scope`. The prediction head is a linear layer producing
+    either `num_binned_output` bins (when that is > 1) or a single scalar.
+    NOTE(review): `forward` is unfinished in this file (see the note at its end).
+    """
+    def __init__(
+            self,
+            model: nn.Module,
+            dropout=0.1,
+            num_binned_output = 0.,
+            use_lora = True,
+            lora_r = 8,
+            reward_lora_scope = 'reward',
+    ):
+        # `model` is annotated as nn.Module because the Andromeda class is not
+        # imported in this module; naming it in the annotation raised NameError.
+        super().__init__()
+        # Deep-copy the instance passed in so the caller's model is left untouched.
+        self.model = copy.deepcopy(model)
+        self.model.set_dropout(dropout)
+        self.reward_lora_scope = reward_lora_scope if use_lora else None
+        if exists(self.reward_lora_scope):
+            self.model.add_finetune_params(reward_lora_scope, lora_r = lora_r)
+        dim = model.dim
+        self.binned_output = num_binned_output > 1
+        # Learned embeddings that mark prompt positions and response positions.
+        self.prompt_embed = nn.Parameter(torch.zeros(1, 1, dim))
+        self.response_embed = nn.Parameter(torch.zeros(1, 1, dim))
+        if self.binned_output:
+            self.to_pred = nn.Linear(dim, num_binned_output)
+        else:
+            self.to_pred = nn.Sequential(
+                nn.Linear(dim, 1, bias=False),
+                Rearrange('... 1 -> ...')
+            )
+    def load(self, path):
+        """Load a state dict saved at `path` into this module; the file must exist."""
+        # Imported here because the module-level imports only bring in PurePath.
+        from pathlib import Path
+        path = Path(path)
+        assert path.exists()
+        self.load_state_dict(torch.load(str(path)))
+    def finetune_parameters(self):
+        """Return a tuple of the head's parameters plus the model's trainable ones.
+        With a LoRA scope only that scope's fine-tune parameters are included;
+        otherwise all parameters of the wrapped model are.
+        """
+        return (
+            *self.to_pred.parameters(),
+            *(self.model.finetune_parameters(self.reward_lora_scope) if exists(self.reward_lora_scope) else self.model.parameters())
+        )
+    def forward(
+            self,
+            x,
+            mask=None,
+            prompt_mask=None,
+            prompt_lengths=None,
+            labels=None,
+            sample=False,
+            sample_temperature=1.,
+            disable_lora=False
+    ):
+        """Run the wrapped model on token ids `x` (unfinished; see the note at the end).
+        `prompt_mask` and `prompt_lengths` are mutually exclusive ways to mark which
+        positions belong to the prompt.
+        """
+        assert not (exists(prompt_mask) and exists(prompt_lengths))
+        # derive prompt mask from prompt lengths
+        if exists(prompt_lengths):
+            batch, seq_len = x.shape
+            arange = torch.arange(seq_len, device = x.device)
+            # True for positions before each row's prompt length, i.e. the prompt tokens.
+            prompt_mask = repeat(arange, 'n -> b n', b = batch) < rearrange(prompt_lengths, 'b -> b 1')
+        # The reward model should know which section is the prompt and which is the response.
+        extra_embed = None
+        if exists(prompt_mask):
+            # Prompt positions get prompt_embed, all other positions get response_embed.
+            extra_embed = torch.where(
+                rearrange(prompt_mask, 'b n -> b n 1'),
+                self.prompt_embed,
+                self.response_embed
+            )
+        embeds = self.model(
+            x,
+        )
+        # NOTE(review): the implementation stops here. `extra_embed`, `mask`, `labels`,
+        # `sample`, `sample_temperature` and `disable_lora` are not used yet, `to_pred`
+        # is never applied, and the method returns None.

Andromeda/utils/stable_adamw.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import torch
+# This is the unfused version of StableAdamW. It is slower than the fused version (coming).
+class StableAdamWUnfused(torch.optim.Optimizer):
+    def __init__(
+        self,
+        params,
+        lr=0.002,
+        weight_decay=0.2,
+        betas=(0.9, 0.99),
+        eps=1e-8,
+        clip_thresh=1.0,
+        precision="amp_bfloat16",
+        custom_scalar=65536,
+    ):
+        """Set up per-group defaults and optimizer-wide settings.
+        `lr`, `weight_decay` and the two betas become per-group defaults; `eps`,
+        `clip_thresh` (stored as `self.d`), `precision` and `custom_scalar` (stored as
+        `self.custom_scaler`) live on the optimizer. Every group's `step` starts at 1.0.
+        """
+        beta1, beta2 = betas[0], betas[1]
+        group_defaults = {
+            "lr": lr,
+            "weight_decay": weight_decay,
+            "beta1": beta1,
+            "beta2": beta2,
+        }
+        super().__init__(params, group_defaults)
+        self.eps, self.d = eps, clip_thresh
+        # Set precision to "custom_fp16" if you want to use a fixed loss scalar, custom_scalar, which is divided out in the update step.
+        # If you do this, call (custom_scalar * loss).backward() instead of loss.backward().
+        self.precision = precision
+        self.custom_scaler = custom_scalar
+        for param_group in self.param_groups:
+            param_group["step"] = 1.0
+        print("Using StableAdamWUnfused-v1")
+    def __setstate__(self, state):
+        super(StableAdamWUnfused, self).__setstate__(state)
+    def step(self, closure=None):
+        if closure is not None:
+            closure()
+        for group in self.param_groups:
+            lr = group["lr"]
+            weight_decay = group["weight_decay"]
+            beta1 = group["beta1"]
+            beta2 = group["beta2"]
+            step = group["step"]
+            for p in group["params"]:
+                if p.grad is None:
+                    continue
+                theta = p.data
+                param_state = self.state[p]
+                if self.precision == "custom_fp16":
+                    g = p.grad.data / self.custom_scaler
+                    if torch.any(torch.isnan(g) | torch.isinf(g)):
+                        continue
+                else:
+                    g = p.grad.data
+                if "exp_avg" not in param_state:
+                    v = param_state["exp_avg"] = torch.zeros_like(theta)
+                    u = param_state["exp_avg_sq"] = torch.zeros_like(theta)
+                else:
+                    v = param_state["exp_avg"]
+                    u = param_state["exp_avg_sq"]
+                beta1hat = beta1 * (1 - beta1 ** (step - 1)) / (1 - beta1**step)
+                beta2hat = beta2 * (1 - beta2 ** (step - 1)) / (1 - beta2**step)
+                v = v.mul_(beta1hat).add_(g, alpha=1.0 - beta1hat)
+                u = u.mul_(beta2hat).addcmul_(g, g, value=1.0 - beta2hat)
+                denominator = u.sqrt().add_(self.eps)
+                # StableAdamW = AdamW + update clipping (https://arxiv.org/abs/1804.04235) applied tensor-wise.
+                rms = (
+                    torch.div(
+                        g.pow(2), torch.maximum(u, (self.eps**2) * torch.ones_like(u))
+                    )
+                    .mean()
+                    .sqrt()
+                    .item()
+                )
+                theta = theta.mul_(1.0 - lr * weight_decay).addcdiv_(
+                    v, denominator, value=-lr * (1.0 / max(1.0, rms / self.d))
+                )
+                # save current params
+                param_state["exp_avg"] = v
+                param_state["exp_avg_sq"] = u
+            group["step"] = step + 1

DOCs/Corporation/MONETIZATION.md ADDED Viewed

	@@ -0,0 +1,51 @@

+# Andromeda Product Brief and Monetization Strategy Document
+## Product Summary:
+Andromeda is an innovative language model designed for high performance and efficiency. It utilizes advanced techniques that allow it to process and learn from multiple sources and adapt in real-time.
+## Monetization Strategies:
+1. **Usage-based API:** Provide Andromeda as a paid API service where users pay based on the amount of computation they use.
+2. **Consulting deals:** Offer expert consulting services to businesses looking to incorporate Andromeda's capabilities into their operations.
+3. **Dedicated capacity:** Sell dedicated computational power to businesses for exclusive usage of Andromeda's capabilities.
+4. **Licensing the technology:** Allow companies to license the Andromeda model for their proprietary use.
+5. **Subscription models:** Provide access to Andromeda's capabilities on a subscription basis.
+6. **Freemium model:** Offer basic usage of Andromeda for free, while charging for advanced features and capabilities.
+7. **Partnerships:** Form strategic partnerships with tech companies that can leverage Andromeda's capabilities in their products and services.
+8. **Sponsorships:** Sponsor research projects or tech events to get visibility and promote Andromeda's services.
+9. **Training and certifications:** Offer training programs and certifications on Andromeda usage and applications.
+10. **Custom development:** Offer custom development services for businesses that want specialized applications of Andromeda.
+## Potential Customers:
+1. **Tech companies:** Andromeda can be integrated into a wide array of tech products and services.
+2. **Educational institutions:** Universities and research institutions can use Andromeda for research purposes.
+3. **Government agencies:** Andromeda can assist in processing and analyzing large amounts of data.
+4. **Healthcare providers:** Andromeda can be used in data analysis and decision making in healthcare.
+5. **Media and entertainment industry:** Andromeda's language model can be used in content creation and curation.
+## Potential Cashflow Gains:
+1. **API usage revenues:** Charging per API call can generate substantial revenues with a high number of users.
+2. **Subscription fees:** A tier-based subscription model can ensure a steady income stream.
+3. **Licensing fees:** Companies willing to license the technology can provide a significant one-time or recurring revenue.
+4. **Consulting fees:** Consulting services can yield high-value contracts.
+5. **Sponsorship revenues:** Sponsoring events or projects can yield returns in the form of new business leads and customers.
+## Expenses:
+1. **Cloud infrastructure costs:** Major expense in maintaining and scaling the Andromeda model.
+2. **Research and development:** Continual improvement of Andromeda requires ongoing investment.
+3. **Marketing and sales:** Promoting Andromeda and closing sales deals will be a recurring expense.
+4. **Operational costs:** Expenses related to managing the company, including salaries, office space, utilities, and more.
+5. **Open-source contributors:** Andromeda is built on the contributions of numerous developers. Recognizing these contributors through a rewards program is an essential part of maintaining a healthy development ecosystem.
+### Open Source Contributors:
+The following is a representative list of contributors who have helped make Agora what it is today:
+1. Kye
+2. Nicolo
+Each contributor brings unique expertise and value to the project, helping to shape Andromeda into a powerful, efficient, and intelligent language model that will revolutionize the NLP landscape.

DOCs/Design/Dyson.md ADDED Viewed

	@@ -0,0 +1,26 @@

+Insights and Techniques:
+1. Flops: The importance of considering the number of floating-point operations (FLOPs) when designing models.
+2. Flash Attention 2.0: The use of techniques like Flash Attention 2.0 cuda to enable more FLOPs in the model.
+3. Mixed Precision: Utilizing mixed precision training to improve training speed and memory efficiency.
+4. Deepspeed 3 with NVMe: Using Deepspeed 3 with NVMe for optimizing training performance.
+5. 8-bit Optimizer: Employing an 8-bit optimizer for further speed improvements.
+6. Gradient Clipping: Adding gradient clipping to achieve massive speedup during training.
+7. XPOS, ALIBI, QK Layernorm: Leveraging advanced techniques for extrapolation, interpolation, and training stabilization.
+8. Multi Query Attention: Using multi-query attention to boost decoding speed.
+9. Parallelized Transformer Blocks: Parallelizing transformer blocks to enhance overall model performance.
+10. Positional Embeddings and Shifted Tokens: The decision to not use positional embeddings and utilization of shifted tokens for sequence length advancement.
+11. Positional Interpolation: Incorporating positional interpolation for improved sequence handling.
+12. Optimized CUDA Embedding Function: Utilizing an optimized CUDA embedding function for better performance.
+13. Nebula Loss Function: Implementing the Nebula loss function, a polymorphic loss function for multi-task training.
+Possible Improvements:
+1. Clearer Metrics: To validate the model's claims, it would be beneficial to establish specific metrics for monitoring across training, especially regarding reasoning capabilities.
+2. Validation and Testing Environment: Further development and description of the exhaustive testing environment to validate the model's performance and capabilities.
+3. Comprehensive Documentation: Provide detailed documentation of the model's architecture, training methodology, and testing procedures to ensure transparency and replicability.
+4. Benchmarking Against Competitors: Perform benchmarking against existing models to showcase the advantages and differentiation offered by the proposed architecture and training techniques.
+5. Real-World Applications: Highlight potential real-world applications or use cases where the proposed model can provide superior performance compared to existing solutions.
+6. Explainability and Interpretability: Consider incorporating methods for model explainability and interpretability, especially in applications where these aspects are crucial.
+7. Addressing Specific Niche Needs: Identify specific niches or use cases where the model can excel and tailor marketing and development efforts accordingly.
+8. Collaboration and Peer Review: Engage with the research community, participate in peer review, and seek collaboration opportunities to gain additional insights and validation.

DOCs/Design/MODEL_ARCHITECTURE.md ADDED Viewed

	@@ -0,0 +1,57 @@

+### Alibi Positional Bias
+Alibi positional bias allows the model to learn relative positions between tokens, enabling it to better capture the relationships and dependencies between tokens in a sequence.
+Usage example:
+```python
+attn_layers = Decoder(
+    ...
+    alibi_pos_bias=True,
+    alibi_num_heads=4,
+    ...
+)
+```
+### Rotary Position Encodings (xpos)
+Rotary position encodings introduce a more efficient way to encode positions in the input sequence. They avoid the need for absolute positional embeddings, reducing the model's memory footprint and improving training speed.
+Usage example:
+```python
+attn_layers = Decoder(
+    ...
+    rotary_xpos=True,
+    ...
+)
+```
+### Flash Attention
+Flash attention speeds up the self-attention mechanism by reducing the number of attention computations. It accelerates training and inference while maintaining a high level of performance.
+Usage example:
+```python
+attn_layers = Decoder(
+    ...
+    attn_flash=True,
+    ...
+)
+```
+Usage example:
+```python
+attn_layers = Decoder(
+    ...
+    deepnorm=True,
+    ...
+)
+```
+### Deep Normalization (deepnorm)
+Deep normalization is a technique that normalizes the activations within a layer, helping with training stability and convergence. It allows the model to better learn complex patterns and generalize to unseen data.

DOCs/Design/SPEED.md ADDED Viewed

	@@ -0,0 +1,11 @@

+# Increasing Speed
+* Integrate Flash Attention 2.0 cuda, significant speed up
+* Utilize the 8-bit optimizer from bitsandbytes (BNB) for a big speedup; weakness: BNB isn't compatible with all GPUs
+* Use a better tokenizer, e.g. TokenMonster?
+* Parallelize the transformer blocks, similar to [PaLM](https://github.com/conceptofmind/PaLM)
+* Look into MPT's config for LION for pretraining; did they use a high batch size?

DOCs/Design/Specs.md ADDED Viewed

	@@ -0,0 +1,196 @@

+## **Andromeda Specs**: Unveiling Mastery
+**Overview**
+Elegantly marrying craftsmanship and technology, Andromeda is not just another step in AI evolution. It's a giant leap. Driven by precision, powered by innovation, and defined by excellence, Andromeda is the epitome of intelligence realized. Here, we detail the marvel that is Andromeda, in numbers, facts, and logic.
+---
+### **Specifications**
+| **Feature**                                  | **Specification**                             |
+|----------------------------------------------|-----------------------------------------------|
+| **Sequence Handling**                        | Ultra Long (32,000 - 200,000+ context lengths)|
+| **Processing Speed**                         | Ultra Fast (32,000+ tokens in < 100ms)        |
+| **Reasoning Abilities**                      | Creativity, Quantitative                                      |
+| **Attention Mechanism**                      | Flash Attention 2.0 Triton                    |
+| **Memory Consumption** (compared to GPT-3)   | 100x Less                                      |
+| **Memory Consumption** (compared to LLAMA)   | 30x Less                                      |
+| **Max Sequence Processing Speed**            | 100,000+ sequences in < 300ms                 |
+| **Dataset Strategy**                         | Books, Falcon, Redpajama, Math, Code                              |
+| **Functionality**                            | FSDP, HF Accelerate,  Poetry Composition, API Calls, and more       |
+---
+### **Benchmarks**
+**Speed**: At the heart of Andromeda's unparalleled capabilities is its raw speed. Leveraging the prowess of Flash Attention 2.0 Triton, it doesn't merely process data; it blazes through it. This power allows it to consume 50x less memory than its predecessor, GPT-3, and 10x less than LLAMA.
+---
+### **Why Andromeda?**
+- **Performance**: Andromeda isn't about doing things faster; it's about doing them the best. Reliable processing of sequences, even as extensive as 100,000+ lengths, is realized in the blink of an eye, under 300ms.
+- **Precision and Creativity**: The dataset strategy is no mere algorithm. It's a symphony, meticulously crafted to offer both creativity and quantitative reasoning.
+- **Versatility**: Andromeda doesn't just compute; it contemplates. Whether you need the flair of a poet or the precision of an API call, Andromeda delivers, seamlessly.
+---
+### **Andromeda Principles**
+- **Efficiency**: It's not just about doing more; it's about doing better. Techniques like attention flashing, rotary position encodings, and deep normalization ensure every cycle, every operation, every byte is optimized for performance.
+- **Flexibility**: In the ever-evolving world of technology, adaptability is king. Andromeda is designed to mold, adapt, and excel, irrespective of the task or domain.
+- **Scalability**: Grow with you, for you. Andromeda isn't static. It's dynamic, designed to scale, accommodating growing resources and expanding data sizes.
+- **Community-Driven**: Behind Andromeda's machine brain is the human heart of the community. It doesn't just utilize open source; it thrives on it, constantly evolving, learning, and improving with contributions from around the world.
+For enthusiasts, developers, and thinkers looking to dive deeper, the Model Architecture documentation offers an exhaustive, detailed view into the intricate marvel that is Andromeda. Dive in, and witness engineering and artistry in harmony.
+---
+### **Andromeda: A Detailed Technical Overview**
+At the intersection of technological ingenuity and groundbreaking design principles, Andromeda emerges. Representing the zenith of years of research and development, it promises a transformative leap in AI performance, efficiency, and versatility. In this technical specifications document, we deconstruct the intricacies of Andromeda, presenting a meticulous overview of its structure, performance metrics, and underlying methodologies.
+## **Feature Insights**
+### **Alibi Positional Bias**
+Empowering Andromeda to discern relative positions between tokens, this feature accentuates its ability to grasp intricate relationships within a sequence.
+### **Rotary Position Encodings (xpos)**
+This is a revolutionary means of encoding positions, shrinking the model's memory demands and propelling training speeds.
+### **Flash Attention**
+This is the linchpin of Andromeda's speed prowess, minimizing attention computations, thus boosting training and inference phases.
+### **Deep Normalization (deepnorm)**
+By normalizing activations, deep normalization shores up training stability, allowing Andromeda to identify intricate patterns with finesse.
+## **Feature Insights (Contd.)**
+### **Attn One KV Head (Multiquery Attention)**
+A breakthrough in attention mechanism design, this feature allows for simultaneous computation of multiple queries against the same set of key-values, fostering speed and efficiency.
+### **QK Norm & Attention QK Norm**
+These two features introduce a normalization step in the query and key matrices. This step facilitates stabilization in the attention mechanism, rendering it more robust and enabling it to scale with larger input sizes.
+### **Attention QK Norm Dimension Scale**
+A sophisticated adjustment to the attention mechanism, it modulates the normalization scale in accordance with the dimensions of the model. The result is a more adaptive and responsive attention framework.
+### **Embedding Provider**
+At the foundation of Andromeda, this module facilitates the embedding process, converting token sequences into dense vectors. Tailored for Andromeda, it ensures rapid and efficient embedding processes.
+---
+## **Deeper Dive: Model Parameters**
+Unpacking Andromeda means diving deep into the parameters that shape its capabilities. Here's a granular view:
+| **Parameter**                           | **Description**                                                                                                                                                                           | **Default Value** |
+|-----------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------|
+| **num_tokens**                          | Total number of tokens in the vocabulary.                                                                                                                                                | 50432             |
+| **max_seq_len**                         | Maximum sequence length the model can process.                                                                                                                                           | 8192              |
+| **dim**                                 | Dimension size of the model. It represents the size of embeddings and general depth in neural layers.                                                                                    | 2560              |
+| **depth**                               | Represents the number of transformer layers in the architecture.                                                                                                                         | 32                |
+| **dim_head**                            | Dimension size of each head in multi-head attention mechanism.                                                                                                                           | 128               |
+| **heads**                               | Total number of heads in multi-head attention.                                                                                                                                           | 24                |
+| **use_abs_pos_emb**                     | Boolean flag to determine if absolute positional embeddings are used.                                                                                                                     | False             |
+| **alibi_pos_bias**                      | Enables the alibi positional bias in attention mechanisms.                                                                                                                               | True              |
+| **alibi_num_heads**                     | Specifies the number of heads for the alibi positional bias.                                                                                                                             | 12                |
+| **rotary_xpos**                         | Determines if rotary positional encodings are utilized.                                                                                                                                  | True              |
+| **attn_flash**                          | Flag to activate the Flash Attention mechanism, minimizing computations in the attention phase.                                                                                          | True              |
+| **shift_tokens**                        | The number of tokens by which input sequences are shifted. Essential for certain sequence-to-sequence tasks.                                                                             | 1                 |
+| **attn_one_kv_head**                    | Activates multiquery attention by computing multiple queries against a singular key-value pair.                                                                                          | True              |
+| **qk_norm**                             | Enables the query-key normalization mechanism in the attention phase.                                                                                                                    | True              |
+| **attn_qk_norm**                        | A more advanced version of query-key normalization that scales according to the model's dimensions.                                                                                      | True              |
+| **attn_qk_norm_dim_scale**              | Modulates the scale of the aforementioned attention normalization based on the model's dimensionality.                                                                                  | True              |
+| **embedding_provider**                  | The module responsible for providing embeddings. Custom providers can be passed for tailored embedding processes.                                                                       | AndromedaEmbedding|
+---
+## **Insights and Techniques**
+#### **1. Floating-Point Operations (FLOPs)**
+Considering the number of FLOPs is paramount. It provides a metric to gauge the computational intensity and, by extension, the potential speed of the model.
+#### **2. Flash Attention 2.0 Triton**
+Enhanced with CUDA, this method offers a significant surge in the number of FLOPs the model can handle, amplifying its overall efficiency.
+#### **3. Mixed Precision Training**
+By embracing mixed precision, Andromeda realizes a noteworthy uptick in training speed while achieving commendable memory efficiency.
+#### **4. Deepspeed 3 with NVMe Integration**
+This powerful combination paves the way for superlative optimization during the training phase.
+#### **5. 8-bit Optimizer**
+Further pushing the boundaries of speed, the 8-bit optimizer boosts processing times without compromising the integrity of results.
+#### **6. Gradient Clipping**
+This technique has been integrated into the training regimen, achieving a massive speedup and preventing undesirable spikes during the process.
+#### **7. Advanced Techniques: XPOS, ALIBI, QK Layernorm**
+These sophisticated techniques are harnessed for superior extrapolation, interpolation, and stabilization during training.
+#### **8. Multi Query Attention**
+This approach has been adopted to supercharge decoding speeds.
+#### **9. Parallelized Transformer Blocks**
+Ensuring that the model's performance is consistently high, these blocks run in tandem to provide a smooth and efficient operational experience.
+#### **10. Shifted Tokens**
+In a strategic move, Andromeda sidesteps traditional positional embeddings, relying instead on shifted tokens for sequence length progression.
+#### **11. Positional Interpolation**
+This innovative technique augments the model's ability to manage sequences more effectively.
+#### **12. Optimized CUDA Embedding Function**
+This function is tailored for peak performance, ensuring rapid and accurate computations.
+#### **13. Nebula Loss Function**
+Integrated into Andromeda, this polymorphic loss function is adept at handling multi-task training scenarios.
+## **A Word on Optimization and Future Iterations**
+As with any state-of-the-art model, Andromeda's design is an ever-evolving tapestry. This means iterative refinement. As feedback streams in and technology progresses, expect advancements in:
+- **Model Pruning**: Trimming redundancies, bolstering efficiency.
+- **Knowledge Distillation**: Harnessing the wisdom of larger models in smaller, more agile architectures.
+- **Zero-Shot and Few-Shot Learning**: Broadening adaptability horizons.
+- **Enhanced Data Augmentation**: Fortifying the model's grasp on varied, nuanced contexts.
+- **Decentralized Training**: Tapping into the global hive-mind, harnessing the collaborative power of the community.
+## **Potential Other Future Trajectories**
+#### **1. Clearer Metrics**
+There's always room to elevate the benchmarking rigor, especially concerning reasoning abilities.
+#### **2. Robust Validation and Testing Environment**
+Further fine-tuning of the testing environment can offer even more reliable validations of Andromeda's capabilities.
+#### **3. Comprehensive Documentation**
+To bolster transparency and replicability, detailed documentation covering every facet of Andromeda is on the horizon.
+#### **4. Benchmarking Against Peers**
+By juxtaposing Andromeda against its counterparts, its distinctive advantages can be spotlighted more effectively.
+#### **5. Spotlight on Real-World Applications**
+By highlighting tangible use-cases, the versatility and prowess of Andromeda can be showcased in palpable contexts.
+#### **6. Model Interpretability**
+Future iterations might delve deeper into model interpretability, especially for critical applications.
+#### **7. Niche Customizations**
+By tailoring Andromeda to meet specific niche needs, its adaptability and value proposition can be further enhanced.
+#### **8. Collaborative Endeavors**
+Engaging more intimately with the global research community could spawn collaborative projects, bringing diverse insights to the fore.
+As we voyage further into the AI frontier, Andromeda stands as a beacon, illuminating the path forward, promising marvels yet to come. It's not just about machine intelligence; it's about the dance between human curiosity and machine capability.
+---
+Join us on this journey. Dive deeper, ask questions, innovate, and let's redefine what's possible, together.

DOCs/Docs/DOCUMENTATION.md ADDED Viewed

	@@ -0,0 +1,145 @@

+# Documentation
+## `DatasetBuilder`
+### DatasetBuilder
+DatasetBuilder provides a convenient way to build datasets for training the Andromeda model.
+#### Constructor
+```python
+def __init__(
+    self,
+    dataset_name,
+    seq_len=8192,
+    num_cpu=None,
+    hf_account_repo=None,
+    tokenizer="EleutherAI/gpt-neox-20b",
+)
+```
+Initialize the DatasetBuilder.
+**Args:**
+- `dataset_name` (str): Name of the dataset to process.
+- `seq_len` (int): Maximum sequence length.
+- `num_cpu` (int, optional): Number of CPU cores to use for multiprocessing. Defaults to None.
+- `hf_account_repo` (str, optional): Hugging Face account name and repository to push the processed dataset. Defaults to None.
+- `tokenizer` (str, optional): Tokenizer model to use. Defaults to "EleutherAI/gpt-neox-20b".
+#### Methods
+##### build_dataset
+```python
+def build_dataset(self) -> torch.utils.data.Dataset
+```
+Build and process the dataset.
+**Returns:**
+- `torch.utils.data.Dataset`: The processed dataset ready for training.
+## AndromedaTokenizer
+### Purpose
+The `AndromedaTokenizer` class provides tokenization functionality using the Hugging Face tokenizer. It allows you to tokenize texts using the specified tokenizer model.
+### Systems Understanding
+The `AndromedaTokenizer` class initializes a tokenizer model from the Hugging Face library. It uses the `AutoTokenizer.from_pretrained` method to load the tokenizer model with specific parameters such as the EOS token, pad token, extra IDs, and model maximum length. The `tokenize_texts` method tokenizes input texts using the tokenizer model and returns the tokenized input IDs.
+### Usage Example
+```python
+from Andromeda import AndromedaTokenizer
+# Initialize the tokenizer
+tokenizer = AndromedaTokenizer()
+# Tokenize texts
+texts = ["This is an example sentence.", "Another example sentence."]
+tokenized_ids = tokenizer.tokenize_texts(texts)
+print(tokenized_ids)
+```
+## Andromeda
+### Purpose
+The `Andromeda` class is a transformer-based model architecture. It consists of a `Transformer` and `AutoregressiveWrapper` with default or user-specified parameters.
+### Systems Understanding
+The `Andromeda` class initializes with a `Transformer` and `AutoregressiveWrapper`. The `Transformer` encapsulates the main transformer model, and the `AutoregressiveWrapper` enables autoregressive generation using the transformer model.
+The constructor of the `Andromeda` class takes various parameters that define the architecture of the model, such as the number of tokens, maximum sequence length, model dimension, depth, number of heads, etc. These parameters are used to initialize the `Transformer` and `AutoregressiveWrapper` with the specified configuration.
+The `forward` method performs a forward pass through the model. It takes `text_tokens` as input and passes them through the `Decoder` module inside the `Andromeda` model. The output from the decoder is returned as the result.
+### Usage Example
+```python
+from Andromeda import Andromeda
+# Create an instance of the Andromeda model
+model = Andromeda()
+# Define the input text tokens
+text_tokens = [1, 2, 3, 4, 5]  # Example input tokens
+# Perform a forward pass through the model
+output = model.forward(text_tokens)
+print(output)
+```
+### Constructor
+```python
+def __init__(self, num_tokens=50304, max_seq_len=8192, dim=2560, depth=32, dim_head=128, heads=24, use_abs_pos_emb=False, alibi_pos_bias=True, alibi_num_heads=12, rotary_xpos=True, attn_flash=True, deepnorm=True, shift_tokens=1, attn_one_kv_head=True, qk_norm=True, attn_qk_norm=True, attn_qk_norm_dim_scale=True, embedding_provider=AndromedaEmbedding())
+```
+- `num_tokens` (optional): Number of tokens in the vocabulary.
+- `max_seq_len` (optional): Maximum sequence length.
+- `dim` (optional): Dimension of the model.
+- `depth` (optional): Depth of the model.
+- `dim_head` (optional): Dimension of the model head.
+- `heads` (optional): Number of heads.
+- `use_abs_pos_emb` (optional): Whether to use absolute position embedding.
+- `alibi_pos_bias` (optional): Alibi position bias.
+- `alibi_num_heads` (optional): Number of alibi heads.
+- `rotary_xpos` (optional): Rotary position.
+- `attn_flash` (optional): Attention flash.
+- `deepnorm` (optional): Deep normalization.
+- `shift_tokens` (optional): Number of tokens to shift.
+- `attn_one_kv_head` (optional): Attention one key/value head.
+- `qk_norm` (optional): Query-key normalization.
+- `attn_qk_norm` (optional): Attention query-key normalization.
+- `attn_qk_norm_dim_scale` (optional): Attention query-key normalization dimension scale.
+- `embedding_provider` (optional): Embedding provider module.
+### Methods
+- `forward(text_tokens, **kwargs)`: Performs a forward pass through the model.
+  - `text_tokens` (required): Input tokens.
+  - `kwargs` (optional): Other arguments.
+### Args
+- `text_tokens` (list): Input tokens.
+### Returns
+- Output from the decoder module.
+## Conclusion
+The Andromeda module provides a transformer-based model architecture for text generation. The `AndromedaTokenizer` class allows you to tokenize texts using the specified tokenizer model. The `Andromeda` class initializes with a transformer and autoregressive wrapper, providing the functionality for text generation. By using the provided classes and methods, you can generate text using the Andromeda model.

DOCs/Docs/TRAINING.md ADDED Viewed

	@@ -0,0 +1,82 @@

+# Andromeda Model Training Standard Operating Procedure
+This document provides instructions on how to train the Andromeda model end-to-end using the provided code. The training procedure consists of three main scripts: `build_dataset.py`, `model.py`, and `train_distributed.py`. Follow the steps below to train the Andromeda model.
+## Prerequisites
+Before starting the training process, ensure that you have the following requirements:
+- Python 3.7 or higher
+- PyTorch 1.9 or higher
+- Transformers library
+- Datasets library
+- Accelerate library
+- Wandb library (optional, for logging)
+## Step 1: Building the Dataset
+The first step is to build the dataset required for training. The `build_dataset.py` script processes the training data and prepares it for training. Follow the instructions below to build the dataset:
+1. Open the `build_dataset.py` script.
+2. Set the configuration parameters in the `CFG` class according to your requirements:
+   - `HF_ACCOUNT_REPO`: Replace with your Hugging Face account repository (the destination the processed dataset is pushed to).
+   - `TOKENIZER`: Choose the tokenizer model to use (e.g., "EleutherAI/gpt-neox-20b").
+   - `DATASET_NAME`: Choose the dataset to process (e.g., "tiiuae/falcon-refinedweb").
+   - `SEQ_LEN`: Set the desired sequence length.
+3. Save the changes to the script.
+4. Open a terminal or command prompt and navigate to the directory containing the `build_dataset.py` script.
+5. Run the following command to execute the script:
+   ```
+   python build_dataset.py
+   ```
+6. The script will process the dataset and push it to your Hugging Face account repository specified by `HF_ACCOUNT_REPO`.
+## Step 2: Defining the Andromeda Model
+The second step is to define the Andromeda model architecture. The `model.py` script contains the model definition and configuration. Follow the instructions below to configure the Andromeda model:
+1. Open the `model.py` script.
+2. Set the configuration parameters in the `AndromedaTokenizer` and `Andromeda` classes according to your requirements:
+   - `tokenizer`: Configure the tokenizer with the desired parameters.
+   - `Andromeda`: Configure the Andromeda model with the desired architecture.
+3. Save the changes to the script.
+## Step 3: Training the Andromeda Model
+The final step is to train the Andromeda model using the `train_distributed.py` script. Follow the instructions below to start the training process:
+1. Open the `train_distributed.py` script.
+2. Set the configuration parameters in the `TrainAndromeda.CFG` class according to your requirements:
+   - `BATCH_SIZE`: Set the batch size for training.
+   - `GRADIENT_ACCUMULATE_EVERY`: Set the number of gradient accumulation steps.
+   - `LEARNING_RATE`: Set the learning rate for the optimizer.
+   - `WEIGHT_DECAY`: Set the weight decay for the optimizer.
+   - `SEQ_LEN`: Set the desired sequence length.
+   - `USE_DEEPSPEED`: Set to `True` if using DeepSpeed for optimization.
+   - `USE_FSDP`: Set to `True` if using Fully Sharded Data Parallelism.
+   - `USE_PRETOKENIZED`: Set to `True` if using a pre-tokenized dataset.
+   - `USE_ACTIVATION_CHECKPOINTING`: Set to `True` if using activation checkpointing.
+   - `RESUME_FROM_CHECKPOINT`: Set to the path of a checkpoint to resume training from.
+   - `CHECKPOINTING_STEPS`: Set the number of steps between checkpoints.
+   - `OUTPUT_DIR`: Set the output directory for saving the model checkpoints and logs.
+   - `ENTITY_NAME`: Set the Wandb entity name for logging (optional).
+3. Save the changes to the script.
+4. Open a terminal or command prompt and navigate to the directory containing the `train_distributed.py` script.
+5. Run the following command to start the training:
+   ```
+   python train_distributed.py
+   ```
+6. The script will train the Andromeda model using the specified configuration and dataset.
+7. During training, the progress will be displayed in the terminal, and logs will be saved to the specified output directory.
+# Other Training methods
+First:
+`accelerate config`
+Enable DeepSpeed ZeRO stage 3 in the prompts, then run:
+`accelerate launch train_distributed_accelerate.py`

DOCs/Docs/Training/DATASET_STRATEGY.md ADDED Viewed

	@@ -0,0 +1,100 @@

+#  Andromeda
+We should train 100M-, 500M-, and 1B-parameter versions, with hyperparameters similar to those of these two comparable models:
+[concept of mind's PALM](https://github.com/conceptofmind/PaLM)
+Model Size	Num Tokens	Dim	Depth	Dim Head	Heads	Flash Attention	Learning Rate
+150 M	50304	768	12	128	8	True	6e-4
+410 M	50304	1024	24	128	8	True	3e-4
+1 B	50304	2048	16	128	8	True	3e-4
+[MPT HF](https://huggingface.co/mosaicml/mpt-7b)
+Hyperparameter	Value
+n_parameters	6.7B
+n_layers	32
+n_heads	32
+d_model	4096
+vocab size	50432
+sequence length	2048
+## Data prioritization: Prioritize datasets based on their relevance to the desired AI capabilities and the quality of the data.
+High priority: C4, openwebtext, super_glue, piqa, Falcon-40B (RefinedWeb-English, RefinedWeb-Europe, Books, Conversations, Code, Technical), glue, tiiuae/falcon-refinedweb, math_dataset
+Medium priority:  bigcode/ta-prompt, bigcode/the-stack-dedup, OpenAssistant/oasst1, ehartford/wizard_vicuna_70k_unfiltered, tiiuae/falcon-refinedweb
+Low priority: timdettmers/openassistant-guanaco, JosephusCheung/GuanacoDataset,  JosephusCheung/GuanacoDataset, anon8231489123/ShareGPT_Vicuna_unfiltered, togethercomputer/RedPajama-Data, togethercomputer/RedPajama-Data-1T, Anthropic/hh-rlhf, databricks/databricks-dolly-15k, QingyiSi/Alpaca-CoT, alpaca,
+distillation, timdettmers/openassistant-guanaco, OpenAssistant/oasst1, dmayhem93/toolformer-v0-postprocessed, openai_humaneval, yahma/alpaca-cleaned,
+## Data preprocessing: Clean, preprocess, and tokenize the datasets to ensure consistency and compatibility with the AI model.
+Remove duplicates, irrelevant content, and low-quality data.
+Tokenize the text using a suitable tokenizer, such as the GPT-NeoX tokenizer or potentially Falcon's tokenizer.
+Split the datasets into training, validation, and testing sets.
+## Training strategy: Train the AI model using the prioritized datasets in a multi-stage process.
+Stage 1: Pretrain the model on high-priority datasets (openwebtext, super_glue, piqa, Falcon-40B, glue) to build a strong language understanding foundation.
+Stage 2: Fine-tune the model on medium-priority datasets (bigcode/ta-prompt, bigcode/the-stack-dedup, OpenAssistant/oasst1, ehartford/wizard_vicuna_70k_unfiltered, tiiuae/falcon-refinedweb) to enhance its performance in specific domains and tasks.
+Stage 3: Further fine-tune the model on low-priority datasets (JosephusCheung/GuanacoDataset, anon8231489123/ShareGPT_Vicuna_unfiltered, togethercomputer/RedPajama-Data, togethercomputer/RedPajama-Data-1T, Anthropic/hh-rlhf, databricks/databricks-dolly-15k, QingyiSi/Alpaca-CoT) to capture any additional knowledge and nuances. PRM800K: A Process Supervision Dataset
+Evaluation and iteration: Continuously evaluate the model's performance on the validation and testing sets, and iterate the training process to improve its performance.
+Monitor the model's performance using relevant metrics, such as perplexity, F1 score, or BLEU score, depending on the task.
+Adjust hyperparameters, learning rate, and training duration as needed to optimize the model's performance.
+If necessary, revisit the data prioritization and preprocessing steps to refine the training data.
+# Evaluations and Benchmarks:
+[Chain of thought hub](https://github.com/FranxYao/chain-of-thought-hub)
+SFT stands for Supervised Fine-Tuning and RLHF stands for Reinforcement Learning from Human Feedback. These are techniques used in natural language processing to improve the quality and accuracy of generated text. The statement suggests that if these techniques are applied correctly to the 65B LLaMA model, it is possible to recreate ChatGPT.
+# Analysis of Existing Models
+### MPT-7B
+```
+Data Source	Number of Tokens in Source	Proportion	Effective Number of Tokens	Epochs
+mC4 3.1.0 - English	417.99 B	0.33	330 B	0.14
+C4 - English - SemDedup 80%	100.42 B	0.299	299 B	2.98
+RedPajama - CommonCrawl	878.45 B	0.1	100 B	0.11
+The Stack - Selected Languages	463.78 B	0.1	100 B	0.22
+RedPajama - Wikipedia - En	4.87 B	0.04	40 B	8.21
+The Stack - Markdown	107.07 B	0.035	35 B	0.33
+S2ORC	48.85 B	0.033	33 B	0.68
+RedPajama - Books	26.02 B	0.03	30B	1.15
+RedPajama - arXiv	28.10 B	0.019	19 B	0.68
+RedPajama - StackExchange	20.54 B	0.014	14 B	0.68
+```
+### MPT-1B
+```
+Training Data
+The model was trained for 200B tokens (batch size 2200, sequence length 2048). It was trained on the following data mix:
+67% RedPajama Common Crawl
+15% C4
+4.5% RedPajama GitHub
+4.5% RedPajama Wikipedia
+4.5% RedPajama Books
+2.5% RedPajama Arxiv
+2% RedPajama StackExchange
+Each sample was chosen from one of the datasets, with the dataset selected with the probability specified above. The examples were shuffled within each dataset. Each example was constructed from as many sequences from that dataset as were necessary to fill the 2048 sequence length.
+```

DOCs/Tests/BENCHMARKING.md ADDED Viewed

	@@ -0,0 +1,111 @@

+# Andromeda Performance Benchmarking Analysis: Pre-Training Metrics
+Before initiating the pre-training phase, we need to ensure that every component of our model, Andromeda, is performing as expected. To do this, we'll create an extensive suite of metrics to monitor and evaluate. This will allow us to identify any bottlenecks, inefficiencies, or errors, and optimize the model accordingly.
+## Component-wise Metrics
+We focus on the transformer layer and the attention mechanism, key components of Andromeda, to extract meaningful metrics.
+### Transformer Layer Metrics
+1. **Number of Parameters**: The total number of parameters in the transformer layer. More parameters can lead to a more powerful model but also increase the risk of overfitting and the computational load.
+2. **Layer-wise Activation Statistics**: For each layer in the transformer, calculate statistics such as mean, standard deviation, min, and max of the activations.
+3. **Layer-wise Gradient Statistics**: Similarly, calculate statistics for the gradients flowing through each layer. Look for any layer where the gradients are consistently close to zero, as this could indicate that the layer isn't learning effectively.
+4. **Feed-forward Network (FFN) Activation Statistics**: Calculate activation statistics specifically for the feed-forward networks in the transformer layer.
+5. **FFN Gradient Statistics**: Similarly, calculate gradient statistics for the FFNs.
+### Attention Mechanism Metrics
+1. **Self-Attention Distribution**: Plot the distribution of attention weights. This can help identify if the model is paying attention to the right inputs.
+2. **Multi-Head Attention Distribution**: For multi-head attention, plot the distribution of attention weights for each head.
+3. **Attention Entropy**: Calculate the entropy of the attention distribution. A higher entropy can indicate that the model is distributing its attention more evenly, while a lower entropy can indicate that it's focusing on a smaller number of inputs.
+4. **Self-Attention Gradient Statistics**: Calculate statistics for the gradients flowing through the self-attention mechanism.
+5. **Multi-Head Attention Gradient Statistics**: Similarly, calculate gradient statistics for the multi-head attention mechanism.
+6. **Number of Heads Paying Attention**: Count the number of heads that are paying significant attention (i.e., have a high average attention weight) to understand the model's attention spread.
+## Test Suite Execution
+These metrics should be calculated for a range of input examples to ensure the model performs well across different situations. To do this, we create a test suite.
+The test suite should include:
+1. **Various Input Lengths**: Test inputs of varying lengths to ensure the model performs well regardless of input size.
+2. **Different Data Modalities**: If the model is designed to handle different data types (text, images, etc.), these should be included in the test suite.
+3. **Varied Content**: Include a range of different content in the inputs to test how well the model handles different topics or styles.
+4. **Out-of-Distribution Data**: Include some data that's not from the training distribution to see how the model handles unexpected inputs.
+5. **Noise**: Include inputs with added noise to test the model's robustness.
+Remember, the goal here is not just to have a laundry list of metrics but to understand what each metric tells us about the model's performance and use this information to optimize the model. This extreme attention to detail will ensure Andromeda's high performance and broad applicability.
+# Speed and Scalability Metrics
+While model performance is crucial, it isn't the only factor that determines the success of a system. We must also consider its speed, scalability, and context limits.
+### Speed Metrics
+1. **Model Inference Time**: Measure the average time it takes for the model to make predictions for a set of inputs. This can be done using methods like `time.perf_counter()` in Python.
+2. **Batch Processing Time**: The time taken to process a batch of inputs can provide an insight into the model's speed at scale. This is especially important when processing large datasets.
+3. **Forward Pass Time**: Record the time taken for a forward pass through the network.
+4. **Backward Pass Time**: Measure the time taken for the backward pass, especially if the model will be fine-tuned or trained further.
+5. **End-to-End Latency**: This measures the total time taken from the moment the input is provided to the model till the output is produced. This includes preprocessing, inference, and postprocessing times.
+### Scalability Metrics
+1. **Throughput**: Evaluate the number of inputs the model can process per unit of time.
+2. **Memory Footprint**: Analyze the memory usage of the model during inference. Large models may require significant memory resources, especially during training.
+3. **Parallel Processing Performance**: If the model is designed to run on multiple GPUs or across multiple machines, measure its performance in these settings.
+4. **Load Balancing**: Measure how well the model can distribute computational load across multiple GPUs or nodes.
+### Context Limits Metrics
+1. **Sequence Length Impact**: Evaluate how the model's performance changes with varying sequence lengths. Some models struggle with very short or very long sequences.
+2. **Robustness to Input Variation**: Test the model with a variety of inputs, such as out-of-vocabulary words, uncommon syntax, etc., to understand its ability to handle diverse inputs.
+3. **Contextual Ambiguity**: Measure the model's ability to handle ambiguous inputs where context is crucial for understanding.
+4. **Sensitivity to Input Changes**: Evaluate how much the model's output changes when small modifications are made to the input. If the model is too sensitive, it might overreact to minor changes.
+Each of these metrics should be calculated across a range of situations to understand the model's behavior under different conditions. This exhaustive testing will allow us to optimize Andromeda for the best balance of speed, scalability, and context limits.
+# Key Metrics
+1. **Perplexity:** This is a common metric for assessing language models, which measures how well the model predicts a sample. Lower perplexity indicates better performance. However, it's worth noting that while perplexity is a useful indicator, it doesn't capture everything, especially for creative tasks like language generation.
+2. **Validation Loss:** While perplexity is great, you also want to track your validation loss directly. This is your primary optimization target and often gives the most actionable insights.
+3. **Speed Metrics:** This includes forward pass time, backward pass time, and end-to-end latency. Ensuring that your model operates quickly is crucial for scalability and user experience. The lower these metrics, the better.
+4. **Throughput:** Measures the number of instances your model can process per second. Higher throughput indicates a more efficient model.
+5. **Memory Footprint:** You need to measure the amount of memory your model uses during inference and training. This is especially important for larger models, as it could limit scalability. Lower memory usage is better.
+6. **Sequence Length Impact:** How does your model's performance change with the length of the input sequence? This is critical for understanding its applicability to real-world scenarios where sequence lengths can vary widely.
+7. **Parameter Efficiency:** How well does your model make use of its parameters? This is typically measured as performance relative to the number of parameters. More efficient use of parameters is better.
+8. **Accuracy on Benchmarked Datasets:** For instance, GLUE or SuperGLUE benchmark scores for natural language understanding tasks, or SQuAD for question answering. Higher scores on these benchmarks indicate better performance.
+9. **Consistency over Time:** Does the model's performance degrade or remain consistent over multiple, identical runs? If performance varies greatly, the model may be unstable.
+10. **Robustness to Noise:** How well does your model handle noise in the input data? This can be simulated by adding random noise to your validation data and measuring the model's performance.
+11. **Fairness and Bias:** Test the model on a variety of fairness metrics to ensure it treats all users and inputs equally. This can be complex and requires a dataset that is diverse and representative.
+Remember that these metrics will vary depending on your specific use case and model. For example, a translation model may need to prioritize sequence length performance, while a chatbot may need to emphasize throughput and latency.
+Also, be aware that these are just some of the metrics you could test before pre-training. The exact list will depend on your specific use case and requirements.

FailureAnalysis/CPU_MEMORY.md ADDED Viewed

	@@ -0,0 +1,489 @@

+# July 31, 9:12pm
+* Failure to train, perhaps due to not having enough CPU memory
+## Sources
+* [torch.distributed.elastic.multiprocessing.errors.ChildFailedError](https://discuss.huggingface.co/t/torch-distributed-elastic-multiprocessing-errors-childfailederror/28242)
+* `export TORCH_CPP_LOG_LEVEL=INFO NCCL_DEBUG=INFO`
+```
+Hey guys, I’m glad to announce I solved the issue on my side.
+As can be seen I use multiple GPUs, which have sufficient memory for the use case.
+HOWEVER! My issue was due to not enough CPU memory. That’s why my runs crashed and without any trace of the reason.
+Once I allocated enough cpu (on my case I increased it from 32GB to 96+ GB).
+If the CPU allocation is constant and you can not allocated more, I’m sure you can try solutions as compressed models, deepspeed optimization levels and more.
+Good luck to future readers.
+```
+### Root cause:
+* Not having enough CPU memory.
+# Solutions:
+* perhaps move everything into nvme or offload the parameters to the cpu using deepspeed
+## Log
+```
+commune@r1n2a6000bittensor:~/Andromeda$ accelerate launch train.py
+[2023-08-01 01:04:13,441] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+WARNING:torch.distributed.run:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+[2023-08-01 01:04:16,624] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2023-08-01 01:04:16,634] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2023-08-01 01:04:16,641] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2023-08-01 01:04:16,669] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2023-08-01 01:04:16,712] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2023-08-01 01:04:16,720] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208581 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208582 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208583 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208584 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 208586 closing signal SIGTERM
+ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 4 (pid: 208585) of binary: /usr/bin/python3.10
+Traceback (most recent call last):
+  File "/home/commune/.local/bin/accelerate", line 8, in <module>
+    sys.exit(main())
+  File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main
+    args.func(args)
+  File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 964, in launch_command
+    deepspeed_launcher(args)
+  File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 687, in deepspeed_launcher
+    distrib_run.run(args)
+  File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
+    elastic_launch(
+  File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+=======================================================
+train.py FAILED
+-------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+-------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2023-08-01_01:06:47
+  host      : r1n2a6000bittensor
+  rank      : 4 (local_rank: 4)
+  exitcode  : -9 (pid: 208585)
+  error_file: <N/A>
+  traceback : Signal 9 (SIGKILL) received by PID 208585
+=======================================================
+commune@r1n2a6000bittensor:~/Andromeda$ export TORCH_CPP_LOG_LEVEL=INFO NCCL_DEBUG=INFO
+commune@r1n2a6000bittensor:~/Andromeda$ accelerate launch train.py
+[2023-08-01 01:09:31,113] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+WARNING:torch.distributed.run:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+[I socket.cpp:566] [c10d] The server socket has started to listen on [::]:29500.
+[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:46392.
+[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:46406.
+[2023-08-01 01:09:34,414] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2023-08-01 01:09:34,417] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2023-08-01 01:09:34,477] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+[2023-08-01 01:09:34,541] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+[2023-08-01 01:09:34,614] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2023-08-01 01:09:34,642] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209014 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209015 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209016 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209018 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209019 closing signal SIGTERM
+ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 3 (pid: 209017) of binary: /usr/bin/python3.10
+Traceback (most recent call last):
+  File "/home/commune/.local/bin/accelerate", line 8, in <module>
+    sys.exit(main())
+  File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main
+    args.func(args)
+  File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 964, in launch_command
+    deepspeed_launcher(args)
+  File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 687, in deepspeed_launcher
+    distrib_run.run(args)
+  File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
+    elastic_launch(
+  File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+=======================================================
+train.py FAILED
+-------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+-------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2023-08-01_01:11:46
+  host      : r1n2a6000bittensor
+  rank      : 3 (local_rank: 3)
+  exitcode  : -9 (pid: 209017)
+  error_file: <N/A>
+  traceback : Signal 9 (SIGKILL) received by PID 209017
+=======================================================
+commune@r1n2a6000bittensor:~/Andromeda$
+```
+------
+----
+# Log2
+* I reconfigured the settings to utilize torch dynamo and offload parameters to NVMe
+```
+ commune@r1n2a6000bittensor:~/Andromeda$ accelerate config
+[2023-08-01 01:15:17,803] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+----------------------------------------------------------------------------------------------------------In which compute environment are you running?
+This machine
+----------------------------------------------------------------------------------------------------------Which type of machine are you using?
+multi-GPU
+How many different machines will you use (use more than 1 for multi-node training)? [1]:
+Do you wish to optimize your script with torch dynamo?[yes/NO]:yes
+----------------------------------------------------------------------------------------------------------Which dynamo backend would you like to use?
+nvfuser
+Do you want to customize the defaults sent to torch.compile? [yes/NO]:
+Do you want to use DeepSpeed? [yes/NO]: yes
+Do you want to specify a json file to a DeepSpeed config? [yes/NO]: no
+----------------------------------------------------------------------------------------------------------What should be your DeepSpeed's ZeRO optimization stage?
+3
+----------------------------------------------------------------------------------------------------------Where to offload optimizer states?
+nvme
+----------------------------------------------------------------------------------------------------------Where to offload parameters?
+nvme
+Nvme Path to offload parameters?
+Nvme Path to offload optimizer states?
+How many gradient accumulation steps you're passing in your script? [1]:
+Do you want to use gradient clipping? [yes/NO]: yes
+What is the gradient clipping value? [1.0]:
+Do you want to save 16-bit model weights when using ZeRO Stage-3? [yes/NO]: yes
+Do you want to enable `deepspeed.zero.Init` when using ZeRO Stage-3 for constructing massive models? [yes/NO]: yes
+How many GPU(s) should be used for distributed training? [1]:6
+----------------------------------------------------------------------------------------------------------Do you wish to use FP16 or BF16 (mixed precision)?
+fp8
+accelerate configuration saved at /home/commune/.cache/huggingface/accelerate/default_config.yaml
+commune@r1n2a6000bittensor:~/Andromeda$ accelerate launch train.py
+[2023-08-01 01:15:58,494] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+WARNING:torch.distributed.run:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+[I socket.cpp:566] [c10d] The server socket has started to listen on [::]:29500.
+[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:45830.
+[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:45838.
+[2023-08-01 01:16:01,364] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2023-08-01 01:16:01,455] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2023-08-01 01:16:01,456] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+[2023-08-01 01:16:01,484] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2023-08-01 01:16:01,555] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+[2023-08-01 01:16:01,593] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209602 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209603 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209604 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209605 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 209606 closing signal SIGTERM
+ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 0 (pid: 209601) of binary: /usr/bin/python3.10
+Traceback (most recent call last):
+  File "/home/commune/.local/bin/accelerate", line 8, in <module>
+    sys.exit(main())
+  File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main
+    args.func(args)
+  File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 964, in launch_command
+    deepspeed_launcher(args)
+  File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 687, in deepspeed_launcher
+    distrib_run.run(args)
+  File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
+    elastic_launch(
+  File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+=======================================================
+train.py FAILED
+-------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+-------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2023-08-01_01:18:29
+  host      : r1n2a6000bittensor
+  rank      : 0 (local_rank: 0)
+  exitcode  : -9 (pid: 209601)
+  error_file: <N/A>
+  traceback : Signal 9 (SIGKILL) received by PID 209601
+=======================================================
+```
+# Log3
+* I changed the config to use DeepSpeed ZeRO stage 1 (torch dynamo disabled); same error (SIGKILL, exit code -9)
+```
+commune@r1n2a6000bittensor:~/Andromeda$ accelerate config
+[2023-08-01 01:21:26,715] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+-----------------------------------------------------------------------------------------------------------------------------------In which compute environment are you running?
+This machine
+-----------------------------------------------------------------------------------------------------------------------------------Which type of machine are you using?
+multi-GPU
+How many different machines will you use (use more than 1 for multi-node training)? [1]:
+Do you wish to optimize your script with torch dynamo?[yes/NO]:no
+Do you want to use DeepSpeed? [yes/NO]: yes
+Do you want to specify a json file to a DeepSpeed config? [yes/NO]: no
+-----------------------------------------------------------------------------------------------------------------------------------What should be your DeepSpeed's ZeRO optimization stage?
+1
+How many gradient accumulation steps you're passing in your script? [1]:
+Do you want to use gradient clipping? [yes/NO]: no
+Do you want to enable `deepspeed.zero.Init` when using ZeRO Stage-3 for constructing massive models? [yes/NO]: yes
+How many GPU(s) should be used for distributed training? [1]:6
+-----------------------------------------------------------------------------------------------------------------------------------Do you wish to use FP16 or BF16 (mixed precision)?
+fp8
+accelerate configuration saved at /home/commune/.cache/huggingface/accelerate/default_config.yaml
+commune@r1n2a6000bittensor:~/Andromeda$ accelerate launch train.py
+[2023-08-01 01:21:50,336] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+WARNING:torch.distributed.run:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+[I socket.cpp:566] [c10d] The server socket has started to listen on [::]:29500.
+[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:57524.
+[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:57530.
+[2023-08-01 01:21:53,173] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2023-08-01 01:21:53,189] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2023-08-01 01:21:53,237] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+[2023-08-01 01:21:53,367] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2023-08-01 01:21:53,439] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2023-08-01 01:21:53,452] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 210195 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 210197 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 210198 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 210199 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 210200 closing signal SIGTERM
+ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 1 (pid: 210196) of binary: /usr/bin/python3.10
+Traceback (most recent call last):
+  File "/home/commune/.local/bin/accelerate", line 8, in <module>
+    sys.exit(main())
+  File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main
+    args.func(args)
+  File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 964, in launch_command
+    deepspeed_launcher(args)
+  File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 687, in deepspeed_launcher
+    distrib_run.run(args)
+  File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
+    elastic_launch(
+  File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+=======================================================
+train.py FAILED
+-------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+-------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2023-08-01_01:24:23
+  host      : r1n2a6000bittensor
+  rank      : 1 (local_rank: 1)
+  exitcode  : -9 (pid: 210196)
+  error_file: <N/A>
+  traceback : Signal 9 (SIGKILL) received by PID 210196
+=======================================================
+commune@r1n2a6000bittensor:~/Andromeda$
+```
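+# Log4
+* No DeepSpeed at all; instead FullyShardedDataParallel with SHARD_GRAD_OP, TRANSFORMER_BASED_WRAP, and SHARDED_STATE_DICT.
+  This run fails with a different error: `train.py` accesses `state.deepspeed_plugin.deepspeed_config`, but `deepspeed_plugin` is `None` when DeepSpeed is disabled.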
+```
+commune@r1n2a6000bittensor:~/Andromeda$ accelerate config
+[2023-08-01 01:25:09,849] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+-----------------------------------------------------------------------------------------------------------------------------------In which compute environment are you running?
+This machine
+-----------------------------------------------------------------------------------------------------------------------------------Which type of machine are you using?
+multi-GPU
+How many different machines will you use (use more than 1 for multi-node training)? [1]:
+Do you wish to optimize your script with torch dynamo?[yes/NO]:
+Do you want to use DeepSpeed? [yes/NO]:
+Do you want to use FullyShardedDataParallel? [yes/NO]: yes
+-----------------------------------------------------------------------------------------------------------------------------------What should be your sharding strategy?
+SHARD_GRAD_OP
+Do you want to offload parameters and gradients to CPU? [yes/NO]: yes
+-----------------------------------------------------------------------------------------------------------------------------------What should be your auto wrap policy?
+TRANSFORMER_BASED_WRAP
+Specify the comma-separated list of transformer layer class names (case-sensitive) to wrap ,e.g, :`BertLayer`, `GPTJBlock`, `T5Block`, `BertLayer,BertEmbeddings,BertSelfOutput` ...? :
+-----------------------------------------------------------------------------------------------------------------------------------What should be your FSDP's backward prefetch policy?
+BACKWARD_PRE
+-----------------------------------------------------------------------------------------------------------------------------------What should be your FSDP's state dict type?
+SHARDED_STATE_DICT
+Do you want to enable FSDP's forward prefetch policy? [yes/NO]: yes
+Do you want to enable FSDP's `use_orig_params` feature? [yes/NO]: yes
+Do you want each individually wrapped FSDP unit to broadcast module parameters from rank 0 at the start? [yes/NO]:
+How many GPU(s) should be used for distributed training? [1]:
+-----------------------------------------------------------------------------------------------------------------------------------Do you wish to use FP16 or BF16 (mixed precision)?
+fp8
+accelerate configuration saved at /home/commune/.cache/huggingface/accelerate/default_config.yaml
+commune@r1n2a6000bittensor:~/Andromeda$ accelerate launch train.py
+[2023-08-01 01:25:47,200] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+[I socket.cpp:566] [c10d] The server socket has started to listen on [::]:29500.
+[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:47910.
+[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:47916.
+[2023-08-01 01:25:49,991] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.4) or chardet (4.0.0) doesn't match a supported version!
+  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
+Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
+[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:45082.
+[I socket.cpp:787] [c10d] The client socket has connected to [localhost]:29500 on [localhost]:45084.
+[I ProcessGroupNCCL.cpp:665] [Rank 0] ProcessGroupNCCL initialized with following options:
+NCCL_ASYNC_ERROR_HANDLING: 1
+NCCL_DESYNC_DEBUG: 0
+NCCL_BLOCKING_WAIT: 0
+TIMEOUT(ms): 1800000
+USE_HIGH_PRIORITY_STREAM: 0
+[I ProcessGroupNCCL.cpp:842] [Rank 0] NCCL watchdog thread started!
+Traceback (most recent call last):
+  File "/home/commune/Andromeda/train.py", line 705, in <module>
+    main()
+  File "/home/commune/Andromeda/train.py", line 702, in main
+    Train()
+  File "/home/commune/Andromeda/train.py", line 484, in Train
+    state.deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = CFG.BATCH_SIZE #??????
+AttributeError: 'NoneType' object has no attribute 'deepspeed_config'
+[I ProcessGroupNCCL.cpp:844] [Rank 0] NCCL watchdog thread terminated normally
+ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 210780) of binary: /usr/bin/python3.10
+Traceback (most recent call last):
+  File "/home/commune/.local/bin/accelerate", line 8, in <module>
+    sys.exit(main())
+  File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main
+    args.func(args)
+  File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 966, in launch_command
+    multi_gpu_launcher(args)
+  File "/home/commune/.local/lib/python3.10/site-packages/accelerate/commands/launch.py", line 646, in multi_gpu_launcher
+    distrib_run.run(args)
+  File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
+    elastic_launch(
+  File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/home/commune/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+============================================================
+train.py FAILED
+------------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2023-08-01_01:29:53
+  host      : r1n2a6000bittensor
+  rank      : 0 (local_rank: 0)
+  exitcode  : 1 (pid: 210780)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+============================================================
+commune@r1n2a6000bittensor:~/Andromeda$
+```

FailureAnalysis/OptimizerDict.md ADDED Viewed

	@@ -0,0 +1,238 @@

+ata: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 180M/180M [00:03<00:00, 46.3MB/s]
+Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 180M/180M [00:04<00:00, 37.2MB/s]
+Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 180M/180M [00:03<00:00, 47.5MB/s]
+Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 180M/180M [00:03<00:00, 46.0MB/s]
+Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 180M/180M [00:04<00:00, 41.2MB/s]
+Downloading data files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1Downloading data files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [27:46<00:00, 1666.10s/it]
+Extracting data files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.85it/s]
+Dataset parquet downloaded and prepared to /home/commune/.cache/huggingface/datasets/conceptofmind___parquet/conceptofmind--c4_0-to-20_neox_with_eos_8k-dd8655ce54e7b6cc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.
+Found cached dataset parquet (/home/commune/.cache/huggingface/datasets/conceptofmind___parquet/conceptofmind--c4_0-to-20_neox_with_eos_8k-dd8655ce54e7b6cc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+Found cached dataset parquet (/home/commune/.cache/huggingface/datasets/conceptofmind___parquet/conceptofmind--c4_0-to-20_neox_with_eos_8k-dd8655ce54e7b6cc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+Found cached dataset parquet (/home/commune/.cache/huggingface/datasets/conceptofmind___parquet/conceptofmind--c4_0-to-20_neox_with_eos_8k-dd8655ce54e7b6cc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+Found cached dataset parquet (/home/commune/.cache/huggingface/datasets/conceptofmind___parquet/conceptofmind--c4_0-to-20_neox_with_eos_8k-dd8655ce54e7b6cc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+Found cached dataset parquet (/home/commune/.cache/huggingface/datasets/conceptofmind___parquet/conceptofmind--c4_0-to-20_neox_with_eos_8k-dd8655ce54e7b6cc/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+[2023-07-24 15:58:13,787] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.9.5, git-hash=unknown, git-branch=unknown
+[2023-07-24 15:58:13,787] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.9.5, git-hash=unknown, git-branch=unknown
+[2023-07-24 15:58:13,787] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2023-07-24 15:58:13,787] [INFO] [comm.py:594:init_distributed] cdb=None
+[2023-07-24 15:58:13,787] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2023-07-24 15:58:13,787] [INFO] [comm.py:594:init_distributed] cdb=None
+[2023-07-24 15:58:13,789] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.9.5, git-hash=unknown, git-branch=unknown
+[2023-07-24 15:58:13,790] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2023-07-24 15:58:13,790] [INFO] [comm.py:594:init_distributed] cdb=None
+[2023-07-24 15:58:13,790] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.9.5, git-hash=unknown, git-branch=unknown
+[2023-07-24 15:58:13,790] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.9.5, git-hash=unknown, git-branch=unknown
+[2023-07-24 15:58:13,790] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2023-07-24 15:58:13,790] [INFO] [comm.py:594:init_distributed] cdb=None
+[2023-07-24 15:58:13,790] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2023-07-24 15:58:13,790] [INFO] [comm.py:594:init_distributed] cdb=None
+[2023-07-24 15:58:13,791] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.9.5, git-hash=unknown, git-branch=unknown
+[2023-07-24 15:58:13,792] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
+[2023-07-24 15:58:13,792] [INFO] [comm.py:594:init_distributed] cdb=None
+[2023-07-24 15:58:17,032] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
+[2023-07-24 15:58:17,035] [INFO] [logging.py:96:log_dist] [Rank 0] Creating ZeRO Offload
+Traceback (most recent call last):
+  File "/home/commune/Andromeda/Andromeda/train.py", line 667, in <module>
+  File "/home/commune/Andromeda/Andromeda/train.py", line 664, in main
+  File "/home/commune/Andromeda/Andromeda/train.py", line 519, in Train
+    beta_2=0.95,
+  File "/home/commune/Andromeda/Andromeda/train.py", line 294, in decoupled_optimizer
+    # Create an empty list to store the names of the LayerNorm and Embedding layer weights with no weight decay.
+AttributeError: 'tuple' object has no attribute 'named_parameters'
+[2023-07-24 15:58:17,268] [INFO] [utils.py:785:see_memory_usage] DeepSpeedZeRoOffload initialize [begin]
+[2023-07-24 15:58:17,268] [INFO] [utils.py:786:see_memory_usage] MA 0.68 GB         Max_MA 0.68 GB         CA 0.69 GB         Max_CA 1 GB
+[2023-07-24 15:58:17,268] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 18.35 GB, percent = 3.6%
+Traceback (most recent call last):
+  File "/home/commune/Andromeda/Andromeda/train.py", line 667, in <module>
+  File "/home/commune/Andromeda/Andromeda/train.py", line 664, in main
+  File "/home/commune/Andromeda/Andromeda/train.py", line 519, in Train
+    beta_2=0.95,
+  File "/home/commune/Andromeda/Andromeda/train.py", line 294, in decoupled_optimizer
+    # Create an empty list to store the names of the LayerNorm and Embedding layer weights with no weight decay.
+AttributeError: 'tuple' object has no attribute 'named_parameters'
+Traceback (most recent call last):
+  File "/home/commune/Andromeda/Andromeda/train.py", line 667, in <module>
+  File "/home/commune/Andromeda/Andromeda/train.py", line 664, in main
+  File "/home/commune/Andromeda/Andromeda/train.py", line 519, in Train
+    beta_2=0.95,
+  File "/home/commune/Andromeda/Andromeda/train.py", line 294, in decoupled_optimizer
+    # Create an empty list to store the names of the LayerNorm and Embedding layer weights with no weight decay.
+AttributeError: 'tuple' object has no attribute 'named_parameters'
+Traceback (most recent call last):
+  File "/home/commune/Andromeda/Andromeda/train.py", line 667, in <module>
+  File "/home/commune/Andromeda/Andromeda/train.py", line 664, in main
+  File "/home/commune/Andromeda/Andromeda/train.py", line 519, in Train
+    beta_2=0.95,
+  File "/home/commune/Andromeda/Andromeda/train.py", line 294, in decoupled_optimizer
+    # Create an empty list to store the names of the LayerNorm and Embedding layer weights with no weight decay.
+AttributeError: 'tuple' object has no attribute 'named_parameters'
+Parameter Offload: Total persistent parameters: 108032 in 490 params
+Traceback (most recent call last):
+  File "/home/commune/Andromeda/Andromeda/train.py", line 667, in <module>
+  File "/home/commune/Andromeda/Andromeda/train.py", line 664, in main
+  File "/home/commune/Andromeda/Andromeda/train.py", line 519, in Train
+    beta_2=0.95,
+  File "/home/commune/Andromeda/Andromeda/train.py", line 294, in decoupled_optimizer
+    # Create an empty list to store the names of the LayerNorm and Embedding layer weights with no weight decay.
+AttributeError: 'tuple' object has no attribute 'named_parameters'
+[2023-07-24 15:58:17,449] [INFO] [utils.py:785:see_memory_usage] DeepSpeedZeRoOffload initialize [end]
+[2023-07-24 15:58:17,450] [INFO] [utils.py:786:see_memory_usage] MA 0.8 GB         Max_MA 0.8 GB         CA 0.8 GB         Max_CA 1 GB
+[2023-07-24 15:58:17,450] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 18.39 GB, percent = 3.7%
+[2023-07-24 15:58:17,451] [INFO] [config.py:960:print] DeepSpeedEngine configuration:
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   activation_checkpointing_config  {
+    "partition_activations": false,
+    "contiguous_memory_optimization": false,
+    "cpu_checkpointing": false,
+    "number_checkpoints": null,
+    "synchronize_checkpoint_boundary": false,
+    "profile": false
+}
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   amp_enabled .................. False
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   amp_params ................... False
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   autotuning_config ............ {
+    "enabled": false,
+    "start_step": null,
+    "end_step": null,
+    "metric_path": null,
+    "arg_mappings": null,
+    "metric": "throughput",
+    "model_info": null,
+    "results_dir": "autotuning_results",
+    "exps_dir": "autotuning_exps",
+    "overwrite": true,
+    "fast": true,
+    "start_profile_step": 3,
+    "end_profile_step": 5,
+    "tuner_type": "gridsearch",
+    "tuner_early_stopping": 5,
+    "tuner_num_trials": 50,
+    "model_info_path": null,
+    "mp_size": 1,
+    "max_train_batch_size": null,
+    "min_train_batch_size": 1,
+    "max_train_micro_batch_size_per_gpu": 1.024000e+03,
+    "min_train_micro_batch_size_per_gpu": 1,
+    "num_tuning_micro_batch_sizes": 3
+}
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   bfloat16_enabled ............. False
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   checkpoint_parallel_write_pipeline  False
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   checkpoint_tag_validation_enabled  True
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   checkpoint_tag_validation_fail  False
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7f06c428a950>
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   communication_data_type ...... None
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   curriculum_enabled_legacy .... False
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   curriculum_params_legacy ..... False
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   data_efficiency_enabled ...... False
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   dataloader_drop_last ......... False
+[2023-07-24 15:58:17,451] [INFO] [config.py:964:print]   disable_allgather ............ False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   dump_state ................... False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   dynamic_loss_scale_args ...... None
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   eigenvalue_enabled ........... False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   eigenvalue_gas_boundary_resolution  1
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   eigenvalue_layer_name ........ bert.encoder.layer
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   eigenvalue_layer_num ......... 0
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   eigenvalue_max_iter .......... 100
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   eigenvalue_stability ......... 1e-06
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   eigenvalue_tol ............... 0.01
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   eigenvalue_verbose ........... False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   elasticity_enabled ........... False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   flops_profiler_config ........ {
+    "enabled": false,
+    "recompute_fwd_factor": 0.0,
+    "profile_step": 1,
+    "module_depth": -1,
+    "top_modules": 1,
+    "detailed": true,
+    "output_file": null
+}
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   fp16_auto_cast ............... True
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   fp16_enabled ................. True
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   fp16_master_weights_and_gradients  False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   global_rank .................. 0
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   grad_accum_dtype ............. None
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   gradient_accumulation_steps .. 1
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   gradient_clipping ............ 0.0
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   gradient_predivide_factor .... 1.0
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   initial_dynamic_scale ........ 65536
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   load_universal_checkpoint .... False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   loss_scale ................... 0
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   memory_breakdown ............. False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   mics_hierarchial_params_gather  False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   mics_shard_size .............. -1
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   nebula_config ................ {
+    "enabled": false,
+    "persistent_storage_path": null,
+    "persistent_time_interval": 100,
+    "num_of_version_in_retention": 2,
+    "enable_nebula_load": true,
+    "load_path": null
+}
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   optimizer_legacy_fusion ...... False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   optimizer_name ............... None
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   optimizer_params ............. None
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   pld_enabled .................. False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   pld_params ................... False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   prescale_gradients ........... False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   scheduler_name ............... None
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   scheduler_params ............. None
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   sparse_attention ............. None
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   sparse_gradients_enabled ..... False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   steps_per_print .............. inf
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   train_batch_size ............. 18
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   train_micro_batch_size_per_gpu  3
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   use_node_local_storage ....... False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   wall_clock_breakdown ......... False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   world_size ................... 6
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   zero_allow_untested_optimizer  False
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=True stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   zero_enabled ................. True
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   zero_force_ds_cpu_optimizer .. True
+[2023-07-24 15:58:17,452] [INFO] [config.py:964:print]   zero_optimization_stage ...... 3
+[2023-07-24 15:58:17,453] [INFO] [config.py:950:print_user_config]   json = {
+    "train_batch_size": 18,
+    "train_micro_batch_size_per_gpu": 3,
+    "gradient_accumulation_steps": 1,
+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {
+            "device": "none",
+            "nvme_path": null
+        },
+        "offload_param": {
+            "device": "none",
+            "nvme_path": null
+        },
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "steps_per_print": inf,
+    "fp16": {
+        "enabled": true,
+        "auto_cast": true
+    },
+    "bf16": {
+        "enabled": false
+    }
+}
+Using stable_adamw optimizer
+Traceback (most recent call last):
+  File "/home/commune/Andromeda/Andromeda/train.py", line 667, in <module>
+  File "/home/commune/Andromeda/Andromeda/train.py", line 664, in main
+  File "/home/commune/Andromeda/Andromeda/train.py", line 519, in Train
+    beta_2=0.95,
+  File "/home/commune/Andromeda/Andromeda/train.py", line 294, in decoupled_optimizer
+    # Create an empty list to store the names of the LayerNorm and Embedding layer weights with no weight decay.
+AttributeError: 'tuple' object has no attribute 'named_parameters'
+Traceback (most recent call last):
+  File "/home/commune/Andromeda/Andromeda/train.py", line 667, in <module>
+  File "/home/commune/Andromeda/Andromeda/train.py", line 664, in main
+  File "/home/commune/Andromeda/Andromeda/train.py", line 519, in Train
+    beta_2=0.95,
+  File "/home/commune/Andromeda/Andromeda/train.py", line 294, in decoupled_optimizer
+    # Create an empty list to store the names of the LayerNorm and Embedding layer weights with no weight decay.
+AttributeError: 'tuple' object has no attribute 'named_parameters'