Upload 1110 files
This view is limited to 50 files because it contains too many changes.
See raw diff
- stf/.DS_Store +0 -0
- stf/089.npz +3 -0
- stf/089.pth +3 -0
- stf/stf-api-alternative/.gitignore +160 -0
- stf/stf-api-alternative/.ipynb_checkpoints/README-checkpoint.md +1 -0
- stf/stf-api-alternative/.ipynb_checkpoints/poetry-checkpoint.lock +0 -0
- stf/stf-api-alternative/.ipynb_checkpoints/pyproject-checkpoint.toml +35 -0
- stf/stf-api-alternative/README.md +1 -0
- stf/stf-api-alternative/poetry.lock +0 -0
- stf/stf-api-alternative/pyproject.toml +35 -0
- stf/stf-api-alternative/pytriton/.flake8 +19 -0
- stf/stf-api-alternative/pytriton/.github/ISSUE_TEMPLATE/bug_report.md +83 -0
- stf/stf-api-alternative/pytriton/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
- stf/stf-api-alternative/pytriton/.github/workflows/stale.yaml +35 -0
- stf/stf-api-alternative/pytriton/.gitignore +330 -0
- stf/stf-api-alternative/pytriton/.pre-commit-config.yaml +76 -0
- stf/stf-api-alternative/pytriton/CHANGELOG.md +239 -0
- stf/stf-api-alternative/pytriton/CONTRIBUTING.md +203 -0
- stf/stf-api-alternative/pytriton/COPYRIGHT +13 -0
- stf/stf-api-alternative/pytriton/LICENSE +174 -0
- stf/stf-api-alternative/pytriton/Makefile +124 -0
- stf/stf-api-alternative/pytriton/README.md +343 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/__init__.py +27 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/__main__.py +218 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/check/__init__.py +14 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/check/add_sub.py +139 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/check/env_checks.py +201 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/check/utils.py +555 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/client/__init__.py +22 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/client/asyncio_utils.py +308 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/client/client.py +2033 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/client/exceptions.py +92 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/client/utils.py +384 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/client/warnings.py +26 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/constants.py +31 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/decorators.py +678 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/exceptions.py +80 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/model_config/__init__.py +17 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/model_config/common.py +93 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/model_config/generator.py +284 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/model_config/model_config.py +43 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/model_config/parser.py +258 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/model_config/tensor.py +57 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/model_config/triton_model_config.py +68 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/models/__init__.py +14 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/models/manager.py +147 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/models/model.py +335 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/proxy/__init__.py +14 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/proxy/communication.py +555 -0
- stf/stf-api-alternative/pytriton/build/lib/pytriton/proxy/data.py +1133 -0
stf/.DS_Store
ADDED
Binary file (6.15 kB)
stf/089.npz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ce3fb07d8d15495eab879b47413c6b86bce114ca9ecd375b45b54777cf0e175
+size 522605028
stf/089.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba4eb3437019d77abed141d60bcb5489b664f494cf965eec0bccf304c3d79b2a
+size 1567401123
stf/stf-api-alternative/.gitignore
ADDED
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
stf/stf-api-alternative/.ipynb_checkpoints/README-checkpoint.md
ADDED
@@ -0,0 +1 @@
+A library that provides the same functionality as stf_api
stf/stf-api-alternative/.ipynb_checkpoints/poetry-checkpoint.lock
ADDED
The diff for this file is too large to render.
See raw diff
stf/stf-api-alternative/.ipynb_checkpoints/pyproject-checkpoint.toml
ADDED
@@ -0,0 +1,35 @@
+[tool.poetry]
+name = "stf-alternative"
+version = "0.1.0"
+description = "alternative version of stf-api"
+authors = ["Kim Minjong <make.dirty.code@gmail.com>"]
+readme = "README.md"
+packages = [
+    {include = "stf_alternative", from="src"}
+]
+
+[tool.poetry.dependencies]
+python = "^3.10"
+librosa = "0.8.1"
+imageio = "2.13.5"
+imageio-ffmpeg = "0.4.5"
+Pillow = "9.1.0"
+tqdm = "4.64.0"
+numpy = "1.22.4"
+addict = "2.4.0"
+scipy = "1.12.0"
+pandas = "1.3.5"
+face_alignment = "1.3.5"
+moviepy = "1.0.3"
+transformers = "4.29.2"
+facenet_pytorch = "2.5.2"
+ffmpeg-python = "^0.2"
+pydub = "^0.25"
+av = "^11.0.0"
+nvidia-pytriton = {extras = ["client"], version = "^0.4.2"}
+asyncstdlib = "^3.10.9"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
stf/stf-api-alternative/README.md
ADDED
@@ -0,0 +1 @@
+A library that provides the same functionality as stf_api
stf/stf-api-alternative/poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
stf/stf-api-alternative/pyproject.toml
ADDED
@@ -0,0 +1,35 @@
+[tool.poetry]
+name = "stf-alternative"
+version = "0.1.0"
+description = "alternative version of stf-api"
+authors = ["Kim Minjong <make.dirty.code@gmail.com>"]
+readme = "README.md"
+packages = [
+    {include = "stf_alternative", from="src"}
+]
+
+[tool.poetry.dependencies]
+python = "^3.10"
+librosa = "0.8.1"
+imageio = "2.13.5"
+imageio-ffmpeg = "0.4.5"
+Pillow = "9.1.0"
+tqdm = "4.64.0"
+numpy = "1.24.4"
+addict = "2.4.0"
+scipy = "1.12.0"
+pandas = "1.3.5"
+face_alignment = "1.3.5"
+moviepy = "1.0.3"
+transformers = "4.29.2"
+facenet_pytorch = "2.5.2"
+ffmpeg-python = "^0.2"
+pydub = "^0.25"
+av = "^11.0.0"
+nvidia-pytriton = {extras = ["client"], version = "^0.4.2"}
+asyncstdlib = "^3.10.9"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
stf/stf-api-alternative/pytriton/.flake8
ADDED
@@ -0,0 +1,19 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+[flake8]
+exclude = docs,experiments,blueprints,pytriton/tritonserver,sandbox
+ignore = E203, E266, E501, W503
+max-line-length = 120
+max-complexity = 18
+select = B,C,D,E,F,W,T,N
stf/stf-api-alternative/pytriton/.github/ISSUE_TEMPLATE/bug_report.md
ADDED
@@ -0,0 +1,83 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Description**
+
+A clear and concise description of the bug.
+
+**To reproduce**
+
+If relevant, add a minimal example so that we can reproduce the error, if necessary, by running the code. For example:
+
+```python
+# server
+from pytriton.decorators import batch
+from pytriton.model_config import ModelConfig, Tensor
+from pytriton.triton import Triton
+
+@batch
+def _infer_fn(**inputs):
+    ...
+    results_dict = model(**inputs)  # ex note: the bug is here, we expect to receive ...
+    ...
+    # note: observing results_dict as dictionary of numpy arrays
+    return results_dict
+
+
+with Triton() as triton:
+    triton.bind(
+        model_name="MyModel",
+        infer_func=_infer_fn,
+        inputs=[
+            Tensor(name="in1", dtype=np.float32, shape=(-1,)),
+            Tensor(name="in2", dtype=np.float32, shape=(-1,)),
+        ],
+        outputs=[
+            Tensor(name="out1", dtype=np.float32, shape=(-1,)),
+            Tensor(name="out2", dtype=np.float32, shape=(-1,)),
+        ],
+        config=ModelConfig(max_batch_size=128),
+    )
+    triton.serve()
+```
+
+```python
+# client
+import numpy as np
+from pytriton.client import ModelClient
+
+batch_size = 2
+in1_batch = np.ones((batch_size, 1), dtype=np.float32)
+in2_batch = np.ones((batch_size, 1), dtype=np.float32)
+
+with ModelClient("localhost", "MyModel") as client:
+    result_batch = client.infer_batch(in1_batch, in2_batch)
+```
+
+**Observed results and expected behavior**
+
+Please describe the observed results as well as the expected results.
+If possible, attach relevant log output to help analyze your problem.
+If an error is raised, please paste the full traceback of the exception.
+
+```
+
+```
+
+**Environment**
+
+- OS/container version: [e.g., container nvcr.io/nvidia/pytorch:23.02-py3 / virtual machine with Ubuntu 22.04]
+- glibc version: [e.g., 2.31; can be checked with `ldd --version`]
+- Python interpreter distribution and version: [e.g., CPython 3.8 / conda 4.7.12 with Python 3.8 environment]
+- pip version: [e.g., 23.1.2]
+- PyTriton version: [e.g., 0.1.4 / custom build from source at commit ______]
+- Deployment details: [e.g., multi-node multi-GPU setup on GKE / multi-GPU single-node setup in Jupyter Notebook]
+
+**Additional context**
+Add any other context about the problem here.
stf/stf-api-alternative/pytriton/.github/ISSUE_TEMPLATE/feature_request.md
ADDED
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
stf/stf-api-alternative/pytriton/.github/workflows/stale.yaml
ADDED
@@ -0,0 +1,35 @@
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+name: 'Close stale issues and PRs'
+on:
+  schedule:
+    - cron: "30 1 * * *"
+jobs:
+  stale:
+    if: github.repository_owner == 'triton-inference-server'
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+    steps:
+      - uses: actions/stale@v8
+        with:
+          days-before-stale: 21
+          days-before-close: 7
+          stale-issue-message: 'This issue is stale because it has been open 21 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
+          stale-pr-message: 'This PR is stale because it has been open 21 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
+          close-issue-message: 'This issue was closed because it has been stalled for 7 days with no activity.'
+          close-pr-message: 'This PR was closed because it has been stalled for 7 days with no activity.'
+          exempt-issue-labels: 'non-stale'
+          exempt-pr-labels: 'non-stale'
stf/stf-api-alternative/pytriton/.gitignore
ADDED
@@ -0,0 +1,330 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Created by https://www.toptal.com/developers/gitignore/api/pycharm+all,visualstudiocode,python,direnv,vim
+# Edit at https://www.toptal.com/developers/gitignore?templates=pycharm+all,visualstudiocode,python,direnv,vim
+
+### direnv ###
+.direnv
+.envrc
+
+### PyCharm+all ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# AWS User-specific
+.idea/**/aws.xml
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn. Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# SonarLint plugin
+.idea/sonarlint/
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+### PyCharm+all Patch ###
+# Ignore everything but code style settings and run configurations
+# that are supposed to be shared within teams.
+
+.idea/*
+
+!.idea/codeStyles
+!.idea/runConfigurations
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+### Vim ###
+# Swap
+[._]*.s[a-v][a-z]
+!*.svg # comment out if you don't need vector files
+[._]*.sw[a-p]
+[._]s[a-rt-v][a-z]
+[._]ss[a-gi-z]
+[._]sw[a-p]
+
+# Session
+Session.vim
+Sessionx.vim
+
+# Temporary
+.netrwhist
+*~
+# Auto-generated tag files
+tags
+# Persistent undo
+[._]*.un~
+
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+
+# End of https://www.toptal.com/developers/gitignore/api/pycharm+all,visualstudiocode,python,direnv,vim
+pytriton/tritonserver
+docs/CHANGELOG.md
+docs/CONTRIBUTING.md
+docs/LICENSE.md
+docs/examples.md
+
+### VisualStudioCode+all ##
+.vscode
+.devcontainer
stf/stf-api-alternative/pytriton/.pre-commit-config.yaml
ADDED
@@ -0,0 +1,76 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+exclude: kubernetes
+repos:
+  - repo: https://github.com/ambv/black
+    rev: 23.11.0
+    hooks:
+      - id: black
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        name: isort (python)
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: check-docstring-first
+      - id: check-executables-have-shebangs
+      - id: check-json
+      - id: check-merge-conflict
+      - id: detect-private-key
+      - id: check-shebang-scripts-are-executable
+      - id: check-toml
+      - id: check-yaml
+      - id: debug-statements
+      - id: end-of-file-fixer
+        types: [python]
+      - id: fix-byte-order-marker
+      - id: no-commit-to-branch
+      - id: requirements-txt-fixer
+      - id: trailing-whitespace
+        exclude: setup.cfg
+      - id: mixed-line-ending
+        args: [--fix=lf]
+  - repo: https://github.com/asottile/pyupgrade
+    rev: v3.15.0
+    hooks:
+      - id: pyupgrade
+        args: [--py36-plus]
+  - repo: https://github.com/pycqa/flake8
+    rev: 6.1.0
+    hooks:
+      - id: flake8
+        additional_dependencies:
+          - flake8-bugbear
+          - flake8-comprehensions
+          - flake8-print
+          - mccabe
+          - pep8-naming
+          - pycodestyle
+          - pyflakes
+  - repo: https://github.com/pycqa/pydocstyle
+    rev: 6.3.0
+    hooks:
+      - id: pydocstyle
+        name: Run pydocstyle
+        args:
+          - --convention=google
+        exclude: '(?:tests|examples)\/.*'
+        additional_dependencies: ['toml']
+  - repo: https://github.com/thlorenz/doctoc
+    rev: v2.2.0
+    hooks:
+      - id: doctoc
+        args: [ --github, --update-only ]
stf/stf-api-alternative/pytriton/CHANGELOG.md
ADDED
@@ -0,0 +1,239 @@
+<!--
+Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Changelog
+
+## 0.4.2 (2023-12-05)
+
+- New: You can create client from existing client instance or model configuration to avoid loading model configuration from server.
+- New: Introduced warning system using the `warnings` module.
+- Fix: Experimental client for decoupled models prevents sending another request, when responses from previous request are not consumed, blocks close until stream is stopped.
+- Fix: Leak of ModelClient during Triton creation
+- Fix: Fixed non-declared project dependencies (removed from use in code or added to package dependencies)
+- Fix: Remote model is being unloaded from Triton when RemoteTriton is closed.
+
+[//]: <> (put here on external component update with short summary what change or link to changelog)
+
+- Version of [Triton Inference Server](https://github.com/triton-inference-server/) embedded in wheel: [2.39.0](https://github.com/triton-inference-server/server/releases/tag/v2.39.0)
+
+## 0.4.1 (2023-11-09)
+
+- New: Place where workspaces with temporary Triton model repositories and communication file sockets can be configured by `$PYTRITON_HOME` environment variable
+- Fix: Recover handling `KeyboardInterrupt` in `triton.serve()`
+- Fix: Remove limit for handling bytes dtype tensors
+- Build scripts update
+  - Added support for arm64 platform builds
+
+[//]: <> (put here on external component update with short summary what change or link to changelog)
+
+- Version of [Triton Inference Server](https://github.com/triton-inference-server/) embedded in wheel: [2.39.0](https://github.com/triton-inference-server/server/releases/tag/v2.39.0)
+
+## 0.4.0 (2023-10-20)
+
+- New: Remote Mode - PyTriton can be used to connect to a remote Triton Inference Server
+  - Introduced RemoteTriton class which can be used to connect to a remote Triton Inference Server
+    running on the same machine, by passing triton url.
+  - Changed Triton lifecycle - now the Triton Inference Server is started while entering the context.
+    This allows to load models dynamically to the running server while calling the bind method.
+    It is still allowed to create Triton instance without entering the context and bind models before starting
+    the server (in this case the models are lazy loaded when calling run or serve method like it worked before).
+  - In RemoteTriton class, calling __enter__ or connect method connects to triton server, so we can safely load models
+    while binding inference functions (if RemoteTriton is used without context manager, models are lazy loaded
+    when calling connect or serve method).
+- Change: `@batch` decorator raises a `ValueError` if any of the outputs have a different batch size than expected.
+- fix: gevent resources leak in ``FuturesModelClient``
+
+[//]: <> (put here on external component update with short summary what change or link to changelog)
+
+- Version of [Triton Inference Server](https://github.com/triton-inference-server/) embedded in wheel: [2.36.0](https://github.com/triton-inference-server/server/releases/tag/v2.36.0)
+
+## 0.3.1 (2023-09-26)
+
+- Change: `KeyboardInterrupt` is now handled in `triton.serve()`. PyTriton hosting scripts return an exit code of 0 instead of 130 when they receive a SIGINT signal.
+- Fix: Addressed potential instability in shared memory management.
+
+[//]: <> (put here on external component update with short summary what change or link to changelog)
+
+- Version of [Triton Inference Server](https://github.com/triton-inference-server/) embedded in wheel: [2.36.0](https://github.com/triton-inference-server/server/releases/tag/v2.36.0)
+
+## 0.3.0 (2023-09-05)
+
+- new: Support for multiple Python versions starting from 3.8+
+- new: Added support for [decoupled models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md) enabling to support streaming models (alpha state)
+- change: Upgraded Triton Inference Server binaries to version 2.36.0. Note that this Triton Inference Server requires glibc 2.35+ or a more recent version.
+
+[//]: <> (put here on external component update with short summary what change or link to changelog)
+
+- Version of [Triton Inference Server](https://github.com/triton-inference-server/) embedded in wheel: [2.36.0](https://github.com/triton-inference-server/server/releases/tag/v2.36.0)
+
+
+## 0.2.5 (2023-08-24)
+
+- new: Allow to execute multiple PyTriton instances in the same process and/or host
+- fix: Invalid flags for Proxy Backend configuration passed to Triton
+
+
+[//]: <> (put here on external component update with short summary what change or link to changelog)
+
+- Version of [Triton Inference Server](https://github.com/triton-inference-server/) embedded in wheel: [2.33.0](https://github.com/triton-inference-server/server/releases/tag/v2.33.0)
+
+## 0.2.4 (2023-08-10)
+
+- new: Introduced `strict` flag in `Triton.bind` which enables data types and shapes validation of inference callable outputs
+  against model config
+- new: `AsyncioModelClient` which works in FastAPI and other async frameworks
+- fix: `FuturesModelClient` do not raise `gevent.exceptions.InvalidThreadUseError`
+- fix: Do not throw TimeoutError if could not connect to server during model verification
+
+[//]: <> (put here on external component update with short summary what change or link to changelog)
+
+- Version of [Triton Inference Server](https://github.com/triton-inference-server/) embedded in wheel: [2.33.0](https://github.com/triton-inference-server/server/releases/tag/v2.33.0)
+
+## 0.2.3 (2023-07-21)
+
+- Improved verification of Proxy Backend environment when running under same Python interpreter
+- Fixed pytriton.__version__ to represent currently installed version
+
+[//]: <> (put here on external component update with short summary what change or link to changelog)
+
+- Version of [Triton Inference Server](https://github.com/triton-inference-server/) embedded in wheel: [2.33.0](https://github.com/triton-inference-server/server/releases/tag/v2.33.0)
+
+## 0.2.2 (2023-07-19)
+
+- Added `inference_timeout_s` parameters to client classes
+- Renamed `PyTritonClientUrlParseError` to `PyTritonClientInvalidUrlError`
+- `ModelClient` and `FuturesModelClient` methods raise `PyTritonClientClosedError` when used after client is closed
+- Pinned tritonclient dependency due to issues with tritonclient >= 2.34 on systems with glibc version lower than 2.34
+- Added warning after Triton Server setup and teardown while using too verbose logging level as it may cause a significant performance drop in model inference
+
+[//]: <> (put here on external component update with short summary what change or link to changelog)
+
+- Version of [Triton Inference Server](https://github.com/triton-inference-server/) embedded in wheel: [2.33.0](https://github.com/triton-inference-server/server/releases/tag/v2.33.0)
+
+## 0.2.1 (2023-06-28)
+
+- Fixed handling `TritonConfig.cache_directory` option - the directory was always overwritten with the default value.
+- Fixed tritonclient dependency - PyTriton need tritonclient supporting http headers and parameters
+- Improved shared memory usage to match 64MB limit (default value for Docker, Kubernetes) reducing the initial size for PyTriton Proxy Backend.
+
+[//]: <> (put here on external component update with short summary what change or link to changelog)
+
+- Version of [Triton Inference Server](https://github.com/triton-inference-server/) embedded in wheel: [2.33.0](https://github.com/triton-inference-server/server/releases/tag/v2.33.0)
+
+## 0.2.0 (2023-05-30)
+
+- Added support for using custom HTTP/gRPC request headers and parameters.
+
+  This change breaks backward compatibility of the inference function signature.
+  The undecorated inference function now accepts a list of `Request` instances instead
+  of a list of dictionaries. The `Request` class contains data for inputs and parameters
+  for combined parameters and headers.
+
+  See [docs/custom_params.md](docs/custom_params.md) for further information
+
+- Added `FuturesModelClient` which enables sending inference requests in a parallel manner.
+- Added displaying documentation link after models are loaded.
+
+[//]: <> (put here on external component update with short summary what change or link to changelog)
+
+- Version of [Triton Inference Server](https://github.com/triton-inference-server/) embedded in wheel: [2.33.0](https://github.com/triton-inference-server/server/releases/tag/v2.33.0)
+
+## 0.1.5 (2023-05-12)
+
+- Improved `pytriton.decorators.group_by_values` function
+  - Modified the function to avoid calling the inference callable on each individual sample when grouping by string/bytes input
+  - Added `pad_fn` argument for easy padding and combining of the inference results
+- Fixed Triton binaries search
+- Improved Workspace management (remove workspace on shutdown)
+
+[//]: <> (put here on external component update with short summary what change or link to changelog)
+
+- Version of external components used during testing:
+  - [Triton Inference Server](https://github.com/triton-inference-server/): 2.29.0
+  - Other component versions depend on the used framework and Triton Inference Server containers versions.
+    Refer to its [support matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
+    for a detailed summary.
+
+## 0.1.4 (2023-03-16)
+
+- Add validation of the model name passed to Triton bind method.
+- Add monkey patching of `InferenceServerClient.__del__` method to prevent unhandled exceptions.
+
+[//]: <> (put here on external component update with short summary what change or link to changelog)
+
+- Version of external components used during testing:
+  - [Triton Inference Server](https://github.com/triton-inference-server/): 2.29.0
+  - Other component versions depend on the used framework and Triton Inference Server containers versions.
+    Refer to its [support matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
+    for a detailed summary.
+
+## 0.1.3 (2023-02-20)
+
+- Fixed getting model config in `fill_optionals` decorator.
+
+[//]: <> (put here on external component update with short summary what change or link to changelog)
+
+- Version of external components used during testing:
+  - [Triton Inference Server](https://github.com/triton-inference-server/): 2.29.0
+  - Other component versions depend on the used framework and Triton Inference Server containers versions.
+    Refer to its [support matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
+    for a detailed summary.
+
+## 0.1.2 (2023-02-14)
+
+- Fixed wheel build to support installations on operating systems with glibc version 2.31 or higher.
+- Updated the documentation on custom builds of the package.
+- Change: TritonContext instance is shared across bound models and contains model_configs dictionary.
+- Fixed support of binding multiple models that uses methods of the same class.
+
+[//]: <> (put here on external component update with short summary what change or link to changelog)
+
+- Version of external components used during testing:
+  - [Triton Inference Server](https://github.com/triton-inference-server/): 2.29.0
+  - Other component versions depend on the used framework and Triton Inference Server containers versions.
+    Refer to its [support matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
+    for a detailed summary.
+
+## 0.1.1 (2023-01-31)
+
+- Change: The `@first_value` decorator has been updated with new features:
+  - Renamed from `@first_values` to `@first_value`
+  - Added a `strict` flag to toggle the checking of equality of values on a single selected input of the request. Default is True
+  - Added a `squeeze_single_values` flag to toggle the squeezing of single value ND arrays to scalars. Default is True
+- Fix: `@fill_optionals` now supports non-batching models
+- Fix: `@first_value` fixed to work with optional inputs
+- Fix: `@group_by_values` fixed to work with string inputs
+- Fix: `@group_by_values` fixed to work per sample-wise
+
+[//]: <> (put here on external component update with short summary what change or link to changelog)
+
+- Version of external components used during testing:
+  - [Triton Inference Server](https://github.com/triton-inference-server/): 2.29.0
+  - Other component versions depend on the used framework and Triton Inference Server containers versions.
+    Refer to its [support matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
+    for a detailed summary.
+
+## 0.1.0 (2023-01-12)
+
+- Initial release of PyTriton
+
+[//]: <> (put here on external component update with short summary what change or link to changelog)
+
+- Version of external components used during testing:
+  - [Triton Inference Server](https://github.com/triton-inference-server/): 2.29.0
+  - Other component versions depend on the used framework and Triton Inference Server containers versions.
+    Refer to its [support matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
+    for a detailed summary.
stf/stf-api-alternative/pytriton/CONTRIBUTING.md
ADDED
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!--
|
2 |
+
Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
|
3 |
+
|
4 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
you may not use this file except in compliance with the License.
|
6 |
+
You may obtain a copy of the License at
|
7 |
+
|
8 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
|
10 |
+
Unless required by applicable law or agreed to in writing, software
|
11 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
See the License for the specific language governing permissions and
|
14 |
+
limitations under the License.
|
15 |
+
-->
|
16 |
+
|
17 |
+
# Contributing
|
18 |
+
|
19 |
+
Contributions are welcome, and they are much appreciated! Every little
|
20 |
+
helps, and we will always give credit.
|
21 |
+
|
22 |
+
## Types of Contributions
|
23 |
+
|
24 |
+
### Report Bugs
|
25 |
+
|
26 |
+
Report bugs at [https://github.com/triton-inference-server/pytriton/issues](https://github.com/triton-inference-server/pytriton/issues).
|
27 |
+
|
28 |
+
When reporting a bug, please include the following information:
|
29 |
+
|
30 |
+
* Your operating system name and version.
|
31 |
+
* Any details about your local setup that might be helpful in troubleshooting.
|
32 |
+
* Detailed steps to reproduce the bug.
|
33 |
+
|
34 |
+
### Fix Bugs
|
35 |
+
|
36 |
+
Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
|
37 |
+
wanted" is open to whoever wants to implement it.
|
38 |
+
|
39 |
+
### Implement Features
|
40 |
+
|
41 |
+
Browse through the GitHub issues for features. Anything tagged with "enhancement" and "help wanted" is open to whoever wants to implement it.
|
42 |
+
|
43 |
+
### Write Documentation
|
44 |
+
|
45 |
PyTriton could always use more documentation, whether as part of the official PyTriton docs, in docstrings, or even on the web in blog posts, articles, and such.

### Submit Feedback

The best way to send feedback is to file an issue at [https://github.com/triton-inference-server/pytriton/issues](https://github.com/triton-inference-server/pytriton/issues).

If you are proposing a feature:

* Explain in detail how it would work.
* Keep the scope as narrow as possible to make it easier to implement.

## Sign your Work

We require that all contributors "sign off" on their commits. This certifies that the contribution is your original work, or that you have the rights to submit it under the same license or a compatible license.

Any contribution that contains commits that are not signed off will not be accepted.

To sign off on a commit, simply use the `--signoff` (or `-s`) option when committing your changes:

```shell
$ git commit -s -m "Add a cool feature."
```

This will append the following to your commit message:

```
Signed-off-by: Your Name <your@email.com>
```

By doing this, you certify the following:

```
Developer Certificate of Origin
Version 1.1

Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
1 Letterman Drive
Suite D4700
San Francisco, CA, 94129

Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.


Developer's Certificate of Origin 1.1

By making a contribution to this project, I certify that:

(a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or

(b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or

(c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it.

(d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved.
```

## Get Started!

### Local Development

Ready to contribute? Here's how to set up `PyTriton` for local development.

1. Fork the `PyTriton` repo on GitHub.
2. Clone your fork locally:

    ```shell
    $ git clone git@github.com:your_name_here/pytriton.git
    ```

3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, here is how to set up your fork for local development:

    ```shell
    $ mkvirtualenv pytriton
    $ cd pytriton/
    ```

    If you do not use the virtualenvwrapper package, you can initialize a virtual environment using the pure Python command:

    ```shell
    $ python -m venv pytriton
    $ cd pytriton/
    $ source bin/activate
    ```

    Once the virtualenv is activated, install the development dependencies:

    ```shell
    $ make install-dev
    ```

4. Extract Triton Server to your environment so you can debug PyTriton while serving some models on Triton:

    ```shell
    $ make extract-triton
    ```

5. Install pre-commit hooks:

    ```shell
    $ pre-commit install
    ```

6. Create a branch for local development:

    ```shell
    $ git checkout -b name-of-your-bugfix-or-feature
    ```

    Now you can make your changes locally.

7. When you're done making changes, check that your changes pass the linters and the tests, including testing other Python versions with tox:

    ```shell
    $ make lint  # will run, among others, the flake8 and pytype linters
    $ make test  # will run the tests on your current virtualenv
    ```

    To run a subset of tests:

    ```shell
    $ pytest tests/test_subset.py
    ```

8. Commit your changes and push your branch to GitHub:

    ```shell
    $ git add .
    $ git commit -s -m "Your detailed description of your changes."
    $ git push origin name-of-your-bugfix-or-feature
    ```

9. Submit a pull request through the GitHub website.

### Pull Request Guidelines

Before you submit a pull request, check that it meets these guidelines:

1. The pull request should include tests.
2. If the pull request adds functionality, you should update the docs. Put your new functionality into a function with a docstring and add the feature to the list in README.md.

## Documentation

Add/update docstrings as defined in the [Google Style Guide](https://github.com/google/styleguide/blob/gh-pages/pyguide.md#38-comments-and-docstrings).
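
For example, a minimal Google-style docstring for a new helper could look like the sketch below. The function, its name, and its arguments are hypothetical and only illustrate the `Args`/`Returns`/`Raises` sections:

```python
import math

import numpy as np


def scale_batch(batch: np.ndarray, factor: float = 1.0) -> np.ndarray:
    """Scale a batch of inputs by a constant factor.

    Args:
        batch: Input array of shape (batch_size, ...).
        factor: Multiplicative factor applied to every element.

    Returns:
        The scaled array, with the same shape as ``batch``.

    Raises:
        ValueError: If ``factor`` is not finite.
    """
    if not math.isfinite(factor):
        raise ValueError("factor must be finite")
    return batch * factor
```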

## Contributor License Agreement (CLA)

PyTriton requires that all contributors (or their corporate entity) send a signed copy of the [Contributor License Agreement](https://github.com/NVIDIA/triton-inference-server/blob/master/Triton-CCLA-v1.pdf) to triton-cla@nvidia.com.

*NOTE*: Contributors with no company affiliation can fill `N/A` in the `Corporation Name` and `Corporation Address` fields.
stf/stf-api-alternative/pytriton/COPYRIGHT
ADDED
@@ -0,0 +1,13 @@
Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
stf/stf-api-alternative/pytriton/LICENSE
ADDED
@@ -0,0 +1,174 @@
1 |
+
Apache License
|
2 |
+
Version 2.0, January 2004
|
3 |
+
http://www.apache.org/licenses/
|
4 |
+
|
5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
6 |
+
|
7 |
+
1. Definitions.
|
8 |
+
|
9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
11 |
+
|
12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
13 |
+
the copyright owner that is granting the License.
|
14 |
+
|
15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
16 |
+
other entities that control, are controlled by, or are under common
|
17 |
+
control with that entity. For the purposes of this definition,
|
18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
19 |
+
direction or management of such entity, whether by contract or
|
20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
22 |
+
|
23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
24 |
+
exercising permissions granted by this License.
|
25 |
+
|
26 |
+
"Source" form shall mean the preferred form for making modifications,
|
27 |
+
including but not limited to software source code, documentation
|
28 |
+
source, and configuration files.
|
29 |
+
|
30 |
+
"Object" form shall mean any form resulting from mechanical
|
31 |
+
transformation or translation of a Source form, including but
|
32 |
+
not limited to compiled object code, generated documentation,
|
33 |
+
and conversions to other media types.
|
34 |
+
|
35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
36 |
+
Object form, made available under the License, as indicated by a
|
37 |
+
copyright notice that is included in or attached to the work
|
38 |
+
(an example is provided in the Appendix below).
|
39 |
+
|
40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
41 |
+
form, that is based on (or derived from) the Work and for which the
|
42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
44 |
+
of this License, Derivative Works shall not include works that remain
|
45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
46 |
+
the Work and Derivative Works thereof.
|
47 |
+
|
48 |
+
"Contribution" shall mean any work of authorship, including
|
49 |
+
the original version of the Work and any modifications or additions
|
50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
54 |
+
means any form of electronic, verbal, or written communication sent
|
55 |
+
to the Licensor or its representatives, including but not limited to
|
56 |
+
communication on electronic mailing lists, source code control systems,
|
57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
59 |
+
excluding communication that is conspicuously marked or otherwise
|
60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
61 |
+
|
62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
64 |
+
subsequently incorporated within the Work.
|
65 |
+
|
66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
71 |
+
Work and such Derivative Works in Source or Object form.
|
72 |
+
|
73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
76 |
+
(except as stated in this section) patent license to make, have made,
|
77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
78 |
+
where such license applies only to those patent claims licensable
|
79 |
+
by such Contributor that are necessarily infringed by their
|
80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
82 |
+
institute patent litigation against any entity (including a
|
83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
84 |
+
or a Contribution incorporated within the Work constitutes direct
|
85 |
+
or contributory patent infringement, then any patent licenses
|
86 |
+
granted to You under this License for that Work shall terminate
|
87 |
+
as of the date such litigation is filed.
|
88 |
+
|
89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
90 |
+
Work or Derivative Works thereof in any medium, with or without
|
91 |
+
modifications, and in Source or Object form, provided that You
|
92 |
+
meet the following conditions:
|
93 |
+
|
94 |
+
(a) You must give any other recipients of the Work or
|
95 |
+
Derivative Works a copy of this License; and
|
96 |
+
|
97 |
+
(b) You must cause any modified files to carry prominent notices
|
98 |
+
stating that You changed the files; and
|
99 |
+
|
100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
101 |
+
that You distribute, all copyright, patent, trademark, and
|
102 |
+
attribution notices from the Source form of the Work,
|
103 |
+
excluding those notices that do not pertain to any part of
|
104 |
+
the Derivative Works; and
|
105 |
+
|
106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
107 |
+
distribution, then any Derivative Works that You distribute must
|
108 |
+
include a readable copy of the attribution notices contained
|
109 |
+
within such NOTICE file, excluding those notices that do not
|
110 |
+
pertain to any part of the Derivative Works, in at least one
|
111 |
+
of the following places: within a NOTICE text file distributed
|
112 |
+
as part of the Derivative Works; within the Source form or
|
113 |
+
documentation, if provided along with the Derivative Works; or,
|
114 |
+
within a display generated by the Derivative Works, if and
|
115 |
+
wherever such third-party notices normally appear. The contents
|
116 |
+
of the NOTICE file are for informational purposes only and
|
117 |
+
do not modify the License. You may add Your own attribution
|
118 |
+
notices within Derivative Works that You distribute, alongside
|
119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
120 |
+
that such additional attribution notices cannot be construed
|
121 |
+
as modifying the License.
|
122 |
+
|
123 |
+
You may add Your own copyright statement to Your modifications and
|
124 |
+
may provide additional or different license terms and conditions
|
125 |
+
for use, reproduction, or distribution of Your modifications, or
|
126 |
+
for any such Derivative Works as a whole, provided Your use,
|
127 |
+
reproduction, and distribution of the Work otherwise complies with
|
128 |
+
the conditions stated in this License.
|
129 |
+
|
130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
132 |
+
by You to the Licensor shall be under the terms and conditions of
|
133 |
+
this License, without any additional terms or conditions.
|
134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
135 |
+
the terms of any separate license agreement you may have executed
|
136 |
+
with Licensor regarding such Contributions.
|
137 |
+
|
138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
140 |
+
except as required for reasonable and customary use in describing the
|
141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
142 |
+
|
143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
144 |
+
agreed to in writing, Licensor provides the Work (and each
|
145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
147 |
+
implied, including, without limitation, any warranties or conditions
|
148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
150 |
+
appropriateness of using or redistributing the Work and assume any
|
151 |
+
risks associated with Your exercise of permissions under this License.
|
152 |
+
|
153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
154 |
+
whether in tort (including negligence), contract, or otherwise,
|
155 |
+
unless required by applicable law (such as deliberate and grossly
|
156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
157 |
+
liable to You for damages, including any direct, indirect, special,
|
158 |
+
incidental, or consequential damages of any character arising as a
|
159 |
+
result of this License or out of the use or inability to use the
|
160 |
+
Work (including but not limited to damages for loss of goodwill,
|
161 |
+
work stoppage, computer failure or malfunction, or any and all
|
162 |
+
other commercial damages or losses), even if such Contributor
|
163 |
+
has been advised of the possibility of such damages.
|
164 |
+
|
165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
168 |
+
or other liability obligations and/or rights consistent with this
|
169 |
+
License. However, in accepting such obligations, You may act only
|
170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
171 |
+
of any other Contributor, and only if You agree to indemnify,
|
172 |
+
defend, and hold each Contributor harmless for any liability
|
173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
174 |
+
of your accepting any such warranty or additional liability.
|
stf/stf-api-alternative/pytriton/Makefile
ADDED
@@ -0,0 +1,124 @@
1 |
+
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
.PHONY: clean clean-build clean-tritonserver clean-pyc clean-docs clean-test docs lint test coverage release dist build-triton extract-triton install install-dev help
|
15 |
+
.DEFAULT_GOAL := help
|
16 |
+
|
17 |
+
define BROWSER_PYSCRIPT
|
18 |
+
import os, webbrowser, sys
|
19 |
+
|
20 |
+
from urllib.request import pathname2url
|
21 |
+
|
22 |
+
webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
|
23 |
+
endef
|
24 |
+
export BROWSER_PYSCRIPT
|
25 |
+
|
26 |
+
define PRINT_HELP_PYSCRIPT
|
27 |
+
import re, sys
|
28 |
+
|
29 |
+
for line in sys.stdin:
|
30 |
+
match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
|
31 |
+
if match:
|
32 |
+
target, help = match.groups()
|
33 |
+
print("%-20s %s" % (target, help))
|
34 |
+
endef
|
35 |
+
export PRINT_HELP_PYSCRIPT
|
36 |
+
|
37 |
+
BROWSER := python -c "$$BROWSER_PYSCRIPT"
|
38 |
+
PIP_INSTALL := pip install --extra-index-url https://pypi.ngc.nvidia.com
|
39 |
+
TRITONSERVER_IMAGE_VERSION = 23.10
|
40 |
+
TRITONSERVER_IMAGE_NAME = nvcr.io/nvidia/tritonserver:$(TRITONSERVER_IMAGE_VERSION)-pyt-python-py3
|
41 |
+
TRITONSERVER_OUTPUT_DIR = ${PWD}/pytriton/tritonserver
|
42 |
+
TRITONSERVER_BASENAME = pytriton
|
43 |
+
PYTRITON_IMAGE_NAME = $(TRITONSERVER_BASENAME):$(TRITONSERVER_IMAGE_VERSION)
|
44 |
+
# to set PLATFORM from outside, use: make PLATFORM=linux/arm64;
|
45 |
+
# correct values are: linux/amd64 (default), linux/arm64
|
46 |
+
PLATFORM=linux/amd64
|
47 |
+
|
48 |
+
help:
|
49 |
+
@python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
|
50 |
+
|
51 |
+
clean: clean-build clean-pyc clean-test clean-tritonserver clean-docs ## remove all build, tritonserver, test, docs, coverage and Python artifacts
|
52 |
+
|
53 |
+
clean-build: ## remove build artifacts
|
54 |
+
rm -fr build/
|
55 |
+
rm -fr dist/
|
56 |
+
rm -fr .eggs/
|
57 |
+
find . -name '*.egg-info' -exec rm -fr {} +
|
58 |
+
find . -name '*.egg' -exec rm -f {} +
|
59 |
+
|
60 |
+
clean-tritonserver:
|
61 |
+
rm -fr pytriton/tritonserver
|
62 |
+
|
63 |
+
clean-pyc: ## remove Python file artifacts
|
64 |
+
find . -name '*.pyc' -exec rm -f {} +
|
65 |
+
find . -name '*.pyo' -exec rm -f {} +
|
66 |
+
find . -name '*~' -exec rm -f {} +
|
67 |
+
find . -name '__pycache__' -exec rm -fr {} +
|
68 |
+
|
69 |
+
clean-docs: ## remove docs artifacts
|
70 |
+
rm -rf site
|
71 |
+
|
72 |
+
clean-test: ## remove test and coverage artifacts
|
73 |
+
rm -fr .tox/
|
74 |
+
rm -f .coverage
|
75 |
+
rm -fr htmlcov/
|
76 |
+
rm -fr .pytest_cache
|
77 |
+
rm -fr .pytype/
|
78 |
+
|
79 |
+
docs: clean-docs ## generate site
|
80 |
+
cp CHANGELOG.md docs
|
81 |
+
cp CONTRIBUTING.md docs
|
82 |
+
cp LICENSE docs/LICENSE.md
|
83 |
+
cp examples/README.md docs/examples.md
|
84 |
+
mkdocs build --clean
|
85 |
+
|
86 |
+
docs-serve: docs
|
87 |
+
mkdocs serve
|
88 |
+
|
89 |
+
lint: ## check style with pre-commit and pytype
|
90 |
+
tox -e pytype,pre-commit --develop
|
91 |
+
|
92 |
+
test: ## run tests on every Python version with tox
|
93 |
+
tox --develop --skip-missing-interpreters
|
94 |
+
|
95 |
+
coverage: ## check code coverage quickly with the default Python
|
96 |
+
coverage run --source pytriton -m pytest
|
97 |
+
coverage report -m
|
98 |
+
coverage html
|
99 |
+
$(BROWSER) htmlcov/index.html
|
100 |
+
|
101 |
+
dist: clean build-triton extract-triton ## builds source and wheel package
|
102 |
+
bash ./scripts/build_wheel.sh $(PLATFORM)
|
103 |
+
ls -lh dist
|
104 |
+
find ./dist -iname *-linux*.whl -type f -exec bash ./scripts/add_libs_to_wheel.sh $(PYTRITON_IMAGE_NAME) $(TRITONSERVER_OUTPUT_DIR) {} $(PLATFORM) \;
|
105 |
+
find ./dist -iname *-linux*.whl -type f -delete
|
106 |
+
ls -lh dist
|
107 |
+
twine check dist/*
|
108 |
+
|
109 |
+
build-triton: ## build Triton with Python Stubs
|
110 |
+
bash ./scripts/build_triton.sh $(TRITONSERVER_IMAGE_NAME) $(PYTRITON_IMAGE_NAME) $(PLATFORM)
|
111 |
+
echo "export PYTRITON_IMAGE_NAME=$(PYTRITON_IMAGE_NAME)" > .env
|
112 |
+
|
113 |
+
extract-triton: build-triton ## extract Triton binaries and libraries
|
114 |
+
# changing dst path, change also in clean-build and pyproject.toml
|
115 |
+
bash ./scripts/extract_triton.sh $(PYTRITON_IMAGE_NAME) $(TRITONSERVER_OUTPUT_DIR) $(PLATFORM)
|
116 |
+
|
117 |
+
|
118 |
+
install: clean extract-triton ## install the package to the active Python's site-packages
|
119 |
+
$(PIP_INSTALL) --upgrade pip
|
120 |
+
$(PIP_INSTALL) .
|
121 |
+
|
122 |
+
install-dev: clean-build clean-pyc
|
123 |
+
$(PIP_INSTALL) --upgrade pip
|
124 |
+
$(PIP_INSTALL) -e .[dev]
|
stf/stf-api-alternative/pytriton/README.md
ADDED
@@ -0,0 +1,343 @@
1 |
+
<!--
|
2 |
+
Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
3 |
+
|
4 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
you may not use this file except in compliance with the License.
|
6 |
+
You may obtain a copy of the License at
|
7 |
+
|
8 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
|
10 |
+
Unless required by applicable law or agreed to in writing, software
|
11 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
See the License for the specific language governing permissions and
|
14 |
+
limitations under the License.
|
15 |
+
-->
|
16 |
+
|
17 |
+
# PyTriton
|
18 |
+
|
19 |
+
PyTriton is a Flask/FastAPI-like interface that simplifies Triton's deployment in Python environments.
|
20 |
+
The library allows serving Machine Learning models directly from Python through
|
21 |
+
NVIDIA's [Triton Inference Server](https://github.com/triton-inference-server).
|
22 |
+
|
23 |
+
<!-- START doctoc generated TOC please keep comment here to allow auto update -->
|
24 |
+
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
|
25 |
+
|
26 |
+
- [Documentation](#documentation)
|
27 |
+
- [Feature matrix](#feature-matrix)
|
28 |
+
- [How it works?](#how-it-works)
|
29 |
+
- [Installation](#installation)
|
30 |
+
- [Prerequisites](#prerequisites)
|
31 |
+
- [Install from `pypi`](#install-from-pypi)
|
32 |
+
- [Setting Up Python Environment](#setting-up-python-environment)
|
33 |
+
- [Building binaries from source](#building-binaries-from-source)
|
34 |
+
- [Quick Start](#quick-start)
|
35 |
+
- [Architecture](#architecture)
|
36 |
+
- [Examples](#examples)
|
37 |
+
- [Streaming (alpha)](#streaming-alpha)
|
38 |
+
- [Profiling model](#profiling-model)
|
39 |
+
- [Version management](#version-management)
|
40 |
+
- [Useful Links](#useful-links)
|
41 |
+
|
42 |
+
<!-- END doctoc generated TOC please keep comment here to allow auto update -->
|
43 |
+
|
44 |
+
## Documentation
|
45 |
+
|
46 |
+
Details on how to customize the Triton Inference Server, load models, deploy on clusters, and the API reference can be found in the [documentation](https://triton-inference-server.github.io/pytriton). The sections below provide brief information about the product and a quick start guide.
|
49 |
+
|
50 |
+
## Feature matrix
|
51 |
+
|
52 |
+
| Feature | Description |
|
53 |
+
| ------- | ----------- |
|
54 |
+
| Native Python support | You can create any Python function and expose it as an HTTP/gRPC API. |
|
55 |
+
| Framework-agnostic | You can run any Python code with any framework of your choice, such as: PyTorch, TensorFlow, or JAX. |
|
56 |
+
| Performance optimization | You can benefit from dynamic batching, response cache, model pipelining, and GPU/CPU inference. |
|
57 |
+
| Easy installation and setup | You can use a simple and familiar interface based on Flask/FastAPI for easy installation and setup. |
|
58 |
+
| Model clients | You can access high-level model clients for HTTP/gRPC requests with configurable options and both synchronous and asynchronous API. |
|
59 |
+
| Streaming (alpha) | You can stream partial responses from a model by serving it in a decoupled mode. |
|
60 |
+
|
61 |
+
## How it works?
|
62 |
+
|
63 |
+
In PyTriton, like in Flask or FastAPI, you can define any Python function that executes a Machine Learning model prediction and exposes
|
64 |
+
it through an HTTP/gRPC API. PyTriton installs Triton Inference Server in your environment and uses it for handling
|
65 |
+
HTTP/gRPC requests and responses. Our library provides a Python API that allows you to attach a Python function to Triton
|
66 |
+
and a communication layer to send/receive data between Triton and the function. The solution enables using the
|
67 |
+
performance features of Triton Inference Server, such as dynamic batching or response cache, without changing your model
|
68 |
+
environment. Thus, it improves the performance of running inference on GPU for models implemented in Python. The solution is
|
69 |
+
framework-agnostic and can be used along with frameworks like PyTorch, TensorFlow, or JAX.
|
70 |
+
|
71 |
+
## Installation
|
72 |
+
|
73 |
+
We assume that you are comfortable with the Python programming language and familiar with Machine Learning models.
|
74 |
+
Using [Docker](https://www.docker.com/) is an option, but not mandatory.
|
75 |
+
|
76 |
+
The library can be installed in:
|
77 |
+
|
78 |
+
- system environment
|
79 |
+
- virtualenv
|
80 |
+
- [Docker](https://www.docker.com/) image
|
81 |
+
|
82 |
+
NVIDIA optimized Docker images for Python frameworks can be obtained from the [NVIDIA NGC Catalog](https://catalog.ngc.nvidia.com/containers).
|
83 |
+
|
84 |
+
If you want to use the Docker runtime, we recommend that you install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/overview.html) to
|
85 |
+
enable running model inference on NVIDIA GPU.
|
86 |
+
|
87 |
+
### Prerequisites
|
88 |
+
|
89 |
+
Before installing the library, ensure that you meet the following requirements:
|
90 |
+
|
91 |
+
- An operating system with glibc >= `2.35`.
|
92 |
+
- Triton Inference Server and PyTriton have **only** been rigorously tested on Ubuntu 22.04.
|
93 |
+
- Other supported operating systems include Debian 11+, Rocky Linux 9+, and Red Hat Universal Base Image 9+.
|
94 |
+
- To check your glibc version, run `ldd --version`
|
95 |
+
- Python version >= `3.8`
|
96 |
+
- Use `pip >= 20.3`
|
97 |
+
- Install `libpython3.*.so` in the operating system (appropriate for Python version).
|
98 |
+
|
99 |
+
### Install from `pypi`
|
100 |
+
|
101 |
+
The PyTriton can be installed from [pypi.org](https://pypi.org/project/nvidia-pytriton/) by running the following command:
|
102 |
+
|
103 |
+
```shell
|
104 |
+
pip install -U nvidia-pytriton
|
105 |
+
```
|
106 |
+
|
107 |
+
**Important**: The Triton Inference Server binary is installed as part of the PyTriton package.
|
108 |
+
|
109 |
+
More details about installation can be found in the [documentation](https://triton-inference-server.github.io/pytriton/latest/installation/).
|
110 |
+
|
111 |
+
|
112 |
+
### Setting Up Python Environment
|
113 |
+
|
114 |
+
PyTriton requires `libpython3.*.so` to be installed and linked. Read more in "[Setting Up Python Environment](https://triton-inference-server.github.io/pytriton/latest/installation#setting-up-python-environment)" for additional information on how to configure the system for different Python versions.
|
116 |
+
|
117 |
+
### Building binaries from source
|
118 |
+
|
119 |
+
The binary package can be built from the source, allowing access to unreleased hotfixes, the ability to modify the PyTriton code, and compatibility with various Triton Inference Server versions, including custom server builds.
|
120 |
+
For further information on building the PyTriton binary, refer to the [Building](https://triton-inference-server.github.io/pytriton/latest/building/) page of documentation.
|
121 |
+
|
122 |
+
## Quick Start
|
123 |
+
|
124 |
+
The quick start shows how to run a Python model in Triton Inference Server without needing to change the current working environment. The example uses a simple `Linear` PyTorch model.
|
126 |
+
|
127 |
+
The example requires PyTorch to be installed in your environment. You can install it by running:
|
128 |
+
|
129 |
+
```shell
|
130 |
+
pip install torch
|
131 |
+
```
|
132 |
+
|
133 |
+
Integrating a model requires the following elements:

- The model - a framework or Python model or function that handles inference requests
- Inference callback - a lambda or function that handles the input data coming from Triton and returns the result
- Python function connection with Triton Inference Server - a binding for communication between Triton and the Python callback
|
139 |
+
|
140 |
+
In the next step, define the `Linear` model:
|
141 |
+
|
142 |
+
```python
|
143 |
+
import torch
|
144 |
+
|
145 |
+
model = torch.nn.Linear(2, 3).to("cuda").eval()
|
146 |
+
```
|
147 |
+
|
148 |
+
In the second step, create an inference callable as a function. The function obtains the HTTP/gRPC request data as an argument, which should be in the form of a NumPy array. The expected return object should also be a NumPy array. You can define an inference callable as a function that uses the `@batch` decorator from PyTriton. This decorator converts the input request into a more suitable format that can be directly passed to the model. You can read more about [decorators here](docs/decorators.md).
|
149 |
+
|
150 |
+
Example implementation:
|
151 |
+
|
152 |
+
<!--pytest-codeblocks:cont-->
|
153 |
+
|
154 |
+
```python
|
155 |
+
import numpy as np
|
156 |
+
from pytriton.decorators import batch
|
157 |
+
|
158 |
+
|
159 |
+
@batch
|
160 |
+
def infer_fn(**inputs: np.ndarray):
|
161 |
+
(input1_batch,) = inputs.values()
|
162 |
+
input1_batch_tensor = torch.from_numpy(input1_batch).to("cuda")
|
163 |
+
output1_batch_tensor = model(input1_batch_tensor) # Calling the Python model inference
|
164 |
+
output1_batch = output1_batch_tensor.cpu().detach().numpy()
|
165 |
+
return [output1_batch]
|
166 |
+
```
|
167 |
+
|
168 |
+
In the next step, you can create the binding between the inference callable and Triton Inference Server using the `bind` method from PyTriton. This method takes the model name, the inference callable, the input and output tensors, and an optional model configuration object.
|
169 |
+
|
170 |
+
<!--pytest-codeblocks:cont-->
|
171 |
+
|
172 |
+
```python
|
173 |
+
from pytriton.model_config import ModelConfig, Tensor
|
174 |
+
from pytriton.triton import Triton
|
175 |
+
|
176 |
+
# Connecting inference callable with Triton Inference Server
|
177 |
+
with Triton() as triton:
|
178 |
+
# Load model into Triton Inference Server
|
179 |
+
triton.bind(
|
180 |
+
model_name="Linear",
|
181 |
+
infer_func=infer_fn,
|
182 |
+
inputs=[
|
183 |
+
Tensor(dtype=np.float32, shape=(-1,)),
|
184 |
+
],
|
185 |
+
outputs=[
|
186 |
+
Tensor(dtype=np.float32, shape=(-1,)),
|
187 |
+
],
|
188 |
+
config=ModelConfig(max_batch_size=128)
|
189 |
+
)
|
190 |
+
...
|
191 |
+
```
|
192 |
+
|
193 |
+
Finally, serve the model with the Triton Inference Server:
|
194 |
+
|
195 |
+
<!--pytest.mark.skip-->
|
196 |
+
|
197 |
+
```python
|
198 |
+
from pytriton.triton import Triton
|
199 |
+
|
200 |
+
with Triton() as triton:
|
201 |
+
... # Load models here
|
202 |
+
triton.serve()
|
203 |
+
```
|
204 |
+
|
205 |
+
The `bind` method creates a connection between the Triton Inference Server and the `infer_fn`, which handles the inference queries. The `inputs` and `outputs` describe the model inputs and outputs that are exposed in Triton. The `config` field allows you to provide more parameters for model deployment.
|
208 |
+
|
209 |
+
The `serve` method is blocking, and at this point, the application waits for incoming HTTP/gRPC requests. From that moment, the model is available under the name `Linear` in the Triton server. Inference queries can be sent to `localhost:8000/v2/models/Linear/infer` and are passed to the `infer_fn` function.
|
212 |
+
|
213 |
+
If you would like to use Triton in background mode, use `run`, as sketched below. More about that can be found in the [Deploying Models](https://triton-inference-server.github.io/pytriton/latest/initialization/) page.
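
A rough sketch of background mode, assuming the `infer_fn` inference callable defined above; `run()` returns immediately, and `stop()` shuts the server down when you are done (not a complete application, error handling omitted):

```python
import numpy as np

from pytriton.client import ModelClient
from pytriton.model_config import ModelConfig, Tensor
from pytriton.triton import Triton

# Start Triton in the background instead of blocking with serve()
triton = Triton()
triton.bind(
    model_name="Linear",
    infer_func=infer_fn,  # the inference callable defined earlier in this quick start
    inputs=[Tensor(dtype=np.float32, shape=(-1,))],
    outputs=[Tensor(dtype=np.float32, shape=(-1,))],
    config=ModelConfig(max_batch_size=128),
)
triton.run()  # non-blocking; the server keeps serving in the background

# The application can now do other work, for example query the model in-process
with ModelClient("localhost:8000", "Linear") as client:
    result_dict = client.infer_batch(np.random.randn(8, 2).astype(np.float32))
    print(result_dict)

triton.stop()  # shut the server down when done
```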
|
215 |
+
|
216 |
+
Once the `serve` or `run` method is called on the `Triton` object, the server status can be obtained using:
|
217 |
+
|
218 |
+
<!--pytest.mark.skip-->
|
219 |
+
|
220 |
+
```shell
|
221 |
+
curl -v localhost:8000/v2/health/live
|
222 |
+
```
|
223 |
+
|
224 |
+
The model is loaded right after the server starts, and its status can be queried using:
|
225 |
+
|
226 |
+
<!--pytest.mark.skip-->
|
227 |
+
|
228 |
+
```shell
|
229 |
+
curl -v localhost:8000/v2/models/Linear/ready
|
230 |
+
```
|
231 |
+
|
232 |
+
Finally, you can send an inference query to the model:
|
233 |
+
|
234 |
+
<!--pytest.mark.skip-->
|
235 |
+
|
236 |
+
```shell
|
237 |
+
curl -X POST \
|
238 |
+
-H "Content-Type: application/json" \
|
239 |
+
-d @input.json \
|
240 |
+
localhost:8000/v2/models/Linear/infer
|
241 |
+
```
|
242 |
+
|
243 |
+
The `input.json` with sample query:
|
244 |
+
|
245 |
+
```json
|
246 |
+
{
|
247 |
+
"id": "0",
|
248 |
+
"inputs": [
|
249 |
+
{
|
250 |
+
"name": "INPUT_1",
|
251 |
+
"shape": [1, 2],
|
252 |
+
"datatype": "FP32",
|
253 |
+
"parameters": {},
|
254 |
+
"data": [[-0.04281254857778549, 0.6738349795341492]]
|
255 |
+
}
|
256 |
+
]
|
257 |
+
}
|
258 |
+
```
|
259 |
+
|
260 |
+
Read more about the HTTP/gRPC interface in the Triton Inference Server
|
261 |
+
[documentation](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#httprest-and-grpc-protocols).
|
262 |
+
|
263 |
+
You can also validate the deployed model using a simple client that can perform inference requests:
|
264 |
+
|
265 |
+
<!--pytest.mark.skip-->
|
266 |
+
|
267 |
+
```python
|
268 |
+
import torch
|
269 |
+
from pytriton.client import ModelClient
|
270 |
+
|
271 |
+
input1_data = torch.randn(128, 2).cpu().detach().numpy()
|
272 |
+
|
273 |
+
with ModelClient("localhost:8000", "Linear") as client:
|
274 |
+
result_dict = client.infer_batch(input1_data)
|
275 |
+
|
276 |
+
print(result_dict)
|
277 |
+
```
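
Besides `infer_batch`, `ModelClient` also exposes `infer_sample` for sending a single sample without an explicit batch dimension. A minimal sketch against the `Linear` model above, assuming the server is already running:

```python
import numpy as np

from pytriton.client import ModelClient

# One sample of shape (2,); the client handles batching for batched models
sample = np.array([0.1, -0.2], dtype=np.float32)

with ModelClient("localhost:8000", "Linear") as client:
    result_dict = client.infer_sample(sample)

print(result_dict)
```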
|
278 |
+
|
279 |
+
The full example code can be found in [examples/linear_random_pytorch](examples/linear_random_pytorch).
|
280 |
+
|
281 |
+
You can learn more about client usage in the [Clients](https://triton-inference-server.github.io/pytriton/latest/clients/) document.
|
282 |
+
|
283 |
+
More information about running the server and models can be found
|
284 |
+
in [Deploying Models](https://triton-inference-server.github.io/pytriton/latest/initialization/) page of documentation.
|
285 |
+
|
286 |
+
## Architecture
|
287 |
+
|
288 |
+
The diagram below presents the schema of how the Python models are served through Triton Inference Server using
|
289 |
+
PyTriton. The solution consists of two main components:
|
290 |
+
|
291 |
+
- Triton Inference Server: for exposing the HTTP/gRPC API and benefiting from performance features like dynamic batching
|
292 |
+
or response cache.
|
293 |
+
- Python Model Environment: your environment where the Python model is executed.
|
294 |
+
|
295 |
+
The Triton Inference Server binaries are provided as part of the PyTriton installation. The Triton Server is
|
296 |
+
installed in your current environment (system or container). The PyTriton controls the Triton Server process
|
297 |
+
through the `Triton Controller`.
|
298 |
+
|
299 |
+
Exposing the model through PyTriton requires the definition of an `Inference Callable` - a Python function that is
|
300 |
+
connected to Triton Inference Server and executes the model or ensemble for predictions. The integration layer binds
|
301 |
+
the `Inference Callable` to Triton Server and exposes it through the Triton HTTP/gRPC API under a provided `<model name>`. Once
|
302 |
+
the integration is done, the defined `Inference Callable` receives data sent to the HTTP/gRPC API endpoint
|
303 |
+
`v2/models/<model name>/infer`. Read more about HTTP/gRPC interface in Triton Inference Server
|
304 |
+
[documentation](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#httprest-and-grpc-protocols).
|
305 |
+
|
306 |
+
The HTTP/gRPC requests sent to `v2/models/<model name>/infer` are handled by Triton
|
307 |
+
Inference Server. The server batches requests and passes them to the `Proxy Backend`, which sends the batched requests to the appropriate
|
308 |
+
`Inference Callable`. The data is sent as a `numpy` array. Once the `Inference Callable` finishes execution of
|
309 |
+
the model prediction, the result is returned to the `Proxy Backend`, and a response is created by Triton Server.
|
310 |
+
|
311 |
+
![High Level Design](docs/assets/hld.svg)
|
312 |
+
|
313 |
+
|
314 |
+
|
315 |
+
|
316 |
+
## Examples
|
317 |
+
|
318 |
+
The [examples](examples) page presents various cases of serving models using PyTriton. You can find simple examples of
|
319 |
+
running PyTorch, TensorFlow2, JAX, and simple Python models. Additionally, we have prepared more advanced scenarios like online
|
320 |
+
learning, multi-node models, or deployment on Kubernetes using PyTriton. Each example contains instructions describing
|
321 |
+
how to build and run the example. Learn more about how to use PyTriton by reviewing our [examples](examples).
|
322 |
+
|
323 |
+
### Streaming (alpha)
|
324 |
+
|
325 |
+
We introduced a new alpha feature to PyTriton that allows streaming partial responses from a model. It is based on the NVIDIA Triton Inference Server decoupled models feature. Look at the example in [examples/huggingface_dialogpt_streaming_pytorch](examples/huggingface_dialogpt_streaming_pytorch).
|
326 |
+
|
327 |
+
### Profiling model
|
328 |
+
|
329 |
+
The [Perf Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md) can be
|
330 |
+
used to profile models served through PyTriton. We have prepared an example of
|
331 |
+
using the Perf Analyzer to profile the BART PyTorch model. The example code can be found
|
332 |
+
in [examples/perf_analyzer](examples/perf_analyzer).
|
333 |
+
|
334 |
+
## Version management
|
335 |
+
|
336 |
+
PyTriton follows the [Semantic Versioning](https://semver.org/) scheme for versioning. Official releases can be found on [PyPI](https://pypi.org/project/nvidia-pytriton/) and [GitHub releases](https://github.com/triton-inference-server/pytriton/releases). The most up-to-date development version is available on the `main` branch, which may include hotfixes that have not yet been released through the standard channels. To install the latest development version, refer to the instructions in the
|
337 |
+
[building binaries from source](#building-binaries-from-source) section.
|
338 |
+
|
339 |
+
## Useful Links
|
340 |
+
|
341 |
+
- [Changelog](CHANGELOG.md)
|
342 |
+
- [Known Issues](https://triton-inference-server.github.io/pytriton/latest/known_issues)
|
343 |
+
- [Contributing](CONTRIBUTING.md)
|
stf/stf-api-alternative/pytriton/build/lib/pytriton/__init__.py
ADDED
@@ -0,0 +1,27 @@
1 |
+
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
# noqa: D104
|
15 |
+
from importlib.metadata import PackageNotFoundError, version
|
16 |
+
|
17 |
+
try:
|
18 |
+
__version__ = version("nvidia-pytriton")
|
19 |
+
except PackageNotFoundError:
|
20 |
+
# package is not installed
|
21 |
+
pass
|
22 |
+
|
23 |
+
from pytriton import (
|
24 |
+
client, # noqa: F401
|
25 |
+
model_config, # noqa: F401
|
26 |
+
triton, # noqa: F401
|
27 |
+
)
|
stf/stf-api-alternative/pytriton/build/lib/pytriton/__main__.py
ADDED
@@ -0,0 +1,218 @@
1 |
+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
"""Pytriton check module."""
|
15 |
+
|
16 |
+
import logging
|
17 |
+
import os
|
18 |
+
import pathlib
|
19 |
+
import shutil
|
20 |
+
import tempfile
|
21 |
+
from typing import Optional
|
22 |
+
|
23 |
+
import typer
|
24 |
+
from typing_extensions import Annotated
|
25 |
+
|
26 |
+
from pytriton.check.add_sub import add_sub_example, add_sub_example_thread
|
27 |
+
from pytriton.check.env_checks import env_checks
|
28 |
+
|
29 |
+
warning_message = """
|
30 |
+
+---------------------------------------------------------------+
|
31 |
+
| WARNING |
|
32 |
+
+---------------------------------------------------------------+
|
33 |
+
| Command may collect sensitive information, please review the |
|
34 |
+
| log and the ZIP before sharing. |
|
35 |
+
+---------------------------------------------------------------+
|
36 |
+
"""
|
37 |
+
|
38 |
+
|
39 |
+
app = typer.Typer(help="Pytriton check tool.\n\nThis tool is used to check the environment and run examples.")
|
40 |
+
|
41 |
+
|
42 |
+
class CheckEnvironment:
|
43 |
+
"""Check environment class.
|
44 |
+
|
45 |
+
Args:
|
46 |
+
workspace_path: Path to workspace
|
47 |
+
name: Name of the sub_workspace
|
48 |
+
zip_results: Flag if results should be zipped
|
49 |
+
check_workspace_exist: Flag if workspace should be checked if exists
|
50 |
+
"""
|
51 |
+
|
52 |
+
def __init__(
|
53 |
+
self,
|
54 |
+
workspace_path: Optional[pathlib.Path],
|
55 |
+
name: str,
|
56 |
+
zip_results: bool = True,
|
57 |
+
check_workspace_exist: bool = True,
|
58 |
+
):
|
59 |
+
"""Initialize class."""
|
60 |
+
self.name = name
|
61 |
+
self._zip_results = zip_results
|
62 |
+
self._temp_workspace = None
|
63 |
+
|
64 |
+
self.logger = logging.getLogger(name)
|
65 |
+
if check_workspace_exist and workspace_path is not None and workspace_path.exists():
|
66 |
+
self.logger.error(f"Workspace path {workspace_path} already exists")
|
67 |
+
raise typer.Exit(code=1)
|
68 |
+
if workspace_path is None:
|
69 |
+
self._temp_workspace = tempfile.TemporaryDirectory(prefix="pytriton_workspace_")
|
70 |
+
workspace_path = pathlib.Path(self._temp_workspace.name)
|
71 |
+
else:
|
72 |
+
workspace_path.mkdir(parents=True, exist_ok=True)
|
73 |
+
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(name)s: %(message)s")
|
74 |
+
self.logger.addHandler(logging.FileHandler(workspace_path / (name + "_log.txt")))
|
75 |
+
self.workspace_path = workspace_path
|
76 |
+
self.sub_workspace = workspace_path / name
|
77 |
+
|
78 |
+
def __enter__(self):
|
79 |
+
"""Enter method."""
|
80 |
+
return self
|
81 |
+
|
82 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
83 |
+
"""Exit method zips results if required."""
|
84 |
+
self.zip_results()
|
85 |
+
|
86 |
+
def zip_results(self):
|
87 |
+
"""Zip results."""
|
88 |
+
if self._zip_results:
|
89 |
+
if self.workspace_path.exists():
|
90 |
+
if self._temp_workspace is not None:
|
91 |
+
output_file_base = pathlib.Path(os.getcwd()) / self.workspace_path.name
|
92 |
+
else:
|
93 |
+
output_file_base = self.workspace_path
|
94 |
+
self.logger.info(f"Zipping {self.workspace_path} to {output_file_base}.zip")
|
95 |
+
shutil.make_archive(str(output_file_base.resolve()), "zip", str(self.workspace_path.resolve()))
|
96 |
+
else:
|
97 |
+
self.logger.error(f"Workspace path {self.workspace_path} does not exist")
|
98 |
+
|
99 |
+
|
100 |
+
@app.command("example-add-sub-script")
|
101 |
+
def example_add_sub_script(
|
102 |
+
workspace: Annotated[Optional[pathlib.Path], typer.Option("--workspace", "-w")] = None,
|
103 |
+
zip_results: Annotated[bool, typer.Option("--zip")] = True,
|
104 |
+
):
|
105 |
+
"""Run example using external script.
|
106 |
+
|
107 |
+
Args:
|
108 |
+
workspace: Workspace path that will be created to store testing output (should not exist)
|
109 |
+
zip_results: flag if output should be zipped
|
110 |
+
"""
|
111 |
+
with CheckEnvironment(workspace, "example_add_sub_script", zip_results) as ce:
|
112 |
+
try:
|
113 |
+
add_sub_example_thread(ce.sub_workspace, ce.logger)
|
114 |
+
except Exception as e:
|
115 |
+
ce.logger.error(f"Error occurred in command: {e}")
|
116 |
+
|
117 |
+
|
118 |
+
@app.command("example-add-sub")
|
119 |
+
def example_add_sub(
|
120 |
+
workspace: Annotated[Optional[pathlib.Path], typer.Option("--workspace", "-w")] = None,
|
121 |
+
zip_results: Annotated[bool, typer.Option("--zip")] = True,
|
122 |
+
):
|
123 |
+
"""Run example.
|
124 |
+
|
125 |
+
Args:
|
126 |
+
workspace: Workspace path that will be created to store testing output (should not exist)
|
127 |
+
zip_results: flag if output should be zipped
|
128 |
+
"""
|
129 |
+
with CheckEnvironment(workspace, "example_add_sub", zip_results) as ce:
|
130 |
+
try:
|
131 |
+
add_sub_example(ce.sub_workspace, ce.logger)
|
132 |
+
except Exception as e:
|
133 |
+
ce.logger.error(f"Error occurred in command: {e}")
|
134 |
+
|
135 |
+
|
136 |
+
@app.command("examples")
|
137 |
+
def examples(
|
138 |
+
workspace: Annotated[Optional[pathlib.Path], typer.Option("--workspace", "-w")] = None,
|
139 |
+
zip_results: Annotated[bool, typer.Option("--zip")] = True,
|
140 |
+
):
|
141 |
+
"""Run example in the same process.
|
142 |
+
|
143 |
+
Args:
|
144 |
+
workspace: Workspace path that will be created to store testing output (should not exist)
|
145 |
+
zip_results: flag if output should be zipped
|
146 |
+
"""
|
147 |
+
with CheckEnvironment(workspace, "example_add_sub", zip_results) as ce:
|
148 |
+
try:
|
149 |
+
add_sub_example(ce.sub_workspace, ce.logger)
|
150 |
+
except Exception as e:
|
151 |
+
ce.logger.error(f"Error occurred in command: {e}")
|
152 |
+
|
153 |
+
with CheckEnvironment(workspace, "example_add_sub_script", zip_results, check_workspace_exist=False) as ce:
|
154 |
+
try:
|
155 |
+
add_sub_example_thread(ce.sub_workspace, ce.logger)
|
156 |
+
except Exception as e:
|
157 |
+
ce.logger.error(f"Error occurred in command: {e}")
|
158 |
+
|
159 |
+
|
160 |
+
@app.command("env")
|
161 |
+
def env_check(
|
162 |
+
workspace: Annotated[Optional[pathlib.Path], typer.Option("--workspace", "-w")] = None,
|
163 |
+
zip_results: Annotated[bool, typer.Option("--zip")] = True,
|
164 |
+
):
|
165 |
+
"""Run all environment checks.
|
166 |
+
|
167 |
+
It may collect sensitive system information in the log. Please review the log before sharing.
|
168 |
+
|
169 |
+
Args:
|
170 |
+
workspace: Workspace path that will be created to store testing output (should not exist)
|
171 |
+
zip_results: flag if output should be zipped
|
172 |
+
"""
|
173 |
+
with CheckEnvironment(workspace, "env_checks", zip_results) as ce:
|
174 |
+
try:
|
175 |
+
env_checks(ce.logger)
|
176 |
+
except Exception as e:
|
177 |
+
ce.logger.error(f"Error occurred in command: {e}")
|
178 |
+
|
179 |
+
|
180 |
+
@app.command("check")
|
181 |
+
def check(
|
182 |
+
workspace: Annotated[Optional[pathlib.Path], typer.Option("--workspace", "-w")] = None,
|
183 |
+
zip_results: Annotated[bool, typer.Option("--zip")] = True,
|
184 |
+
):
|
185 |
+
"""Run all checks.
|
186 |
+
|
187 |
+
Args:
|
188 |
+
workspace: Workspace path that will be created to store testing output (should not exist)
|
189 |
+
zip_results: flag if output should be zipped
|
190 |
+
"""
|
191 |
+
with CheckEnvironment(workspace, "all_checks", zip_results) as ce:
|
192 |
+
try:
|
193 |
+
ce.logger.info("Running all common checks")
|
194 |
+
env_check(ce.workspace_path / "env", False)
|
195 |
+
examples(ce.workspace_path / "examples", False)
|
196 |
+
except Exception as e:
|
197 |
+
ce.logger.error(f"Error occurred in command: {e}")
|
198 |
+
|
199 |
+
|
200 |
+
@app.callback(invoke_without_command=True)
|
201 |
+
def default_command(ctx: typer.Context):
|
202 |
+
"""Default command."""
|
203 |
+
if ctx.invoked_subcommand is None:
|
204 |
+
check()
|
205 |
+
|
206 |
+
|
207 |
+
def main():
|
208 |
+
"""Main function."""
|
209 |
+
logger = logging.getLogger("PyTriton-Check")
|
210 |
+
try:
|
211 |
+
logger.warning(warning_message)
|
212 |
+
app()
|
213 |
+
finally:
|
214 |
+
logger.warning(warning_message)
|
215 |
+
|
216 |
+
|
217 |
+
if __name__ == "__main__":
|
218 |
+
main()
|
stf/stf-api-alternative/pytriton/build/lib/pytriton/check/__init__.py
ADDED
@@ -0,0 +1,14 @@
1 |
+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
# noqa: D104
|
stf/stf-api-alternative/pytriton/build/lib/pytriton/check/add_sub.py
ADDED
@@ -0,0 +1,139 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
"""Add_sub example model for checking corectness of triton environment."""
|
16 |
+
|
17 |
+
import argparse
|
18 |
+
import logging
|
19 |
+
import pathlib
|
20 |
+
import signal
|
21 |
+
import sys
|
22 |
+
|
23 |
+
import numpy as np
|
24 |
+
|
25 |
+
from pytriton.check.utils import ScriptThread
|
26 |
+
from pytriton.client import ModelClient
|
27 |
+
from pytriton.decorators import batch
|
28 |
+
from pytriton.model_config import ModelConfig, Tensor
|
29 |
+
from pytriton.triton import Triton
|
30 |
+
|
31 |
+
logger = logging.getLogger("check.add_sub_example")
|
32 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s: %(message)s")
|
33 |
+
add_script_path = [sys.executable, "pytriton/check/add_sub.py"]
|
34 |
+
|
35 |
+
|
36 |
+
@batch
|
37 |
+
def _add_sub(**inputs):
|
38 |
+
a_batch, b_batch = inputs.values()
|
39 |
+
add_batch = a_batch + b_batch
|
40 |
+
sub_batch = a_batch - b_batch
|
41 |
+
return {"add": add_batch, "sub": sub_batch}
|
42 |
+
|
43 |
+
|
44 |
+
def prepare_triton(workspace: pathlib.Path):
|
45 |
+
"""Prepare triton server with AddSub model."""
|
46 |
+
triton = Triton(workspace=str(workspace.resolve()))
|
47 |
+
triton.run()
|
48 |
+
logger.info("Loading AddSub model")
|
49 |
+
triton.bind(
|
50 |
+
model_name="AddSub",
|
51 |
+
infer_func=_add_sub,
|
52 |
+
inputs=[
|
53 |
+
Tensor(dtype=np.float32, shape=(-1,)),
|
54 |
+
Tensor(dtype=np.float32, shape=(-1,)),
|
55 |
+
],
|
56 |
+
outputs=[
|
57 |
+
Tensor(name="add", dtype=np.float32, shape=(-1,)),
|
58 |
+
Tensor(name="sub", dtype=np.float32, shape=(-1,)),
|
59 |
+
],
|
60 |
+
config=ModelConfig(max_batch_size=128),
|
61 |
+
strict=True,
|
62 |
+
)
|
63 |
+
return triton
|
64 |
+
|
65 |
+
|
66 |
+
def infer_add_sub_model():
|
67 |
+
"""Infer AddSub model."""
|
68 |
+
batch_size = 2
|
69 |
+
a_batch = np.ones((batch_size, 1), dtype=np.float32)
|
70 |
+
b_batch = np.ones((batch_size, 1), dtype=np.float32)
|
71 |
+
|
72 |
+
logger.info(f"a: {a_batch.tolist()}")
|
73 |
+
logger.info(f"b: {b_batch.tolist()}")
|
74 |
+
|
75 |
+
with ModelClient("localhost", "AddSub") as client:
|
76 |
+
logger.info("Sending inference request")
|
77 |
+
result_batch = client.infer_batch(a_batch, b_batch)
|
78 |
+
|
79 |
+
for output_name, data_batch in result_batch.items():
|
80 |
+
logger.info(f"{output_name}: {data_batch.tolist()}")
|
81 |
+
|
82 |
+
|
83 |
+
def serve_triton(workspace: pathlib.Path):
|
84 |
+
"""Serve triton server with AddSub model."""
|
85 |
+
triton = prepare_triton(workspace)
|
86 |
+
logger.info("Serving AddSub model")
|
87 |
+
triton.serve()
|
88 |
+
|
89 |
+
|
90 |
+
def add_sub_example_thread(workspace: pathlib.Path, logger: logging.Logger):
|
91 |
+
"""Run example using external script.
|
92 |
+
|
93 |
+
Args:
|
94 |
+
workspace: Workspace path that will be created to store testing output (should not exist)
|
95 |
+
logger: logger instance
|
96 |
+
"""
|
97 |
+
logger.info("Running example model using external script")
|
98 |
+
|
99 |
+
with ScriptThread(add_script_path + ["--workspace", str(workspace.resolve())], name="server") as server_thread:
|
100 |
+
import time
|
101 |
+
|
102 |
+
time.sleep(3)
|
103 |
+
infer_add_sub_model()
|
104 |
+
|
105 |
+
if server_thread.process:
|
106 |
+
server_thread.process.send_signal(signal.SIGINT)
|
107 |
+
|
108 |
+
server_thread.join()
|
109 |
+
logger.error(server_thread.output)
|
110 |
+
if server_thread.returncode not in [
|
111 |
+
0,
|
112 |
+
-2,
|
113 |
+
]:
|
114 |
+
logger.error(f"Server failed - return code {server_thread.returncode}")
|
115 |
+
|
116 |
+
|
117 |
+
def add_sub_example(workspace: pathlib.Path, logger: logging.Logger):
|
118 |
+
"""Run example in the same process.
|
119 |
+
|
120 |
+
Args:
|
121 |
+
workspace: Workspace path that will be created to store testing output (should not exist)
|
122 |
+
logger: logger instance
|
123 |
+
"""
|
124 |
+
logger.info("Running example model")
|
125 |
+
triton = prepare_triton(workspace)
|
126 |
+
infer_add_sub_model()
|
127 |
+
triton.stop()
|
128 |
+
|
129 |
+
|
130 |
+
if __name__ == "__main__":
|
131 |
+
parser = argparse.ArgumentParser()
|
132 |
+
parser.add_argument("--workspace", help="Workspace path", type=str)
|
133 |
+
parser.add_argument("--infer", default=False, help="Infer AddSub model", action="store_true")
|
134 |
+
args = parser.parse_args()
|
135 |
+
|
136 |
+
if args.infer:
|
137 |
+
infer_add_sub_model()
|
138 |
+
else:
|
139 |
+
serve_triton(pathlib.Path(args.workspace))
|
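A minimal sketch of driving the module above from another script: the workspace path and logger name below are illustrative assumptions, not part of the uploaded file.

import logging
import pathlib

from pytriton.check.add_sub import add_sub_example

logger = logging.getLogger("check.demo")  # hypothetical logger name
workspace = pathlib.Path("/tmp/add_sub_workspace")  # hypothetical path; per the docstring it should not exist yet
add_sub_example(workspace, logger)  # binds AddSub, runs one batch of inference, then stops Triton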
stf/stf-api-alternative/pytriton/build/lib/pytriton/check/env_checks.py
ADDED
@@ -0,0 +1,201 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Environment checks."""

import logging
import os
import pathlib
import platform
import re
import sys

import psutil

from pytriton.check.utils import ScriptThread


def nvidia_smi(logger):
    """Run nvidia-smi.

    Args:
        logger: logger instance
    """
    logger.info("Running nvidia-smi")
    with ScriptThread(["nvidia-smi"], name="nvidia-smi") as nvidia_smi_thread:
        nvidia_smi_thread.join()
        logger.info(nvidia_smi_thread.output)
        if nvidia_smi_thread.returncode != 0:
            logger.error("nvidia-smi failed - possible cause: no GPU available or driver not installed")
            logger.error(
                "If running in WSL with sudo, make sure to add nvidia-smi folder (e.g. /usr/lib/wsl/lib) to sudoers file!"
            )


def get_platform_info(logger):
    """Get platform information (OS, python, etc.).

    Args:
        logger: logger instance
    """
    logger.info("Checking OS version")
    logger.info("Script is running in docker:" + str(pathlib.Path("/.dockerenv").exists()))

    os_release_path = pathlib.Path("/etc/os-release")
    if os_release_path.exists():
        with os_release_path.open() as f:
            os_release = f.read()
            logger.info("OS release")
            logger.info(os_release)
            for line in os_release.split("\n"):
                if "PRETTY_NAME" in line:
                    os_version = line.split("=")[1].strip()
                    logger.info(f"OS version: {os_version}")
    else:
        logger.warning("OS release file not found (not available on some systems)")

    logger.info("Get platform info")
    logger.info(f"Platform: {platform.platform()}")
    logger.info(f"System: {platform.system()}")
    logger.info(f"Release: {platform.release()}")
    logger.info(f"Version: {platform.version()}")
    logger.info(f"Machine: {platform.machine()}")
    logger.info(f"Processor: {platform.processor()}")
    logger.info(f"Python version: {platform.python_version()}")
    logger.info(f"Python implementation: {platform.python_implementation()}")
    logger.info(f"Python compiler: {platform.python_compiler()}")
    logger.info(f"Python build: {platform.python_build()}")
    logger.info(f"libc_ver: {platform.libc_ver()}")


def check_psutil_stats(logger):
    """Check psutil stats.

    Args:
        logger: logger instance
    """
    logger.info("Checking psutil stats")
    logger.info("Memory stats")
    logger.info(psutil.virtual_memory())
    logger.info("Swap stats")
    logger.info(psutil.swap_memory())
    logger.info("Disk stats")
    logger.info(psutil.disk_usage("/"))
    logger.info("Disk io counters")
    logger.info(psutil.disk_io_counters())
    logger.info("CPU stats")
    logger.info(psutil.cpu_times())
    logger.info("Network stats")
    logger.info(psutil.net_io_counters())


def get_listening_processes(logger):
    """Get listening processes.

    Args:
        logger: logger instance
    """
    logger.info("Listening processes")
    processes = {proc.pid: proc.name for proc in psutil.process_iter(["pid", "name"])}
    connections = psutil.net_connections()
    listening_sockets = [conn for conn in connections if conn.status == "LISTEN"]

    for listening_socket in listening_sockets:
        process_name = None
        if listening_socket.pid is not None and listening_socket.pid in processes:
            process_name = processes[listening_socket.pid]
        logger.info(
            f"Process ID: {listening_socket.pid}, Name: {process_name}, Local Address: {listening_socket.laddr}, Remote Address: {listening_socket.raddr}, Status: {listening_socket.status}"
        )


def installed_packages(logger):
    """Get installed packages.

    Args:
        logger: logger instance
    """
    logger.info("Checking installed packages")
    import importlib_metadata

    packages = importlib_metadata.distributions()

    installed_pkg = sorted([f"{package.metadata['Name']}=={package.version} ({package._path})" for package in packages])
    installed_pkg_str = "\n[\n\t" + ",\n\t".join(installed_pkg) + "\n]"
    logger.info(installed_pkg_str)


def check_compiler_and_clib(logger):
    """Check compiler and C libraries.

    Args:
        logger: logger instance
    """
    logger.info("Checking compiler and C libraries")
    with ScriptThread(["gcc", "--version"], name="gcc_version") as gcc_version_thread:
        gcc_version_thread.join()
        logger.info("GCC version:")
        logger.info(gcc_version_thread.output)
        if gcc_version_thread.returncode != 0:
            logger.error("gcc failed")

    logger.info("Python version:")
    logger.info(sys.version)

    try:
        logger.info(os.confstr("CS_GNU_LIBC_VERSION"))
    except AttributeError as e:
        logger.error(f"Failed to get glibc version {e}")


def log_env_variables(logger):
    """Log environment variables.

    Args:
        logger: logger instance
    """
    logger.info("Environment variables")

    env_vars = os.environ.items()
    blacklist_patterns = [
        r".*token.*",
        r".*secret.*",
        r".*key.*",
        r".*password.*",
    ]

    patterns = [re.compile(pattern, re.IGNORECASE) for pattern in blacklist_patterns]
    filtered_env_vars = [
        f"{key}={value}"
        for key, value in env_vars
        if not any(pattern.search(key) or pattern.search(value) for pattern in patterns)
    ]

    env_vars_str = "\n".join(filtered_env_vars)
    logger.info(env_vars_str)


def env_checks(logger: logging.Logger):
    """Run all environment checks.

    Args:
        logger: logger instance
    """
    logger.info("Running all environment checks")
    get_platform_info(logger)
    nvidia_smi(logger)
    installed_packages(logger)
    check_psutil_stats(logger)
    get_listening_processes(logger)
    check_compiler_and_clib(logger)
    log_env_variables(logger)
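A minimal sketch of invoking the checks defined above; the logging configuration and logger name are assumptions added for illustration.

import logging

from pytriton.check.env_checks import env_checks

logging.basicConfig(level=logging.INFO)  # assumed logging setup
env_checks(logging.getLogger("env_checks.demo"))  # runs platform, GPU, package, psutil and env-var checks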
stf/stf-api-alternative/pytriton/build/lib/pytriton/check/utils.py
ADDED
@@ -0,0 +1,555 @@
# Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utils."""

import contextlib
import fcntl
import logging
import os
import pathlib
import re
import select
import socket
import subprocess
import threading
import typing

LOGGER = logging.getLogger(__name__)
DEFAULT_LOG_FORMAT = "%(asctime)s - %(levelname)8s - %(process)8d - %(threadName)s - %(name)s: %(message)s"


def _read_outputs(_process, _logger, _outputs):
    # Set stdout and stderr file descriptors to non-blocking mode
    try:
        fcntl.fcntl(_process.stdout, fcntl.F_SETFL, os.O_NONBLOCK)
        fcntl.fcntl(_process.stderr, fcntl.F_SETFL, os.O_NONBLOCK)
    except ValueError:  # when selecting on closed files
        return

    buffers = {_process.stdout: "", _process.stderr: ""}
    rds = [_process.stdout, _process.stderr]
    while rds:
        try:
            readable, _, _ = select.select(rds, [], [], 1)
        except ValueError:  # when selecting on closed files
            break

        for rd in readable:
            try:
                data = os.read(rd.fileno(), 4096)
                if not data:
                    rds.remove(rd)
                    continue

                decoded_data = data.decode("utf-8")
                buffers[rd] += decoded_data
                lines = buffers[rd].splitlines(keepends=True)

                if buffers[rd].endswith("\n"):
                    complete_lines = lines
                    buffers[rd] = ""
                else:
                    complete_lines = lines[:-1]
                    buffers[rd] = lines[-1]

                for line in complete_lines:
                    line = line.rstrip()
                    _logger.info(line)
                    _outputs.append(line)
            except OSError:  # Reading from an empty non-blocking file
                pass


class ScriptThread(threading.Thread):
    """A class that runs external script in a separate thread."""

    def __init__(self, cmd, workdir=None, group=None, target=None, name=None, args=(), kwargs=None) -> None:
        """Initializes the ScriptThread object."""
        super().__init__(group, target, name, args, kwargs, daemon=True)
        self.cmd = cmd
        self.workdir = workdir
        self._process_spawned_or_spawn_error_flag = None
        self.active = False
        self._process = None
        self.returncode = None
        self._output = []
        self._logger = logging.getLogger(self.name)

    def __enter__(self):
        """Starts the script thread."""
        self.start(threading.Event())
        self._process_spawned_or_spawn_error_flag.wait()
        return self

    def __exit__(self, *args):
        """Stops the script thread and waits for it to join."""
        self.stop()
        self.join()
        self._process_spawned_or_spawn_error_flag = None

    def start(self, flag: typing.Optional[threading.Event] = None) -> None:
        """Starts the script thread."""
        if flag is None:
            flag = threading.Event()
        self._logger.info(f"Starting {self.name} script with \"{' '.join(self.cmd)}\" cmd")
        self._process_spawned_or_spawn_error_flag = flag
        super().start()

    def stop(self):
        """Sets the active flag to False to stop the script thread."""
        self._logger.info(f"Stopping {self.name} script")
        self.active = False

    def run(self):
        """Runs the script in a separate process."""
        import psutil

        self.returncode = None
        self._output = []
        self._process = None

        os.environ.setdefault("PYTHONUNBUFFERED", "1")  # to not buffer logs
        try:
            with psutil.Popen(
                self.cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0, cwd=self.workdir
            ) as process:
                self._process = process
                self.active = True
                if self._process_spawned_or_spawn_error_flag:
                    self._process_spawned_or_spawn_error_flag.set()
                while self.active and process.poll() is None and process.returncode is None:
                    try:
                        _read_outputs(process, self._logger, self._output)
                    except KeyboardInterrupt:
                        self.stop()

        finally:
            if self._process_spawned_or_spawn_error_flag:
                self._process_spawned_or_spawn_error_flag.set()
            if self.process:
                while self.process.poll() is None:
                    _read_outputs(self.process, self._logger, self._output)
                _read_outputs(self.process, self._logger, self._output)
                self.returncode = process.wait()  # pytype: disable=name-error
                self._logger.info(f"{self.name} process finished with {self.returncode}")

            self.active = False
            self._process = None

    @property
    def output(self):
        """Return process stream output."""
        return "\n".join(self._output)

    @property
    def process(self):
        """Return process object."""
        return self._process


def find_free_port() -> int:
    """Finds a free port on the local machine."""
    with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(("", 0))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return s.getsockname()[1]


class ProcessMonitoring:
    """A class that dumps the state of a process and its children.

    This class uses the py-spy tool to dump the stack trace of a process and its
    children recursively. It also dumps the process information such as the parent
    and the command line. It allows registering custom monitors that can perform
    additional actions on the process.

    Attributes:
        _logger (logging.Logger): The logger object to write messages.
        _process (psutil.Process): The process object to monitor.
        _children_processes (list[psutil.Process]): The list of child processes to monitor.
        _log (logging.Logger.method): The logging method to use for messages.
        _remove_color (bool): Whether to remove ANSI escape sequences from the output.
        _ansi_escape (re.Pattern): The regular expression object to match ANSI escape sequences.
        _custom_monitors (list[typing.Callable[[int], None]]): The list of custom monitor functions to execute on each dump cycle.
    """

    def __init__(
        self,
        pid: int,
        logger: typing.Optional[logging.Logger] = None,
        loglevel: int = logging.INFO,
        remove_color: bool = False,
    ):
        """Initializes the ProcessMonitoring object.

        Args:
            pid (int): The process ID of the process to monitor.
            logger (typing.Optional[logging.Logger], optional): The logger object to write messages. Defaults to None.
            loglevel (int, optional): The logging level to use for messages. Defaults to logging.INFO.
            remove_color (bool, optional): Whether to remove ANSI escape sequences from the output. Defaults to False.
        """
        import re

        import psutil

        self._logger = logger or logging.getLogger("monitoring")
        self._process = psutil.Process(pid)
        self._children_processes = list(self._process.children(recursive=True))
        self._log = {
            logging.DEBUG: self._logger.debug,
            logging.INFO: self._logger.info,
            logging.WARNING: self._logger.warning,
            logging.ERROR: self._logger.error,
        }[loglevel]
        self._log(f"Initial list of children processes: {self._children_processes}")
        self._remove_color = remove_color
        pattern = r"\x1b\[.*?m"
        self._ansi_escape = re.compile(pattern)
        self._custom_monitors = []

    def register_custom_monitor(self, custom_monitor: typing.Callable[[int], None]) -> None:
        """Registers a custom monitor for the process.

        This method adds a custom monitor function to the list of monitors that are
        executed on each dump cycle. A custom monitor function should take an integer
        as an argument (the process ID) and return None.

        Args:
            custom_monitor (typing.Callable[[int], None]): The custom monitor function to register.
        """
        self._custom_monitors.append(custom_monitor)

    def dump_state(self) -> None:
        """Dumps the state of the process and its children.

        This method calls the _dump_processes_stacktrace and _dump_child_processes
        methods to dump the stack trace and the process information of the process
        and its children recursively.
        """
        self._dump_processes_stacktrace()
        self._dump_child_processes()

    def _dump_processes_stacktrace(self):
        import psutil
        import sh

        self._log("==== Dump process stacktrace")
        pyspy_cmd = sh.Command("py-spy")

        for process in [self._process] + self.children:
            try:
                result = pyspy_cmd("dump", "-ll", "--nonblocking", "-p", str(process.pid))
                if self._remove_color:
                    result = self._ansi_escape.sub("", str(result))
                self._log(f"Dump stack trace for process (pid={process.pid}) with cmd {process.cmdline()}")
                for custom_monitor in self._custom_monitors:
                    custom_monitor(process.pid)
                self._log(result)
            except psutil.NoSuchProcess as e:
                self._log(f"Error during handling process: {e}")
            except sh.ErrorReturnCode_1 as e:
                self._log(f"Error during calling py-spy process: {e}")

    def _dump_child_processes(self):
        import psutil

        self._log("==== Dump process info (with its children)")
        for process in [self._process] + self.children:
            try:
                self._log(f"{process} parent={process.parent()} ")
            except psutil.NoSuchProcess:
                self._log(f"{process} is missing in process table")

    @property
    def children(self):
        """Returns the list of child processes to monitor.

        This property returns the list of child processes to monitor, and updates it
        with any new children that are created by the process.

        Returns:
            list[psutil.Process]: The list of child processes to monitor.
        """
        import psutil

        try:
            children = list(self._process.children(recursive=True))
            self._children_processes = list(set(self._children_processes + children))
        except psutil.NoSuchProcess:
            pass
        return self._children_processes


def get_current_container_version():
    """Returns the version of the current container."""
    container_version = os.environ.get("NVIDIA_PYTORCH_VERSION") or os.environ.get("NVIDIA_TENSORFLOW_VERSION")
    if container_version and "-" in container_version:
        container_version = container_version.split("-")[0]  # TF version has format <year_month_version>-<tf_version>
    return container_version


def verify_docker_image_in_readme_same_as_tested(readme_path, image_name_with_version):
    """Verify that the docker image is the same as described in the readme file."""
    image_name, _image_version = image_name_with_version.split(":")
    framework_name = image_name.split("/")[-1]
    readme_payload = pathlib.Path(readme_path).read_text()
    match_iterator = re.finditer(
        rf"(?P<container_registry>[\w/.\-:]+)/{framework_name}:(?P<image_version_with_python_version>[\w.-]+)",
        readme_payload,
    )
    for entry in match_iterator:
        assert entry.group() == image_name_with_version, f"{entry.group()} != {image_name_with_version}"


def search_warning_on_too_verbose_log_level(logs: str):
    """Search warnings."""
    pattern = r"Triton Inference Server is running with enabled verbose logs.*It may affect inference performance."
    return re.search(pattern, logs)


class ProcessMonitoringThread:
    """A class that creates a thread to monitor a process.

    This class uses the ProcessMonitoring class to dump the state of a process
    and its children periodically. It also allows registering custom monitors
    that can perform additional actions on the process.

    Attributes:
        _monitoring (ProcessMonitoring): The ProcessMonitoring object that handles the dumping logic.
        _stop_event (threading.Event): The event object that signals the thread to stop its loop.
        _thread (threading.Thread): The thread object that runs the _run method in a loop.
        _interval (float): The interval in seconds between each dump cycle.
    """

    def __init__(self, monitoring: ProcessMonitoring, interval: float = 60):
        """Initializes the ProcessMonitoringThread object.

        Args:
            monitoring (ProcessMonitoring): The ProcessMonitoring object that handles the dumping logic.
            interval (float, optional): The interval in seconds between each dump cycle. Defaults to 60.
        """
        self._monitoring = monitoring
        self._interval = interval

    def start(self) -> None:
        """Starts the monitoring thread.

        This method creates a new thread that runs the _run method in a loop until
        the stop method is called or an exception occurs. It also sets the stop event
        object that can be used to signal the thread to stop gracefully.
        """
        self._stop_event = threading.Event()
        self._thread = threading.Thread(target=self._run, daemon=True)
        self._thread.start()

    def stop(self) -> None:
        """Stops the monitoring thread.

        This method sets the stop event object that signals the thread to stop its loop.
        It also waits for the thread to join before returning.
        """
        self._stop_event.set()
        self._thread.join()

    def __enter__(self):
        """Enters the context manager for the monitoring thread."""
        self.start()
        return self

    def __exit__(self, *args):
        """Exits the context manager for the monitoring thread."""
        self.stop()

    def _run(self):
        logging.info("Monitoring process")
        self._monitoring.dump_state()
        while not self._stop_event.wait(self._interval):
            logging.info("Monitoring process")
            self._monitoring.dump_state()


class TestMonitoringContext:
    """A context manager that monitors test processes.

    This context manager creates threads to monitor the test processes and dumps
    their state periodically. It can extend argparse args with additional arguments.
    It supports splitting log into different files. The standard output log can have one level
    and the file log can have another level. It uses log rotation.
    """

    @staticmethod
    def extend_args(parser):
        """Extends argparse args with additional arguments."""
        parser.add_argument(
            "--verbose",
            action="store_true",
            help="Provide verbose logs",
        )
        parser.add_argument(
            "--log-path",
            type=str,
            default=None,
            help="Provide the path of external log for rotation",
        )
        parser.add_argument(
            "--compress-logs",
            action="store_true",
            help="Enable logs compression",
        )
        parser.add_argument(
            "--maximum-log-file",
            type=int,
            default=10 * 1024 * 1024,
            help="Maximum logfile size before rotation is started",
            required=False,
        )
        parser.add_argument(
            "--enable-fault-handler",
            action="store_true",
            help="Enable faulthandler",
        )
        parser.add_argument(
            "--faulthandler-interval",
            type=float,
            default=None,
            help="Enable faulthandler after specified number of seconds with repeat",
            required=False,
        )
        parser.add_argument(
            "--process-monitoring-interval",
            type=float,
            default=None,
            help="Enable process monitoring after specified number of seconds with repeat",
            required=False,
        )

    def __init__(self, args):
        """Initializes the TestMonitoringContext object.

        Args:
            args (argparse.Namespace): The argparse args object to extend with additional arguments.
        """
        self._args = args

    def __enter__(self):
        """Enters the context manager for the test monitoring."""
        import faulthandler
        import logging.handlers

        args = self._args
        self._loglevel = log_level = logging.DEBUG if args.verbose else logging.INFO
        logging.basicConfig(level=logging.DEBUG, format=DEFAULT_LOG_FORMAT)
        logger = logging.getLogger()

        if args.log_path is not None:
            # Create a rotating file handler for the file output logger
            # The file name is based on the log path argument, the maximum size is 10 MB, and the maximum number of files is 500
            file_handler = logging.handlers.RotatingFileHandler(
                args.log_path, maxBytes=args.maximum_log_file, backupCount=500
            )
            file_handler.setFormatter(logging.Formatter(DEFAULT_LOG_FORMAT))
            file_handler.setLevel(logging.DEBUG)
            if args.compress_logs:
                file_handler.namer = lambda name: name + ".gz"

                def gzip_rotation(source, dest):
                    import gzip
                    import os

                    with open(source, "rb") as f_in:
                        with gzip.open(dest, "wb") as f_out:
                            f_out.writelines(f_in)
                    os.remove(source)

                file_handler.rotator = gzip_rotation

            # Add the file handler to the default logger
            logger.addHandler(file_handler)

        # Get the stream handler that was created by basicConfig
        stream_handler = logger.handlers[0]
        # Set the stream handler's level to match the log level argument
        stream_handler.setLevel(log_level)

        if args.enable_fault_handler:
            faulthandler.enable()

        if args.faulthandler_interval is not None:
            faulthandler.dump_traceback_later(args.faulthandler_interval, repeat=True, exit=False)

        custom_monitors = []

        import os

        import psutil

        def monitor_ram_usage(pid=None):
            if pid is None:
                pid = os.getpid()

            process = psutil.Process(pid)
            logger.debug(f"MONITOR RAM USAGE ({pid}): {process.memory_info()}")

        custom_monitors.append(monitor_ram_usage)

        def monitor_file_descriptors(pid=None):
            if pid is None:
                pid = os.getpid()

            process = psutil.Process(pid)
            logger.debug(f"MONITOR FILE DESCRIPTORS ({pid}): {process.num_fds()}")

        custom_monitors.append(monitor_file_descriptors)

        def monitor_cpu_usage(pid=None):
            if pid is None:
                pid = os.getpid()

            process = psutil.Process(pid)
            logger.debug(f"MONITOR CPU USAGE ({pid}): {process.cpu_percent()}")

        custom_monitors.append(monitor_cpu_usage)

        def monitor_threads(pid=None):
            if pid is None:
                pid = os.getpid()

            process = psutil.Process(pid)
            logger.debug(f"MONITOR THREADS ({pid}): {process.num_threads()}")

        custom_monitors.append(monitor_threads)

        def monitor_process_dict(pid=None):
            if pid is None:
                pid = os.getpid()

            process = psutil.Process(pid)
            logger.debug(f"MONITOR PROCESS DICT ({pid}): {process.as_dict()}")

        custom_monitors.append(monitor_process_dict)
        if args.process_monitoring_interval is not None:
            monitoring = ProcessMonitoring(os.getpid(), logger, loglevel=logging.DEBUG, remove_color=True)
            for monitor in custom_monitors:
                monitoring.register_custom_monitor(monitor)

            self._monitor = ProcessMonitoringThread(monitoring, interval=args.process_monitoring_interval)
            self._monitor.start()
        return self

    def __exit__(self, *args):
        """Stops the monitor thread."""
        if hasattr(self, "_monitor"):
            self._monitor.stop()
            self._monitor = None
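A minimal sketch of the ScriptThread and find_free_port helpers defined above (Linux-oriented, since stream reading relies on fcntl); the child command below is an illustrative assumption.

import sys

from pytriton.check.utils import ScriptThread, find_free_port

port = find_free_port()  # a free TCP port on the local machine
cmd = [sys.executable, "-c", f"print('hello from port {port}')"]  # hypothetical child command
with ScriptThread(cmd, name="hello") as script_thread:
    script_thread.join()  # wait for the child process to finish
print(script_thread.returncode, script_thread.output)  # captured exit code and combined stdout/stderr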
stf/stf-api-alternative/pytriton/build/lib/pytriton/client/__init__.py
ADDED
@@ -0,0 +1,22 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# noqa: D104

from .client import (
    AsyncioDecoupledModelClient,  # noqa: F401
    AsyncioModelClient,  # noqa: F401
    DecoupledModelClient,  # noqa: F401
    FuturesModelClient,  # noqa: F401
    ModelClient,  # noqa: F401
)
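Since this package file only re-exports the client classes from the client module, both import paths below resolve to the same objects; this is just a small sanity sketch.

from pytriton.client import ModelClient
from pytriton.client.client import ModelClient as ModelClientFromModule

assert ModelClient is ModelClientFromModule  # the package-level name is the re-exported class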
stf/stf-api-alternative/pytriton/build/lib/pytriton/client/asyncio_utils.py
ADDED
@@ -0,0 +1,308 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility module supporting model clients."""

import asyncio
import logging
import time
from typing import Optional, Union

import aiohttp
import grpc
import tritonclient.grpc
import tritonclient.http

from pytriton.client.exceptions import PyTritonClientModelUnavailableError, PyTritonClientTimeoutError
from pytriton.client.utils import LATEST_MODEL_VERSION, ModelState, parse_grpc_response, parse_http_response
from pytriton.model_config.parser import ModelConfigParser

aio_clients = Union[tritonclient.grpc.aio.InferenceServerClient, tritonclient.http.aio.InferenceServerClient]

_LOGGER = logging.getLogger(__name__)

_DEFAULT_WAIT_FOR_MODEL_TIMEOUT_S = 60.0  # 60 seconds
_DEFAULT_ASYNC_SLEEP_FACTOR_S = 0.1  # 10% of timeout


async def asyncio_get_model_state(
    client: aio_clients,
    model_name: str,
    model_version: Optional[str] = None,
) -> ModelState:
    """Obtains state of the model deployed in Triton Inference Server.

    Typical use:

    >>> import tritonclient.http.aio
    ... client = tritonclient.http.aio.InferenceServerClient("localhost:8000")
    ... model_state = await get_model_state(client, "MyModel", "1")

    Args:
        client: Triton Inference Server client to use for communication
        model_name: name of the model which state we're requesting.
        model_version:
            version of the model which state we're requesting.
            If model_version is None state of latest model is returned.
            The latest versions of the model are the numerically greatest version numbers.

    Returns:
        Model state. ModelState.UNAVAILABLE is returned in case if model with given name and version is not found.

    """
    _LOGGER.debug(f"Obtaining model {model_name} state")
    repository_index = await client.get_model_repository_index()
    _LOGGER.debug("Model repository index obtained")
    if isinstance(repository_index, list):
        models_states = parse_http_response(models=repository_index)
    else:
        models_states = parse_grpc_response(models=repository_index.models)

    if model_version is None:
        requested_model_states = {
            version: state for (name, version), state in models_states.items() if name == model_name
        }
        if not requested_model_states:
            return ModelState.UNAVAILABLE
        else:
            requested_model_states = sorted(requested_model_states.items(), key=lambda item: int(item[0]))
            latest_version, latest_version_state = requested_model_states[-1]
            _LOGGER.debug(f"Model {model_name} latest version: {latest_version} state: {latest_version_state}")
            return latest_version_state
    else:
        key = (model_name, model_version)
        if key not in models_states:
            return ModelState.UNAVAILABLE
        else:
            model_state = models_states[key]
            _LOGGER.debug(f"Model {model_name} version {model_version} state: {model_state}")
            return model_state


async def asyncio_get_model_config(
    client: aio_clients,
    model_name: str,
    model_version: Optional[str] = None,
    timeout_s: float = _DEFAULT_WAIT_FOR_MODEL_TIMEOUT_S,
):
    """Obtain configuration of model deployed on the Triton Inference Server.

    Function waits for server readiness.

    Args:
        client: Triton Inference Server client to use for communication
        model_name: name of the model which configuration we're requesting.
        model_version:
            version of the model which configuration we're requesting.
            If model_version is None configuration of the latest model is returned.
            The latest versions of the model are the numerically greatest version numbers.
        timeout_s: timeout to finish model configuration obtain.

    Returns:
        Configuration of requested model.

    Raises:
        PyTritonClientTimeoutError: If obtain of model configuration didn't finish before given timeout.
        PyTritonClientModelUnavailableError: If model with given name (and version) is unavailable.
    """
    should_finish_before = time.time() + timeout_s
    _LOGGER.debug(f"Obtaining model {model_name} config (timeout={timeout_s:0.2f})")
    try:
        _LOGGER.debug(f"Waiting for model {model_name} to be ready")
        await asyncio.wait_for(
            asyncio_wait_for_model_ready(
                client, model_name=model_name, model_version=model_version, timeout_s=timeout_s
            ),
            timeout_s,
        )

        model_version = model_version or ""

        timeout_s = max(0, should_finish_before - time.time())
        if isinstance(client, tritonclient.grpc.aio.InferenceServerClient):
            _LOGGER.debug(f"Obtaining model {model_name} config as_json=True")
            response = await asyncio.wait_for(
                client.get_model_config(model_name, model_version, as_json=True), timeout_s
            )
            model_config = response["config"]
        else:
            _LOGGER.debug(f"Obtaining model {model_name} config")
            model_config = await asyncio.wait_for(client.get_model_config(model_name, model_version), timeout_s)
        _LOGGER.debug("Model config obtained")
        model_config = ModelConfigParser.from_dict(model_config)
        _LOGGER.debug(f"Model config: {model_config}")
        return model_config
    except asyncio.TimeoutError as e:
        message = f"Timeout while waiting for model {model_name} config (timeout={timeout_s:0.2f})"
        _LOGGER.error(message)
        raise PyTritonClientTimeoutError(message) from e


async def asyncio_wait_for_server_ready(
    asyncio_client: aio_clients,
    sleep_time_s: float,
):
    """Wait for Triton Inference Server readiness.

    There are two functions, which check server status:
    * asyncio_client.is_server_ready()
    * asyncio_client.is_server_live()
    Both must return true to consider server accessible to read model status.

    Function contains while loop with sleep to check server status periodically.

    Args:
        asyncio_client: Triton Inference Server client to use for communication
        sleep_time_s: time to sleep between server status checks

    Raises:
        PyTritonClientTimeoutError: If obtain of model configuration didn't finish before given timeout.
    """
    _LOGGER.debug("Waiting for server to be ready")
    try:
        while True:
            try:
                _LOGGER.debug("Waiting for server to be ready")
                server_ready = await asyncio_client.is_server_ready()
                _LOGGER.debug("Waiting for server to be live")
                server_live = await asyncio_client.is_server_live()
            except tritonclient.utils.InferenceServerException:
                # Raised by tritonclient/grpc/__init__.py:75
                server_live = False
                server_ready = False
            except aiohttp.client_exceptions.ClientConnectorError:
                # This exception is raised by aiohttp/connector.py:901 in _create_direct_connection
                # and it is not translated to any other error by tritonclient/http/aio/__init__.py:132 in _get method.
                #   res = await self._stub.get(url=req_url,
                # and tritonclient/http/aio/__init__.py:242 in is_server_ready method.
                #   response = await self._get(request_uri=request_uri,
                server_live = False
                server_ready = False
            except RuntimeError:
                # This exception is raised by aiohttp/client.py:400 in _request
                # and it is not translated to any other error by tritonclient/grpc/aio/__init__.py:151: in is_server_ready method.
                #   response = await self._client_stub.ServerReady(request=request,
                server_live = False
                server_ready = False
            except grpc._cython.cygrpc.UsageError:
                # This exception is raised by grpcio/grpc/_cython/_cygrpc/aio/channel.pyx.pxi:124
                # and it is not translated to any other error by tritonclient/grpc/aio/__init__.py", line 151, in is_server_ready
                #   response = await self._client_stub.ServerReady(request=request,
                server_live = False
                server_ready = False
            if server_ready and server_live:
                break
            _LOGGER.debug(f"Sleeping for {sleep_time_s:0.2f} seconds")
            await asyncio.sleep(sleep_time_s)
    except asyncio.TimeoutError as e:
        # This error is caused by our timeout, not by Triton Inference Server client.
        message = "Timeout while waiting for model"
        _LOGGER.error(message)
        raise PyTritonClientTimeoutError(message) from e
    _LOGGER.debug("Server is ready")


async def asyncio_wait_for_model_status_loaded(
    asyncio_client: aio_clients,
    model_name: str,
    sleep_time_s: float,
    model_version: Optional[str] = None,
):
    """Wait for model status loaded.

    Function runs the following async function to check model status:
    ```python
    asyncio_get_model_state(asyncio_client, model_name, model_version)
    ```
    If it returns _ModelState.READY, then another async function can check if model is really ready:
    ```python
    asyncio_client.is_model_ready(model_name)
    ```
    This function uses the above functions to check if model is ready together
    with asyncio.wait_for(...) to limit the time of waiting.

    Function contains while loop with sleep to check model status periodically.

    Args:
        asyncio_client: Triton Inference Server client to use for communication
        model_name: name of the model which configuration we're requesting.
        model_version:
            version of the model which configuration we're requesting.
            If model_version is None configuration of the latest model is returned.
            The latest versions of the model are the numerically greatest version numbers.
        sleep_time_s: time interval, in seconds, between successive checks to determine if the model configuration has been completed.

    Raises:
        PyTritonClientTimeoutError: If obtain of model configuration didn't finish before given timeout.
    """
    model_version = model_version or ""
    model_version_msg = model_version or LATEST_MODEL_VERSION
    _LOGGER.debug(f"Waiting for model {model_name}, {model_version_msg} to be ready")
    try:
        while True:
            _LOGGER.debug(f"Checking if model {model_name} is ready")
            is_model_ready = await asyncio_client.is_model_ready(model_name, model_version)
            if is_model_ready:
                break
            _LOGGER.debug(f"Sleeping for {sleep_time_s} seconds")
            await asyncio.sleep(sleep_time_s)
    except asyncio.TimeoutError as e:
        message = f"Timeout while waiting for model {model_name} state (timeout={sleep_time_s:0.2f})"
        _LOGGER.error(message)
        raise PyTritonClientTimeoutError(message) from e
    _LOGGER.debug(f"Model {model_name}, {model_version_msg} is ready")


async def asyncio_wait_for_model_ready(
    asyncio_client: aio_clients,
    model_name: str,
    model_version: Optional[str] = None,
    timeout_s: float = _DEFAULT_WAIT_FOR_MODEL_TIMEOUT_S,
):
    """Wait for Triton Inference Server and deployed on it model readiness.

    Args:
        asyncio_client: Triton Inference Server client to use for communication
        model_name: name of the model which configuration we're requesting.
        model_version:
            version of the model which configuration we're requesting.
            If model_version is None configuration of the latest model is returned.
            The latest versions of the model are the numerically greatest version numbers.
        timeout_s: timeout to finish model configuration obtain.

    Raises:
        PyTritonClientTimeoutError: If obtain of model configuration didn't finish before given timeout.

    """
    _LOGGER.debug(f"Waiting for model {model_name} to be ready (timeout={timeout_s:0.2f})")
    sleep_time_s = timeout_s * _DEFAULT_ASYNC_SLEEP_FACTOR_S
    try:
        should_finish_before = time.time() + timeout_s
        await asyncio.wait_for(asyncio_wait_for_server_ready(asyncio_client, sleep_time_s), timeout_s)
        _LOGGER.debug(f"Waiting for model {model_name} to be ready")
        timeout_s = max(0, should_finish_before - time.time())
        await asyncio.wait_for(
            asyncio_wait_for_model_status_loaded(
                asyncio_client, model_name=model_name, model_version=model_version, sleep_time_s=sleep_time_s
            ),
            timeout_s,
        )
    except PyTritonClientModelUnavailableError as e:
        _LOGGER.error(f"Failed to obtain model {model_name} config error {e}")
        raise e
    except asyncio.TimeoutError as e:
        _LOGGER.error(f"Failed to obtain model {model_name} config error {e}")
        raise PyTritonClientTimeoutError(
            f"Timeout while waiting for model {model_name} to be ready (timeout={timeout_s:0.2f})"
        ) from e
    _LOGGER.debug(f"Model {model_name} is ready")
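A minimal sketch of waiting for a model with the helper above. The server address "localhost:8000" and model name "AddSub" are illustrative assumptions, and the aio client's close() call follows the tritonclient asyncio API as I understand it.

import asyncio

import tritonclient.http.aio

from pytriton.client.asyncio_utils import asyncio_wait_for_model_ready


async def wait_until_ready():
    client = tritonclient.http.aio.InferenceServerClient("localhost:8000")  # assumed server address
    try:
        # raises PyTritonClientTimeoutError if server or model is not ready within the timeout
        await asyncio_wait_for_model_ready(client, model_name="AddSub", timeout_s=60.0)  # assumed model name
    finally:
        await client.close()


asyncio.run(wait_until_ready())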
stf/stf-api-alternative/pytriton/build/lib/pytriton/client/client.py
ADDED
@@ -0,0 +1,2033 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Clients for easy interaction with models deployed on the Triton Inference Server.

Typical usage example:

```python
client = ModelClient("localhost", "MyModel")
result_dict = client.infer_sample(input_a=a, input_b=b)
client.close()
```

Inference inputs can be provided either as positional or keyword arguments:

```python
result_dict = client.infer_sample(input1, input2)
result_dict = client.infer_sample(a=input1, b=input2)
```

Mixing of argument passing conventions is not supported and will raise PyTritonClientValueError.
"""

import asyncio
import contextlib
import itertools
import logging
import socket
import time
import warnings
from concurrent.futures import Future
from queue import Empty, Full, Queue
from threading import Lock, Thread
from typing import Any, Dict, Optional, Tuple, Union

import gevent
import numpy as np
import tritonclient.grpc
import tritonclient.grpc.aio
import tritonclient.http
import tritonclient.http.aio
import tritonclient.utils

from pytriton.client.asyncio_utils import asyncio_get_model_config, asyncio_wait_for_model_ready
from pytriton.client.exceptions import (
    PyTritonClientClosedError,
    PyTritonClientInferenceServerError,
    PyTritonClientModelDoesntSupportBatchingError,
    PyTritonClientQueueFullError,
    PyTritonClientTimeoutError,
    PyTritonClientValueError,
)
from pytriton.client.utils import (
    _DEFAULT_NETWORK_TIMEOUT_S,
    _DEFAULT_WAIT_FOR_MODEL_TIMEOUT_S,
    TritonUrl,
    get_model_config,
    wait_for_model_ready,
    wait_for_server_ready,
)
from pytriton.client.warnings import NotSupportedTimeoutWarning
from pytriton.model_config.triton_model_config import TritonModelConfig

_LOGGER = logging.getLogger(__name__)

_DEFAULT_SYNC_INIT_TIMEOUT_S = _DEFAULT_WAIT_FOR_MODEL_TIMEOUT_S
_DEFAULT_FUTURES_INIT_TIMEOUT_S = _DEFAULT_WAIT_FOR_MODEL_TIMEOUT_S
DEFAULT_INFERENCE_TIMEOUT_S = 60.0


_IOType = Union[Tuple[np.ndarray, ...], Dict[str, np.ndarray]]


def _verify_inputs_args(inputs, named_inputs):
    if not inputs and not named_inputs:
        raise PyTritonClientValueError("Provide input data")
    if not bool(inputs) ^ bool(named_inputs):
        raise PyTritonClientValueError("Use either positional either keyword method arguments convention")


def _verify_parameters(parameters_or_headers: Optional[Dict[str, Union[str, int, bool]]] = None):
    if parameters_or_headers is None:
        return
    if not isinstance(parameters_or_headers, dict):
        raise PyTritonClientValueError("Parameters and headers must be a dictionary")
    for key, value in parameters_or_headers.items():
        if not isinstance(key, str):
            raise PyTritonClientValueError("Parameter/header key must be a string")
        if not isinstance(value, (str, int, bool)):
            raise PyTritonClientValueError("Parameter/header value must be a string, integer or boolean")

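# A minimal sketch of the argument convention enforced by the two helpers above,
# assuming hypothetical numpy arrays `a` and `b` (illustrative only):
#
#   _verify_inputs_args((a, b), {})             # ok: positional arguments only
#   _verify_inputs_args((), {"a": a, "b": b})   # ok: keyword arguments only
#   _verify_inputs_args((a,), {"b": b})         # raises PyTritonClientValueError (mixed conventions)
#   _verify_parameters({"priority": 1})         # ok: str keys with str/int/bool values
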
class BaseModelClient:
    """Base client for model deployed on the Triton Inference Server."""

    def __init__(
        self,
        url: str,
        model_name: str,
        model_version: Optional[str] = None,
        *,
        lazy_init: bool = True,
        init_timeout_s: Optional[float] = None,
        inference_timeout_s: Optional[float] = None,
        model_config: Optional[TritonModelConfig] = None,
        ensure_model_is_ready: bool = True,
    ):
        """Inits BaseModelClient for given model deployed on the Triton Inference Server.

        Common usage:

        ```python
        client = ModelClient("localhost", "BERT")
        result_dict = client.infer_sample(input1_sample, input2_sample)
        client.close()
        ```

        Args:
            url: The Triton Inference Server url, e.g. `grpc://localhost:8001`.
                In case no scheme is provided http scheme will be used as default.
                In case no port is provided default port for given scheme will be used -
                8001 for grpc scheme, 8000 for http scheme.
            model_name: name of the model to interact with.
            model_version: version of the model to interact with.
                If model_version is None inference on latest model will be performed.
                The latest versions of the model are numerically the greatest version numbers.
            lazy_init: if initialization should be performed just before sending first request to inference server.
            init_timeout_s: timeout in seconds for the server and model to be ready. If not passed, the default timeout of 300 seconds will be used.
            inference_timeout_s: timeout in seconds for a single model inference request. If not passed, the default timeout of 60 seconds will be used.
            model_config: model configuration. If not passed, it will be read from inference server during initialization.
            ensure_model_is_ready: if model should be checked if it is ready before first inference request.

        Raises:
            PyTritonClientModelUnavailableError: If model with given name (and version) is unavailable.
            PyTritonClientTimeoutError:
                if `lazy_init` argument is False and wait time for server and model being ready exceeds `init_timeout_s`.
            PyTritonClientInvalidUrlError: If provided Triton Inference Server url is invalid.
        """
        self._init_timeout_s = _DEFAULT_SYNC_INIT_TIMEOUT_S if init_timeout_s is None else init_timeout_s
        self._inference_timeout_s = DEFAULT_INFERENCE_TIMEOUT_S if inference_timeout_s is None else inference_timeout_s
        self._network_timeout_s = min(_DEFAULT_NETWORK_TIMEOUT_S, self._init_timeout_s)

        self._general_client = self.create_client_from_url(url, network_timeout_s=self._network_timeout_s)
        self._infer_client = self.create_client_from_url(url, network_timeout_s=self._inference_timeout_s)

        self._model_name = model_name
        self._model_version = model_version

        self._request_id_generator = itertools.count(0)

        # Monkey patch __del__ method from client to catch error in client when instance is garbage collected.
        # This is needed because we are closing client in __exit__ method or in close method.
        # (InferenceClient uses gevent library which does not support closing twice from different threads)
        self._monkey_patch_client()

        if model_config is not None:
            self._model_config = model_config
            self._model_ready = None if ensure_model_is_ready else True

        else:
            self._model_config = None
            self._model_ready = None
        self._lazy_init: bool = lazy_init

        self._handle_lazy_init()

    @classmethod
    def from_existing_client(cls, existing_client: "BaseModelClient"):
        """Create a new instance from an existing client using the same class.

        Common usage:
        ```python
        client = BaseModelClient.from_existing_client(existing_client)
        ```

        Args:
            existing_client: An instance of an already initialized subclass.

        Returns:
            A new instance of the same subclass with shared configuration and readiness state.
        """
        kwargs = {}
        # Copy model configuration and readiness state if present
        if hasattr(existing_client, "_model_config"):
            kwargs["model_config"] = existing_client._model_config
            kwargs["ensure_model_is_ready"] = False

        new_client = cls(
            url=existing_client._url,
            model_name=existing_client._model_name,
            model_version=existing_client._model_version,
            init_timeout_s=existing_client._init_timeout_s,
            inference_timeout_s=existing_client._inference_timeout_s,
            **kwargs,
        )

        return new_client

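    # Illustrative note: `from_existing_client` lets a second client reuse a configuration
    # that the first client has already fetched, skipping another model-config request and
    # readiness check. A sketch with placeholder names:
    #
    #   first = ModelClient("grpc://localhost:8001", "MyModel")
    #   second = ModelClient.from_existing_client(first)
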
    def create_client_from_url(self, url: str, network_timeout_s: Optional[float] = None):
        """Create Triton Inference Server client.

        Args:
            url: url of the server to connect to.
                If url doesn't contain scheme (e.g. "localhost:8001") http scheme is added.
                If url doesn't contain port (e.g. "localhost") default port for given scheme is added.
            network_timeout_s: timeout for client commands. Default value is 60.0 s.

        Returns:
            Triton Inference Server client.

        Raises:
            PyTritonClientInvalidUrlError: If provided Triton Inference Server url is invalid.
        """
        self._triton_url = TritonUrl.from_url(url)
        self._url = self._triton_url.without_scheme
        self._triton_client_lib = self.get_lib()
        self._monkey_patch_client()

        if self._triton_url.scheme == "grpc":
            # by default the grpc client has a very large timeout, thus we want to make it equal to the http client timeout
            network_timeout_s = _DEFAULT_NETWORK_TIMEOUT_S if network_timeout_s is None else network_timeout_s
            warnings.warn(
                f"tritonclient.grpc doesn't support timeout for other commands than infer. Ignoring network_timeout: {network_timeout_s}.",
                NotSupportedTimeoutWarning,
                stacklevel=1,
            )

        triton_client_init_kwargs = self._get_init_extra_args()

        _LOGGER.debug(
            f"Creating InferenceServerClient for {self._triton_url.with_scheme} with {triton_client_init_kwargs}"
        )
        return self._triton_client_lib.InferenceServerClient(self._url, **triton_client_init_kwargs)

    def get_lib(self):
        """Returns tritonclient library for given scheme."""
        raise NotImplementedError

    @property
    def _next_request_id(self) -> str:
        # pytype complained about creating generator in __init__ method
        # so we create it lazily
        if getattr(self, "_request_id_generator", None) is None:
            self._request_id_generator = itertools.count(0)
        return str(next(self._request_id_generator))

    def _get_init_extra_args(self):
        timeout = self._inference_timeout_s  # pytype: disable=attribute-error
        # The inference timeout is used for both the HTTP and the GRPC protocols. However,
        # the way the timeout is passed to the client differs depending on the protocol.
        # For the HTTP protocol, the timeout is set in the ``__init__`` method as ``network_timeout``
        # and ``connection_timeout``. For the GRPC protocol, the timeout
        # is passed to the infer method as ``client_timeout``.
        # Both protocols support timeouts correctly and will raise an exception
        # if the network request or the inference process takes longer than the timeout.
        # This is a design choice of the underlying tritonclient library.

        if self._triton_url.scheme != "http":
            return {}

        kwargs = {
            # This value sets the maximum time allowed for each network request in both model loading and inference process
            "network_timeout": timeout,
            # This value sets the maximum time allowed for establishing a connection to the server.
            # We use the inference timeout here instead of the init timeout because the init timeout
            # is meant for waiting for the model to be ready. The connection timeout should be shorter
            # than the init timeout because it only checks if connection is established (e.g. correct port)
            "connection_timeout": timeout,
        }
        return kwargs

    def _monkey_patch_client(self):
        pass

    def _get_model_config_extra_args(self):
        # For the GRPC protocol, the timeout must be passed to each request as client_timeout
        # model_config doesn't yet support timeout but it is planned for the future
        # grpc_network_timeout_s will be used for model_config
        return {}

    def _handle_lazy_init(self):
        raise NotImplementedError


def _run_once_per_lib(f):
    def wrapper(_self):
        if _self._triton_client_lib not in wrapper.patched:
            wrapper.patched.add(_self._triton_client_lib)
            return f(_self)

    wrapper.patched = set()
    return wrapper

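# A note on `_run_once_per_lib` (illustrative): the decorated method runs at most once per
# tritonclient module object, so the `__del__` patch defined below is applied once for the
# http library and once for the grpc library, not once per client instance. Sketch with
# placeholder URLs and model name:
#
#   client_a = ModelClient("http://localhost:8000", "MyModel")  # patches tritonclient.http
#   client_b = ModelClient("http://localhost:8000", "MyModel")  # already patched, skipped
#   client_c = ModelClient("grpc://localhost:8001", "MyModel")  # patches tritonclient.grpc
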
class ModelClient(BaseModelClient):
    """Synchronous client for model deployed on the Triton Inference Server."""

    def __init__(
        self,
        url: str,
        model_name: str,
        model_version: Optional[str] = None,
        *,
        lazy_init: bool = True,
        init_timeout_s: Optional[float] = None,
        inference_timeout_s: Optional[float] = None,
        model_config: Optional[TritonModelConfig] = None,
        ensure_model_is_ready: bool = True,
    ):
        """Inits ModelClient for given model deployed on the Triton Inference Server.

        If `lazy_init` argument is False, model configuration will be read
        from inference server during initialization.

        Common usage:

        ```python
        client = ModelClient("localhost", "BERT")
        result_dict = client.infer_sample(input1_sample, input2_sample)
        client.close()
        ```

        Client supports also context manager protocol:

        ```python
        with ModelClient("localhost", "BERT") as client:
            result_dict = client.infer_sample(input1_sample, input2_sample)
        ```

        Creating a client requires a connection to the server and downloading the model configuration. You can create a new client from an existing one using the same class:

        ```python
        client = ModelClient.from_existing_client(existing_client)
        ```

        Args:
            url: The Triton Inference Server url, e.g. 'grpc://localhost:8001'.
                In case no scheme is provided http scheme will be used as default.
                In case no port is provided default port for given scheme will be used -
                8001 for grpc scheme, 8000 for http scheme.
            model_name: name of the model to interact with.
            model_version: version of the model to interact with.
                If model_version is None inference on latest model will be performed.
                The latest versions of the model are numerically the greatest version numbers.
            lazy_init: if initialization should be performed just before sending first request to inference server.
            init_timeout_s: maximum time in seconds spent in the retry loop that asks the server whether the model is ready. It is applied at initialization time only when the `lazy_init` argument is False; otherwise the retry loop runs at the first inference.
            inference_timeout_s: timeout in seconds for the model inference process.
                If not passed, the default 60 seconds timeout will be used.
                For the HTTP client it is not only the inference timeout but the timeout for any client request
                (e.g. get model config, check if model is loaded). For the GRPC client it is only the inference timeout.
            model_config: model configuration. If not passed, it will be read from inference server during initialization.
            ensure_model_is_ready: if model should be checked if it is ready before first inference request.

        Raises:
            PyTritonClientModelUnavailableError: If model with given name (and version) is unavailable.
            PyTritonClientTimeoutError:
                if `lazy_init` argument is False and wait time for server and model being ready exceeds `init_timeout_s`.
            PyTritonClientUrlParseError: In case of problems with parsing url.
        """
        super().__init__(
            url=url,
            model_name=model_name,
            model_version=model_version,
            lazy_init=lazy_init,
            init_timeout_s=init_timeout_s,
            inference_timeout_s=inference_timeout_s,
            model_config=model_config,
            ensure_model_is_ready=ensure_model_is_ready,
        )

    def get_lib(self):
        """Returns tritonclient library for given scheme."""
        return {"grpc": tritonclient.grpc, "http": tritonclient.http}[self._triton_url.scheme.lower()]

    def __enter__(self):
        """Create context for using ModelClient as a context manager."""
        return self

    def __exit__(self, *_):
        """Close resources used by ModelClient instance when exiting from the context."""
        self.close()

    def load_model(self, config: Optional[str] = None, files: Optional[dict] = None):
        """Load model on the Triton Inference Server.

        Args:
            config: str - Optional JSON representation of a model config provided for
                the load request, if provided, this config will be used for
                loading the model.
            files: dict - Optional dictionary specifying file path (with "file:" prefix) in
                the override model directory to the file content as bytes.
                The files will form the model directory that the model will be
                loaded from. If specified, 'config' must be provided to be
                the model configuration of the override model directory.
        """
        self._general_client.load_model(self._model_name, config=config, files=files)

    def unload_model(self):
        """Unload model from the Triton Inference Server."""
        self._general_client.unload_model(self._model_name)

    def close(self):
        """Close resources used by ModelClient.

        This method closes the resources used by the ModelClient instance,
        including the Triton Inference Server connections.
        Once this method is called, the ModelClient instance should not be used again.
        """
        _LOGGER.debug("Closing ModelClient")
        try:
            if self._general_client is not None:
                self._general_client.close()
            if self._infer_client is not None:
                self._infer_client.close()
            self._general_client = None
            self._infer_client = None
        except Exception as e:
            _LOGGER.error(f"Error while closing ModelClient resources: {e}")
            raise e

    def wait_for_model(self, timeout_s: float):
        """Wait for the Triton Inference Server and the deployed model to be ready.

        Args:
            timeout_s: timeout in seconds to wait for the server and model to be ready.

        Raises:
            PyTritonClientTimeoutError: If the server and model are not ready before the given timeout.
            PyTritonClientModelUnavailableError: If the model with the given name (and version) is unavailable.
            KeyboardInterrupt: If the hosting process receives SIGINT.
            PyTritonClientClosedError: If the ModelClient is closed.
        """
        if self._general_client is None:
            raise PyTritonClientClosedError("ModelClient is closed")
        wait_for_model_ready(self._general_client, self._model_name, self._model_version, timeout_s=timeout_s)

    @property
    def is_batching_supported(self):
        """Checks if model supports batching.

        Also waits for server to get into readiness state.
        """
        return self.model_config.max_batch_size > 0

    def wait_for_server(self, timeout_s: float):
        """Wait for Triton Inference Server readiness.

        Args:
            timeout_s: timeout for the server to get into readiness state.

        Raises:
            PyTritonClientTimeoutError: If server is not in readiness state before given timeout.
            KeyboardInterrupt: If hosting process receives SIGINT
        """
        wait_for_server_ready(self._general_client, timeout_s=timeout_s)

    @property
    def model_config(self) -> TritonModelConfig:
        """Obtain the configuration of the model deployed on the Triton Inference Server.

        This method waits for the server to get into readiness state before obtaining the model configuration.

        Returns:
            TritonModelConfig: configuration of the model deployed on the Triton Inference Server.

        Raises:
            PyTritonClientTimeoutError: If the server and model are not in readiness state before the given timeout.
            PyTritonClientModelUnavailableError: If the model with the given name (and version) is unavailable.
            KeyboardInterrupt: If the hosting process receives SIGINT.
            PyTritonClientClosedError: If the ModelClient is closed.
        """
        if not self._model_config:
            if self._general_client is None:
                raise PyTritonClientClosedError("ModelClient is closed")

            self._model_config = get_model_config(
                self._general_client, self._model_name, self._model_version, timeout_s=self._init_timeout_s
            )
        return self._model_config

    def infer_sample(
        self,
        *inputs,
        parameters: Optional[Dict[str, Union[str, int, bool]]] = None,
        headers: Optional[Dict[str, Union[str, int, bool]]] = None,
        **named_inputs,
    ) -> Dict[str, np.ndarray]:
        """Run synchronous inference on a single data sample.

        Typical usage:

        ```python
        client = ModelClient("localhost", "MyModel")
        result_dict = client.infer_sample(input1, input2)
        client.close()
        ```

        Inference inputs can be provided either as positional or keyword arguments:

        ```python
        result_dict = client.infer_sample(input1, input2)
        result_dict = client.infer_sample(a=input1, b=input2)
        ```

        Args:
            *inputs: Inference inputs provided as positional arguments.
            parameters: Custom inference parameters.
            headers: Custom inference headers.
            **named_inputs: Inference inputs provided as named arguments.

        Returns:
            Dictionary with inference results, where dictionary keys are output names.

        Raises:
            PyTritonClientValueError: If mixing of positional and named arguments passing detected.
            PyTritonClientTimeoutError: If the wait time for the server and model being ready exceeds `init_timeout_s` or
                inference request time exceeds `inference_timeout_s`.
            PyTritonClientModelUnavailableError: If the model with the given name (and version) is unavailable.
            PyTritonClientInferenceServerError: If an error occurred on the inference callable or Triton Inference Server side.
        """
        _verify_inputs_args(inputs, named_inputs)
        _verify_parameters(parameters)
        _verify_parameters(headers)

        if self.is_batching_supported:
            if inputs:
                inputs = tuple(data[np.newaxis, ...] for data in inputs)
            elif named_inputs:
                named_inputs = {name: data[np.newaxis, ...] for name, data in named_inputs.items()}

        result = self._infer(inputs or named_inputs, parameters, headers)

        return self._debatch_result(result)

    def infer_batch(
        self,
        *inputs,
        parameters: Optional[Dict[str, Union[str, int, bool]]] = None,
        headers: Optional[Dict[str, Union[str, int, bool]]] = None,
        **named_inputs,
    ) -> Dict[str, np.ndarray]:
        """Run synchronous inference on batched data.

        Typical usage:

        ```python
        client = ModelClient("localhost", "MyModel")
        result_dict = client.infer_batch(input1, input2)
        client.close()
        ```

        Inference inputs can be provided either as positional or keyword arguments:

        ```python
        result_dict = client.infer_batch(input1, input2)
        result_dict = client.infer_batch(a=input1, b=input2)
        ```

        Args:
            *inputs: Inference inputs provided as positional arguments.
            parameters: Custom inference parameters.
            headers: Custom inference headers.
            **named_inputs: Inference inputs provided as named arguments.

        Returns:
            Dictionary with inference results, where dictionary keys are output names.

        Raises:
            PyTritonClientValueError: If mixing of positional and named arguments passing detected.
            PyTritonClientTimeoutError: If the wait time for the server and model being ready exceeds `init_timeout_s` or
                inference request time exceeds `inference_timeout_s`.
            PyTritonClientModelUnavailableError: If the model with the given name (and version) is unavailable.
            PyTritonClientInferenceServerError: If an error occurred on the inference callable or Triton Inference Server side.
            PyTritonClientModelDoesntSupportBatchingError: If the model doesn't support batching.
        """
        _verify_inputs_args(inputs, named_inputs)
        _verify_parameters(parameters)
        _verify_parameters(headers)

        if not self.is_batching_supported:
            raise PyTritonClientModelDoesntSupportBatchingError(
                f"Model {self.model_config.model_name} doesn't support batching - use infer_sample method instead"
            )

        return self._infer(inputs or named_inputs, parameters, headers)

    def _wait_and_init_model_config(self, init_timeout_s: float):
        if self._general_client is None:
            raise PyTritonClientClosedError("ModelClient is closed")

        should_finish_before_s = time.time() + init_timeout_s
        self.wait_for_model(init_timeout_s)
        self._model_ready = True
        timeout_s = max(0.0, should_finish_before_s - time.time())
        self._model_config = get_model_config(
            self._general_client, self._model_name, self._model_version, timeout_s=timeout_s
        )

    def _create_request(self, inputs: _IOType):
        if self._infer_client is None:
            raise PyTritonClientClosedError("ModelClient is closed")

        if not self._model_ready:
            self._wait_and_init_model_config(self._init_timeout_s)

        if isinstance(inputs, Tuple):
            inputs = {input_spec.name: input_data for input_spec, input_data in zip(self.model_config.inputs, inputs)}

        inputs_wrapped = []

        # to help pytype to obtain variable type
        inputs: Dict[str, np.ndarray]

        for input_name, input_data in inputs.items():
            if input_data.dtype == object and not isinstance(input_data.reshape(-1)[0], bytes):
                raise RuntimeError(
                    f"Numpy array for {input_name!r} input with dtype=object should contain encoded strings \
                    \\(e.g. into utf-8\\). Element type: {type(input_data.reshape(-1)[0])}"
                )
            if input_data.dtype.type == np.str_:
                raise RuntimeError(
                    "Unicode inputs are not supported. "
                    f"Encode numpy array for {input_name!r} input (ex. with np.char.encode(array, 'utf-8'))."
                )
            triton_dtype = tritonclient.utils.np_to_triton_dtype(input_data.dtype)
            infer_input = self._triton_client_lib.InferInput(input_name, input_data.shape, triton_dtype)
            infer_input.set_data_from_numpy(input_data)
            inputs_wrapped.append(infer_input)

        outputs_wrapped = [
            self._triton_client_lib.InferRequestedOutput(output_spec.name) for output_spec in self.model_config.outputs
        ]
        return inputs_wrapped, outputs_wrapped

    def _infer(self, inputs: _IOType, parameters, headers) -> Dict[str, np.ndarray]:
        if self.model_config.decoupled:
            raise PyTritonClientInferenceServerError("Model config is decoupled. Use DecoupledModelClient instead.")

        inputs_wrapped, outputs_wrapped = self._create_request(inputs)

        try:
            _LOGGER.debug("Sending inference request to Triton Inference Server")
            response = self._infer_client.infer(
                model_name=self._model_name,
                model_version=self._model_version or "",
                inputs=inputs_wrapped,
                headers=headers,
                outputs=outputs_wrapped,
                request_id=self._next_request_id,
                parameters=parameters,
                **self._get_infer_extra_args(),
            )
        except tritonclient.utils.InferenceServerException as e:
            # tritonclient.grpc raises an exception with a message containing "Deadline Exceeded" for timeout
            if "Deadline Exceeded" in e.message():
                raise PyTritonClientTimeoutError(
                    f"Timeout occurred during inference request. Timeout: {self._inference_timeout_s} s. Message: {e.message()}"
                ) from e

            raise PyTritonClientInferenceServerError(
                f"Error occurred during inference request. Message: {e.message()}"
            ) from e
        except socket.timeout as e:  # tritonclient.http raises socket.timeout for timeout
            message = f"Timeout occurred during inference request. Timeout: {self._inference_timeout_s} s Message: {e}"
            _LOGGER.error(message)
            raise PyTritonClientTimeoutError(message) from e
        except OSError as e:  # tritonclient.http raises socket.error for connection error
            message = f"Timeout occurred during inference request. Timeout: {self._inference_timeout_s} s Message: {e}"
            _LOGGER.error(message)
            raise PyTritonClientTimeoutError(message) from e

        if isinstance(response, tritonclient.http.InferResult):
            outputs = {
                output["name"]: response.as_numpy(output["name"]) for output in response.get_response()["outputs"]
            }
        else:
            outputs = {output.name: response.as_numpy(output.name) for output in response.get_response().outputs}

        return outputs

    def _get_numpy_result(self, result):
        if isinstance(result, tritonclient.grpc.InferResult):
            result = {output.name: result.as_numpy(output.name) for output in result.get_response().outputs}
        else:
            result = {output["name"]: result.as_numpy(output["name"]) for output in result.get_response()["outputs"]}
        return result

    def _debatch_result(self, result):
        if self.is_batching_supported:
            result = {name: data[0] for name, data in result.items()}
        return result

    def _handle_lazy_init(self):
        if not self._lazy_init:
            self._wait_and_init_model_config(self._init_timeout_s)

    def _get_infer_extra_args(self):
        if self._triton_url.scheme == "http":
            return {}
        # For the GRPC protocol, the timeout is passed to the infer method as client_timeout
        # This timeout applies to the whole inference process and each network request

        # The ``infer`` method also supports a timeout argument for both GRPC and HTTP.
        # It is applied at server side and supported only for dynamic batching.
        # However, it is not used here yet and planned for future release
        kwargs = {"client_timeout": self._inference_timeout_s}
        return kwargs

    @_run_once_per_lib
    def _monkey_patch_client(self):
        """Monkey patch InferenceServerClient to catch error in __del__."""
        _LOGGER.info(f"Patch ModelClient {self._triton_url.scheme}")
        if not hasattr(self._triton_client_lib.InferenceServerClient, "__del__"):
            return

        old_del = self._triton_client_lib.InferenceServerClient.__del__

        def _monkey_patched_del(self):
            """Monkey patched del."""
            try:
                old_del(self)
            except gevent.exceptions.InvalidThreadUseError:
                _LOGGER.info("gevent.exceptions.InvalidThreadUseError in __del__ of InferenceServerClient")
            except Exception as e:
                _LOGGER.error("Exception in __del__ of InferenceServerClient: %s", e)

        self._triton_client_lib.InferenceServerClient.__del__ = _monkey_patched_del

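# Illustrative end-to-end sketch for ModelClient (placeholder model name and shapes; adjust
# to the deployed model's signature):
#
#   import numpy as np
#
#   batch = np.random.rand(8, 3, 224, 224).astype(np.float32)
#   with ModelClient("grpc://localhost:8001", "MyModel", init_timeout_s=120.0) as client:
#       batched_outputs = client.infer_batch(batch)    # arrays sent as-is; model must support batching
#       single_output = client.infer_sample(batch[0])  # batch axis added before send, stripped from results
#
# infer_sample adds a leading batch axis before sending and strips it from the results,
# while infer_batch sends the arrays unchanged.
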
class DecoupledModelClient(ModelClient):
    """Synchronous client for decoupled model deployed on the Triton Inference Server."""

    def __init__(
        self,
        url: str,
        model_name: str,
        model_version: Optional[str] = None,
        *,
        lazy_init: bool = True,
        init_timeout_s: Optional[float] = None,
        inference_timeout_s: Optional[float] = None,
        model_config: Optional[TritonModelConfig] = None,
        ensure_model_is_ready: bool = True,
    ):
        """Inits DecoupledModelClient for given decoupled model deployed on the Triton Inference Server.

        Common usage:

        ```python
        client = DecoupledModelClient("localhost", "BERT")
        for response in client.infer_sample(input1_sample, input2_sample):
            print(response)
        client.close()
        ```

        Args:
            url: The Triton Inference Server url, e.g. `grpc://localhost:8001`.
                In case no scheme is provided http scheme will be used as default.
                In case no port is provided default port for given scheme will be used -
                8001 for grpc scheme, 8000 for http scheme.
            model_name: name of the model to interact with.
            model_version: version of the model to interact with.
                If model_version is None inference on latest model will be performed.
                The latest versions of the model are numerically the greatest version numbers.
            lazy_init: if initialization should be performed just before sending first request to inference server.
            init_timeout_s: timeout in seconds for the server and model to be ready. If not passed, the default timeout of 300 seconds will be used.
            inference_timeout_s: timeout in seconds for a single model inference request. If not passed, the default timeout of 60 seconds will be used.
            model_config: model configuration. If not passed, it will be read from inference server during initialization.
            ensure_model_is_ready: if model should be checked if it is ready before first inference request.

        Raises:
            PyTritonClientModelUnavailableError: If model with given name (and version) is unavailable.
            PyTritonClientTimeoutError:
                if `lazy_init` argument is False and wait time for server and model being ready exceeds `init_timeout_s`.
            PyTritonClientInvalidUrlError: If provided Triton Inference Server url is invalid.
        """
        super().__init__(
            url,
            model_name,
            model_version,
            lazy_init=lazy_init,
            init_timeout_s=init_timeout_s,
            inference_timeout_s=inference_timeout_s,
            model_config=model_config,
            ensure_model_is_ready=ensure_model_is_ready,
        )
        if self._triton_url.scheme == "http":
            raise PyTritonClientValueError("DecoupledModelClient is only supported for grpc protocol")
        self._queue = Queue()
        self._lock = Lock()

    def close(self):
        """Close resources used by DecoupledModelClient."""
        _LOGGER.debug("Closing DecoupledModelClient")
        if self._lock.acquire(blocking=False):
            try:
                super().close()
            finally:
                self._lock.release()
        else:
            _LOGGER.warning("DecoupledModelClient is still streaming answers")
            self._infer_client.stop_stream(False)
            super().close()

    def _infer(self, inputs: _IOType, parameters, headers):
        if not self._lock.acquire(blocking=False):
            raise PyTritonClientInferenceServerError("Inference is already in progress")
        if not self.model_config.decoupled:
            raise PyTritonClientInferenceServerError("Model config is coupled. Use ModelClient instead.")

        inputs_wrapped, outputs_wrapped = self._create_request(inputs)
        if parameters is not None:
            raise PyTritonClientValueError("DecoupledModelClient does not support parameters")
        if headers is not None:
            raise PyTritonClientValueError("DecoupledModelClient does not support headers")
        try:
            _LOGGER.debug("Sending inference request to Triton Inference Server")
            if self._infer_client._stream is None:
                self._infer_client.start_stream(callback=lambda result, error: self._response_callback(result, error))

            self._infer_client.async_stream_infer(
                model_name=self._model_name,
                model_version=self._model_version or "",
                inputs=inputs_wrapped,
                outputs=outputs_wrapped,
                request_id=self._next_request_id,
                enable_empty_final_response=True,
                **self._get_infer_extra_args(),
            )
        except tritonclient.utils.InferenceServerException as e:
            # tritonclient.grpc raises an exception with a message containing "Deadline Exceeded" for timeout
            if "Deadline Exceeded" in e.message():
                raise PyTritonClientTimeoutError(
                    f"Timeout occurred during inference request. Timeout: {self._inference_timeout_s} s. Message: {e.message()}"
                ) from e

            raise PyTritonClientInferenceServerError(
                f"Error occurred during inference request. Message: {e.message()}"
            ) from e
        except socket.timeout as e:  # tritonclient.http raises socket.timeout for timeout
            message = f"Timeout occurred during inference request. Timeout: {self._inference_timeout_s} s Message: {e}"
            _LOGGER.error(message)
            raise PyTritonClientTimeoutError(message) from e
        except OSError as e:  # tritonclient.http raises socket.error for connection error
            message = f"Timeout occurred during inference request. Timeout: {self._inference_timeout_s} s Message: {e}"
            _LOGGER.error(message)
            raise PyTritonClientTimeoutError(message) from e
        _LOGGER.debug("Returning response iterator")
        return self._create_response_iterator()

    def _response_callback(self, response, error):
        _LOGGER.debug(f"Received response from Triton Inference Server: {response}")
        if error:
            _LOGGER.error(f"Error occurred during inference request. Message: {error}")
            self._queue.put(error)
        else:
            actual_response = response.get_response()
            # Check if the object is not None
            triton_final_response = actual_response.parameters.get("triton_final_response")
            if triton_final_response and triton_final_response.bool_param:
                self._queue.put(None)
            else:
                result = self._get_numpy_result(response)
                self._queue.put(result)

    def _create_response_iterator(self):
        try:
            while True:
                try:
                    item = self._queue.get(self._inference_timeout_s)
                except Empty as e:
                    message = f"Timeout occurred during inference request. Timeout: {self._inference_timeout_s} s"
                    _LOGGER.error(message)
                    raise PyTritonClientTimeoutError(message) from e
                if isinstance(item, Exception):
                    message = f"Error occurred during inference request. Message: {item.message()}"
                    _LOGGER.error(message)
                    raise PyTritonClientInferenceServerError(message) from item

                if item is None:
                    break
                yield item
        finally:
            self._lock.release()

    def _debatch_result(self, result):
        if self.is_batching_supported:
            result = ({name: data[0] for name, data in result_.items()} for result_ in result)
        return result

    def _get_infer_extra_args(self):
        # kwargs = super()._get_infer_extra_args()
        kwargs = {}
        # kwargs["enable_empty_final_response"] = True
        return kwargs

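# Illustrative streaming sketch for DecoupledModelClient (grpc only; placeholder names):
#
#   with DecoupledModelClient("grpc://localhost:8001", "StreamingModel") as client:
#       for partial_result in client.infer_sample(first_input):
#           ...  # each item is a dict of numpy arrays; iteration stops on the final-response flag
#
# The iterator is fed by `_response_callback` through an internal queue; consume it fully
# (or close the client) before issuing another request, since a single stream is active at a time.
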
class AsyncioModelClient(BaseModelClient):
    """Asyncio client for model deployed on the Triton Inference Server.

    This client is based on Triton Inference Server Python clients and GRPC library:
      - ``tritonclient.http.aio.InferenceServerClient``
      - ``tritonclient.grpc.aio.InferenceServerClient``

    It can wait for server to be ready with model loaded and then perform inference on it.
    ``AsyncioModelClient`` supports asyncio context manager protocol.

    Typical usage:

    ```python
    from pytriton.client import AsyncioModelClient
    import numpy as np

    input1_sample = np.random.rand(1, 3, 224, 224).astype(np.float32)
    input2_sample = np.random.rand(1, 3, 224, 224).astype(np.float32)

    client = AsyncioModelClient("localhost", "MyModel")
    result_dict = await client.infer_sample(input1_sample, input2_sample)
    print(result_dict["output_name"])
    await client.close()
    ```
    """

    def __init__(
        self,
        url: str,
        model_name: str,
        model_version: Optional[str] = None,
        *,
        lazy_init: bool = True,
        init_timeout_s: Optional[float] = None,
        inference_timeout_s: Optional[float] = None,
        model_config: Optional[TritonModelConfig] = None,
        ensure_model_is_ready: bool = True,
    ):
        """Inits AsyncioModelClient for given model deployed on the Triton Inference Server.

        If `lazy_init` argument is False, model configuration will be read
        from inference server during initialization.

        Args:
            url: The Triton Inference Server url, e.g. 'grpc://localhost:8001'.
                In case no scheme is provided http scheme will be used as default.
                In case no port is provided default port for given scheme will be used -
                8001 for grpc scheme, 8000 for http scheme.
            model_name: name of the model to interact with.
            model_version: version of the model to interact with.
                If model_version is None inference on latest model will be performed.
                The latest versions of the model are numerically the greatest version numbers.
            lazy_init: if initialization should be performed just before sending first request to inference server.
            init_timeout_s: timeout for server and model being ready.
            inference_timeout_s: timeout in seconds for a single model inference request. If not passed, the default timeout of 60 seconds will be used.
            model_config: model configuration. If not passed, it will be read from inference server during initialization.
            ensure_model_is_ready: if model should be checked if it is ready before first inference request.

        Raises:
            PyTritonClientModelUnavailableError: If model with given name (and version) is unavailable.
            PyTritonClientTimeoutError: if `lazy_init` argument is False and wait time for server and model being ready exceeds `init_timeout_s`.
            PyTritonClientUrlParseError: In case of problems with parsing url.
        """
        super().__init__(
            url=url,
            model_name=model_name,
            model_version=model_version,
            lazy_init=lazy_init,
            init_timeout_s=init_timeout_s,
            inference_timeout_s=inference_timeout_s,
            model_config=model_config,
            ensure_model_is_ready=ensure_model_is_ready,
        )

    def get_lib(self):
        """Get Triton Inference Server Python client library."""
        return {"grpc": tritonclient.grpc.aio, "http": tritonclient.http.aio}[self._triton_url.scheme.lower()]

    async def __aenter__(self):
        """Create context for using AsyncioModelClient as a context manager."""
        _LOGGER.debug("Entering AsyncioModelClient context")
        try:
            if not self._lazy_init:
                _LOGGER.debug("Waiting in AsyncioModelClient context for model to be ready")
                await self._wait_and_init_model_config(self._init_timeout_s)
                _LOGGER.debug("Model is ready in AsyncioModelClient context")
            return self
        except Exception as e:
            _LOGGER.error("Error occurred during AsyncioModelClient context initialization")
            await self.close()
            raise e

    async def __aexit__(self, *_):
        """Close resources used by AsyncioModelClient when exiting from context."""
        await self.close()
        _LOGGER.debug("Exiting AsyncioModelClient context")

    async def close(self):
        """Close resources used by AsyncioModelClient."""
        _LOGGER.debug("Closing InferenceServerClient")
        await self._general_client.close()
        await self._infer_client.close()
        _LOGGER.debug("InferenceServerClient closed")

    async def wait_for_model(self, timeout_s: float):
        """Asynchronously wait for the Triton Inference Server and the deployed model to be ready.

        Args:
            timeout_s: timeout for the server and model to get into readiness state.

        Raises:
            PyTritonClientTimeoutError: If server and model are not in readiness state before given timeout.
            PyTritonClientModelUnavailableError: If model with given name (and version) is unavailable.
            KeyboardInterrupt: If hosting process receives SIGINT
        """
        _LOGGER.debug(f"Waiting for model {self._model_name} to be ready")
        try:
            await asyncio.wait_for(
                asyncio_wait_for_model_ready(
                    self._general_client, self._model_name, self._model_version, timeout_s=timeout_s
                ),
                self._init_timeout_s,
            )
        except asyncio.TimeoutError as e:
            message = f"Timeout while waiting for model {self._model_name} to be ready for {self._init_timeout_s}s"
            _LOGGER.error(message)
            raise PyTritonClientTimeoutError(message) from e

    @property
    async def model_config(self):
        """Obtain configuration of model deployed on the Triton Inference Server.

        Also waits for server to get into readiness state.
        """
        try:
            if not self._model_config:
                kwargs = self._get_model_config_extra_args()
                _LOGGER.debug(f"Obtaining model config for {self._model_name}")

                self._model_config = await asyncio.wait_for(
                    asyncio_get_model_config(
                        self._general_client,
                        self._model_name,
                        self._model_version,
                        timeout_s=self._init_timeout_s,
                        **kwargs,
                    ),
                    self._init_timeout_s,
                )
                _LOGGER.debug(f"Obtained model config for {self._model_name}")
            return self._model_config
        except asyncio.TimeoutError as e:
            message = f"Timeout while waiting for model {self._model_name} to be ready for {self._init_timeout_s}s"
            _LOGGER.error(message)
            raise PyTritonClientTimeoutError(message) from e

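    # Illustrative asyncio usage sketch (placeholder model name and input):
    #
    #   async def main():
    #       async with AsyncioModelClient("grpc://localhost:8001", "MyModel") as client:
    #           config = await client.model_config  # the property is awaitable on this client
    #           result = await client.infer_sample(input1_sample)
    #
    #   asyncio.run(main())
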
1072 |
+
async def infer_sample(
|
1073 |
+
self,
|
1074 |
+
*inputs,
|
1075 |
+
parameters: Optional[Dict[str, Union[str, int, bool]]] = None,
|
1076 |
+
headers: Optional[Dict[str, Union[str, int, bool]]] = None,
|
1077 |
+
**named_inputs,
|
1078 |
+
):
|
1079 |
+
"""Run asynchronous inference on single data sample.
|
1080 |
+
|
1081 |
+
Typical usage:
|
1082 |
+
|
1083 |
+
```python
|
1084 |
+
client = AsyncioModelClient("localhost", "MyModel")
|
1085 |
+
result_dict = await client.infer_sample(input1, input2)
|
1086 |
+
await client.close()
|
1087 |
+
```
|
1088 |
+
|
1089 |
+
Inference inputs can be provided either as positional or keyword arguments:
|
1090 |
+
|
1091 |
+
```python
|
1092 |
+
result_dict = await client.infer_sample(input1, input2)
|
1093 |
+
result_dict = await client.infer_sample(a=input1, b=input2)
|
1094 |
+
```
|
1095 |
+
|
1096 |
+
Mixing of argument passing conventions is not supported and will raise PyTritonClientRuntimeError.
|
1097 |
+
|
1098 |
+
Args:
|
1099 |
+
*inputs: inference inputs provided as positional arguments.
|
1100 |
+
parameters: custom inference parameters.
|
1101 |
+
headers: custom inference headers.
|
1102 |
+
**named_inputs: inference inputs provided as named arguments.
|
1103 |
+
|
1104 |
+
Returns:
|
1105 |
+
dictionary with inference results, where dictionary keys are output names.
|
1106 |
+
|
1107 |
+
Raises:
|
1108 |
+
PyTritonClientValueError: if mixing of positional and named arguments passing detected.
|
1109 |
+
PyTritonClientTimeoutError:
|
1110 |
+
in case of first method call, `lazy_init` argument is False
|
1111 |
+
and wait time for server and model being ready exceeds `init_timeout_s`
|
1112 |
+
or inference time exceeds `timeout_s`.
|
1113 |
+
PyTritonClientModelUnavailableError: If model with given name (and version) is unavailable.
|
1114 |
+
PyTritonClientInferenceServerError: If error occurred on inference callable or Triton Inference Server side.
|
1115 |
+
"""
|
1116 |
+
_verify_inputs_args(inputs, named_inputs)
|
1117 |
+
_verify_parameters(parameters)
|
1118 |
+
_verify_parameters(headers)
|
1119 |
+
|
1120 |
+
_LOGGER.debug(f"Running inference for {self._model_name}")
|
1121 |
+
model_config = await self.model_config
|
1122 |
+
_LOGGER.debug(f"Model config for {self._model_name} obtained")
|
1123 |
+
|
1124 |
+
model_supports_batching = model_config.max_batch_size > 0
|
1125 |
+
if model_supports_batching:
|
1126 |
+
if inputs:
|
1127 |
+
inputs = tuple(data[np.newaxis, ...] for data in inputs)
|
1128 |
+
elif named_inputs:
|
1129 |
+
named_inputs = {name: data[np.newaxis, ...] for name, data in named_inputs.items()}
|
1130 |
+
|
1131 |
+
_LOGGER.debug(f"Running _infer for {self._model_name}")
|
1132 |
+
result = await self._infer(inputs or named_inputs, parameters, headers)
|
1133 |
+
_LOGGER.debug(f"_infer for {self._model_name} finished")
|
1134 |
+
if model_supports_batching:
|
1135 |
+
result = {name: data[0] for name, data in result.items()}
|
1136 |
+
|
1137 |
+
return result
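# Illustrative usage sketch, not part of the original module: it assumes a Triton server at
# grpc://localhost:8001 serving a model named "MyModel" with a single float32 input, and that
# AsyncioModelClient is importable from pytriton.client (the decoupled variant is documented
# that way later in this file).
import asyncio
import numpy as np
from pytriton.client import AsyncioModelClient

async def _infer_sample_example():
    input_sample = np.random.rand(3, 224, 224).astype(np.float32)
    async with AsyncioModelClient("grpc://localhost:8001", "MyModel") as client:
        # keys of the returned dictionary are the model output names
        result_dict = await client.infer_sample(input_sample)
        return {name: value.shape for name, value in result_dict.items()}

# asyncio.run(_infer_sample_example())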
|
1138 |
+
|
1139 |
+
async def infer_batch(
|
1140 |
+
self,
|
1141 |
+
*inputs,
|
1142 |
+
parameters: Optional[Dict[str, Union[str, int, bool]]] = None,
|
1143 |
+
headers: Optional[Dict[str, Union[str, int, bool]]] = None,
|
1144 |
+
**named_inputs,
|
1145 |
+
):
|
1146 |
+
"""Run asynchronous inference on batched data.
|
1147 |
+
|
1148 |
+
Typical usage:
|
1149 |
+
|
1150 |
+
```python
|
1151 |
+
client = AsyncioModelClient("localhost", "MyModel")
|
1152 |
+
result_dict = await client.infer_batch(input1, input2)
|
1153 |
+
await client.close()
|
1154 |
+
```
|
1155 |
+
|
1156 |
+
Inference inputs can be provided either as positional or keyword arguments:
|
1157 |
+
|
1158 |
+
```python
|
1159 |
+
result_dict = await client.infer_batch(input1, input2)
|
1160 |
+
result_dict = await client.infer_batch(a=input1, b=input2)
|
1161 |
+
```
|
1162 |
+
|
1163 |
+
Mixing of argument passing conventions is not supported and will raise PyTritonClientValueError.
|
1164 |
+
|
1165 |
+
Args:
|
1166 |
+
*inputs: inference inputs provided as positional arguments.
|
1167 |
+
parameters: custom inference parameters.
|
1168 |
+
headers: custom inference headers.
|
1169 |
+
**named_inputs: inference inputs provided as named arguments.
|
1170 |
+
|
1171 |
+
Returns:
|
1172 |
+
dictionary with inference results, where dictionary keys are output names.
|
1173 |
+
|
1174 |
+
Raises:
|
1175 |
+
PyTritonClientValueError: if mixing of positional and named arguments passing detected.
|
1176 |
+
PyTritonClientTimeoutError:
|
1177 |
+
in case of first method call, `lazy_init` argument is False
|
1178 |
+
and wait time for server and model being ready exceeds `init_timeout_s`
|
1179 |
+
or inference time exceeds `timeout_s`.
|
1180 |
+
PyTritonClientModelDoesntSupportBatchingError: if model doesn't support batching.
|
1181 |
+
PyTritonClientModelUnavailableError: If model with given name (and version) is unavailable.
|
1182 |
+
PyTritonClientInferenceServerError: If error occurred on inference callable or Triton Inference Server side.
|
1183 |
+
"""
|
1184 |
+
_verify_inputs_args(inputs, named_inputs)
|
1185 |
+
_verify_parameters(parameters)
|
1186 |
+
_verify_parameters(headers)
|
1187 |
+
|
1188 |
+
_LOGGER.debug(f"Running inference for {self._model_name}")
|
1189 |
+
model_config = await self.model_config
|
1190 |
+
_LOGGER.debug(f"Model config for {self._model_name} obtained")
|
1191 |
+
|
1192 |
+
model_supports_batching = model_config.max_batch_size > 0
|
1193 |
+
if not model_supports_batching:
|
1194 |
+
_LOGGER.error(f"Model {model_config.model_name} doesn't support batching")
|
1195 |
+
raise PyTritonClientModelDoesntSupportBatchingError(
|
1196 |
+
f"Model {model_config.model_name} doesn't support batching - use infer_sample method instead"
|
1197 |
+
)
|
1198 |
+
|
1199 |
+
_LOGGER.debug(f"Running _infer for {self._model_name}")
|
1200 |
+
result = await self._infer(inputs or named_inputs, parameters, headers)
|
1201 |
+
_LOGGER.debug(f"_infer for {self._model_name} finished")
|
1202 |
+
return result
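# Illustrative sketch for the batched call, assuming the model was configured with
# max_batch_size > 0 and exposes an input named "input" (both assumptions); the first axis
# of the array is the batch axis and is passed through unchanged by infer_batch.
import asyncio
import numpy as np
from pytriton.client import AsyncioModelClient  # import path assumed

async def _infer_batch_example():
    batch = np.random.rand(8, 10).astype(np.float32)
    async with AsyncioModelClient("grpc://localhost:8001", "MyModel") as client:
        return await client.infer_batch(input=batch)

# asyncio.run(_infer_batch_example())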
|
1203 |
+
|
1204 |
+
async def _wait_and_init_model_config(self, init_timeout_s: float):
|
1205 |
+
"""Asynchronous wait for model and obtain model configuration.
|
1206 |
+
|
1207 |
+
Args:
|
1208 |
+
init_timeout_s: timeout for server and model being ready.
|
1209 |
+
|
1210 |
+
Raises:
|
1211 |
+
PyTritonClientTimeoutError: if wait time for server and model being ready exceeds `init_timeout_s`
|
1212 |
+
PyTritonClientModelUnavailableError: If model with given name (and version) is unavailable.
|
1213 |
+
"""
|
1214 |
+
try:
|
1215 |
+
should_finish_before_s = time.time() + init_timeout_s
|
1216 |
+
_LOGGER.debug(f"Waiting for model {self._model_name} to be ready")
|
1217 |
+
|
1218 |
+
await asyncio.wait_for(self.wait_for_model(init_timeout_s), init_timeout_s)
|
1219 |
+
_LOGGER.debug(f"Model {self._model_name} is ready")
|
1220 |
+
self._model_ready = True
|
1221 |
+
|
1222 |
+
timeout_s = max(0.0, should_finish_before_s - time.time())
|
1223 |
+
_LOGGER.debug(f"Obtaining model config for {self._model_name}")
|
1224 |
+
self._model_config = await asyncio.wait_for(
|
1225 |
+
asyncio_get_model_config(
|
1226 |
+
self._general_client, self._model_name, self._model_version, timeout_s=timeout_s
|
1227 |
+
),
|
1228 |
+
timeout_s,
|
1229 |
+
)
|
1230 |
+
_LOGGER.debug(f"Model config for {self._model_name} obtained")
|
1231 |
+
except asyncio.TimeoutError as e:
|
1232 |
+
_LOGGER.error(f"Timeout exceeded while waiting for model {self._model_name} to be ready")
|
1233 |
+
raise PyTritonClientTimeoutError(
|
1234 |
+
f"Timeout exceeded while waiting for model {self._model_name} to be ready"
|
1235 |
+
) from e
|
1236 |
+
|
1237 |
+
def _validate_input(self, input_name, input_data):
|
1238 |
+
if input_data.dtype == object and not isinstance(input_data.reshape(-1)[0], bytes):
|
1239 |
+
raise RuntimeError(
|
1240 |
+
f"Numpy array for {input_name!r} input with dtype=object should contain encoded strings \
|
1241 |
+
\\(e.g. into utf-8\\). Element type: {type(input_data.reshape(-1)[0])}"
|
1242 |
+
)
|
1243 |
+
if input_data.dtype.type == np.str_:
|
1244 |
+
raise RuntimeError(
|
1245 |
+
"Unicode inputs are not supported. "
|
1246 |
+
f"Encode numpy array for {input_name!r} input (ex. with np.char.encode(array, 'utf-8'))."
|
1247 |
+
)
|
1248 |
+
|
1249 |
+
async def _execute_infer(self, model_config, inputs_wrapped, outputs_wrapped, parameters, headers) -> Any:
|
1250 |
+
try:
|
1251 |
+
_LOGGER.debug(f"Sending InferRequest for {self._model_name}")
|
1252 |
+
kwargs = self._get_infer_extra_args()
|
1253 |
+
response = await self._infer_client.infer(
|
1254 |
+
model_name=self._model_name,
|
1255 |
+
model_version=self._model_version or "",
|
1256 |
+
inputs=inputs_wrapped,
|
1257 |
+
headers=headers,
|
1258 |
+
outputs=outputs_wrapped,
|
1259 |
+
request_id=self._next_request_id,
|
1260 |
+
parameters=parameters,
|
1261 |
+
**kwargs,
|
1262 |
+
)
|
1263 |
+
except asyncio.exceptions.TimeoutError as e:
|
1264 |
+
# HTTP aio client raises asyncio.exceptions.TimeoutError for timeout errors
|
1265 |
+
message = f"Timeout exceeded while running inference for {self._model_name}"
|
1266 |
+
_LOGGER.error(message)
|
1267 |
+
raise PyTritonClientTimeoutError(message) from e
|
1268 |
+
except tritonclient.utils.InferenceServerException as e:
|
1269 |
+
message = f"Error occurred on Triton Inference Server side:\n {e.message()}"
|
1270 |
+
_LOGGER.error(message)
|
1271 |
+
if "Deadline Exceeded" in e.message():
|
1272 |
+
# GRPC aio client raises InferenceServerException with message "Deadline Exceeded"
|
1273 |
+
# for timeout errors
|
1274 |
+
raise PyTritonClientTimeoutError(message) from e
|
1275 |
+
else:
|
1276 |
+
raise PyTritonClientInferenceServerError(message) from e
|
1277 |
+
_LOGGER.debug(f"Received InferResponse for {self._model_name}")
|
1278 |
+
outputs = {output_spec.name: response.as_numpy(output_spec.name) for output_spec in model_config.outputs}
|
1279 |
+
return outputs
|
1280 |
+
|
1281 |
+
async def _infer(self, inputs: _IOType, parameters, headers):
|
1282 |
+
if self._model_ready:
|
1283 |
+
_LOGGER.debug(f"Waiting for model {self._model_name} config")
|
1284 |
+
await self._wait_and_init_model_config(self._init_timeout_s)
|
1285 |
+
_LOGGER.debug(f"Model wait finished for {self._model_name}")
|
1286 |
+
|
1287 |
+
_LOGGER.debug(f"Obtaining config for {self._model_name}")
|
1288 |
+
model_config = await self.model_config
|
1289 |
+
_LOGGER.debug(f"Model config for {self._model_name} obtained")
|
1290 |
+
if model_config.decoupled:
|
1291 |
+
raise PyTritonClientInferenceServerError(
|
1292 |
+
"Model config is decoupled. Use DecouploedAsyncioModelClient instead."
|
1293 |
+
)
|
1294 |
+
|
1295 |
+
if isinstance(inputs, Tuple):
|
1296 |
+
inputs = {input_spec.name: input_data for input_spec, input_data in zip(model_config.inputs, inputs)}
|
1297 |
+
|
1298 |
+
inputs_wrapped = []
|
1299 |
+
for input_name, input_data in inputs.items():
|
1300 |
+
if isinstance(input_data, np.ndarray):
|
1301 |
+
self._validate_input(input_name, input_data)
|
1302 |
+
triton_dtype = tritonclient.utils.np_to_triton_dtype(input_data.dtype)
|
1303 |
+
infer_input = self._triton_client_lib.InferInput(input_name, input_data.shape, triton_dtype)
|
1304 |
+
infer_input.set_data_from_numpy(input_data)
|
1305 |
+
input_wrapped = infer_input
|
1306 |
+
inputs_wrapped.append(input_wrapped)
|
1307 |
+
else:
|
1308 |
+
raise PyTritonClientValueError(
|
1309 |
+
f"Input {input_name} is not a numpy array. Got {type(input_data)} instead."
|
1310 |
+
)
|
1311 |
+
|
1312 |
+
outputs_wrapped = [
|
1313 |
+
self._triton_client_lib.InferRequestedOutput(output_spec.name) for output_spec in model_config.outputs
|
1314 |
+
]
|
1315 |
+
return await self._execute_infer(model_config, inputs_wrapped, outputs_wrapped, parameters, headers)
|
1316 |
+
|
1317 |
+
def _handle_lazy_init(self):
|
1318 |
+
# Asynchronous lazy initialization is done in __aenter__ method
|
1319 |
+
pass
|
1320 |
+
|
1321 |
+
def _get_init_extra_args(self):
|
1322 |
+
# The inference timeout is used for both the HTTP and the GRPC protocols. However,
|
1323 |
+
# the way the timeout is passed to the client differs depending on the protocol.
|
1324 |
+
# For the HTTP protocol, the timeout is set in the ``__init__`` method as ``conn_timeout`` for both connection and request timeouts.
|
1325 |
+
# For the GRPC protocol, the timeout
|
1326 |
+
# is passed to the infer method as ``client_timeout``.
|
1327 |
+
# Both protocols support timeouts correctly and will raise an exception
|
1328 |
+
# if the network request or the inference process takes longer than the timeout.
|
1329 |
+
# This is a design choice of the underlying tritonclient library.
|
1330 |
+
|
1331 |
+
if self._triton_url.scheme != "http":
|
1332 |
+
return {}
|
1333 |
+
|
1334 |
+
kwargs = {
|
1335 |
+
# This value sets the maximum time allowed for both connection and network requests in both model loading and inference process
|
1336 |
+
"conn_timeout": self._inference_timeout_s,
|
1337 |
+
}
|
1338 |
+
return kwargs
|
1339 |
+
|
1340 |
+
def _get_infer_extra_args(self):
|
1341 |
+
if self._triton_url.scheme == "http":
|
1342 |
+
return {}
|
1343 |
+
# For the GRPC protocol, the timeout is passed to the infer method as client_timeout
|
1344 |
+
# This timeout applies to the whole inference process and each network request
|
1345 |
+
|
1346 |
+
# The ``infer`` method also supports a timeout argument for both GRPC and HTTP.
|
1347 |
+
# It is applied at server side and supported only for dynamic batching.
|
1348 |
+
# However, it is not used here yet and is planned for a future release.
|
1349 |
+
kwargs = {"client_timeout": self._inference_timeout_s}
|
1350 |
+
return kwargs
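# Condensed sketch of the timeout plumbing described in the comments of
# _get_init_extra_args and _get_infer_extra_args above (illustrative, mirrors the two
# helpers rather than replacing them): HTTP gets the timeout once at construction time as
# ``conn_timeout``, GRPC gets it on every call as ``client_timeout``.
def _timeout_kwargs_sketch(scheme: str, inference_timeout_s: float):
    init_kwargs = {"conn_timeout": inference_timeout_s} if scheme == "http" else {}
    infer_kwargs = {} if scheme == "http" else {"client_timeout": inference_timeout_s}
    return init_kwargs, infer_kwargs

# _timeout_kwargs_sketch("grpc", 60.0) -> ({}, {"client_timeout": 60.0})
# _timeout_kwargs_sketch("http", 60.0) -> ({"conn_timeout": 60.0}, {})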
|
1351 |
+
|
1352 |
+
|
1353 |
+
class AsyncioDecoupledModelClient(AsyncioModelClient):
|
1354 |
+
"""Asyncio client for model deployed on the Triton Inference Server.
|
1355 |
+
|
1356 |
+
This client is based on Triton Inference Server Python clients and GRPC library:
|
1357 |
+
* ``tritonclient.grpc.aio.InferenceServerClient``
|
1358 |
+
|
1359 |
+
It can wait for server to be ready with model loaded and then perform inference on it.
|
1360 |
+
``AsyncioDecoupledModelClient`` supports asyncio context manager protocol.
|
1361 |
+
|
1362 |
+
The client is intended to be used with decoupled models and will raise an error if model is coupled.
|
1363 |
+
|
1364 |
+
Typical usage:
|
1365 |
+
```python
|
1366 |
+
from pytriton.client import AsyncioDecoupledModelClient
|
1367 |
+
import numpy as np
|
1368 |
+
|
1369 |
+
input1_sample = np.random.rand(1, 3, 224, 224).astype(np.float32)
|
1370 |
+
input2_sample = np.random.rand(1, 3, 224, 224).astype(np.float32)
|
1371 |
+
|
1372 |
+
async with AsyncioDecoupledModelClient("grpc://localhost", "MyModel") as client:
|
1373 |
+
async for result_dict in client.infer_sample(input1_sample, input2_sample):
|
1374 |
+
print(result_dict["output_name"])
|
1375 |
+
```
|
1376 |
+
"""
|
1377 |
+
|
1378 |
+
async def infer_sample(
|
1379 |
+
self,
|
1380 |
+
*inputs,
|
1381 |
+
parameters: Optional[Dict[str, Union[str, int, bool]]] = None,
|
1382 |
+
headers: Optional[Dict[str, Union[str, int, bool]]] = None,
|
1383 |
+
**named_inputs,
|
1384 |
+
):
|
1385 |
+
"""Run asynchronous inference on single data sample.
|
1386 |
+
|
1387 |
+
Typical usage:
|
1388 |
+
|
1389 |
+
```python
|
1390 |
+
async with AsyncioDecoupledModelClient("grpc://localhost", "MyModel") as client:
|
1391 |
+
async for result_dict in client.infer_sample(input1_sample, input2_sample):
|
1392 |
+
print(result_dict["output_name"])
|
1393 |
+
```
|
1394 |
+
|
1395 |
+
Inference inputs can be provided either as positional or keyword arguments:
|
1396 |
+
|
1397 |
+
```python
|
1398 |
+
results_iterator = client.infer_sample(input1, input2)
|
1399 |
+
results_iterator = client.infer_sample(a=input1, b=input2)
|
1400 |
+
```
|
1401 |
+
|
1402 |
+
Mixing of argument passing conventions is not supported and will raise PyTritonClientValueError.
|
1403 |
+
|
1404 |
+
Args:
|
1405 |
+
*inputs: inference inputs provided as positional arguments.
|
1406 |
+
parameters: custom inference parameters.
|
1407 |
+
headers: custom inference headers.
|
1408 |
+
**named_inputs: inference inputs provided as named arguments.
|
1409 |
+
|
1410 |
+
Returns:
|
1411 |
+
Asynchronous generator, which generates dictionaries with partial inference results, where dictionary keys are output names.
|
1412 |
+
|
1413 |
+
Raises:
|
1414 |
+
PyTritonClientValueError: if mixing of positional and named arguments passing detected.
|
1415 |
+
PyTritonClientTimeoutError:
|
1416 |
+
in case of first method call, `lazy_init` argument is False
|
1417 |
+
and wait time for server and model being ready exceeds `init_timeout_s`
|
1418 |
+
or inference time exceeds `timeout_s`.
|
1419 |
+
PyTritonClientModelUnavailableError: If model with given name (and version) is unavailable.
|
1420 |
+
PyTritonClientInferenceServerError: If error occurred on inference callable or Triton Inference Server side.
|
1421 |
+
"""
|
1422 |
+
_verify_inputs_args(inputs, named_inputs)
|
1423 |
+
_verify_parameters(parameters)
|
1424 |
+
_verify_parameters(headers)
|
1425 |
+
|
1426 |
+
_LOGGER.debug(f"Running inference for {self._model_name}")
|
1427 |
+
model_config = await self.model_config
|
1428 |
+
_LOGGER.debug(f"Model config for {self._model_name} obtained")
|
1429 |
+
|
1430 |
+
model_supports_batching = model_config.max_batch_size > 0
|
1431 |
+
if model_supports_batching:
|
1432 |
+
if inputs:
|
1433 |
+
inputs = tuple(data[np.newaxis, ...] for data in inputs)
|
1434 |
+
elif named_inputs:
|
1435 |
+
named_inputs = {name: data[np.newaxis, ...] for name, data in named_inputs.items()}
|
1436 |
+
|
1437 |
+
_LOGGER.debug(f"Running _infer for {self._model_name}")
|
1438 |
+
result = self._infer(inputs or named_inputs, parameters, headers)
|
1439 |
+
_LOGGER.debug(f"_infer for {self._model_name} finished")
|
1440 |
+
|
1441 |
+
async for item in result:
|
1442 |
+
if model_supports_batching:
|
1443 |
+
debatched_item = {name: data[0] for name, data in item.items()}
|
1444 |
+
yield debatched_item
|
1445 |
+
else:
|
1446 |
+
yield item
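# Illustrative consumption of the decoupled streaming API; the server address, model name
# and input array are placeholders. Each yielded dictionary is one partial response keyed
# by output name, as described in the docstring above.
import asyncio
import numpy as np
from pytriton.client import AsyncioDecoupledModelClient

async def _stream_example():
    sample = np.random.rand(3, 224, 224).astype(np.float32)
    async with AsyncioDecoupledModelClient("grpc://localhost:8001", "MyStreamingModel") as client:
        async for partial_result in client.infer_sample(sample):
            print({name: value.shape for name, value in partial_result.items()})

# asyncio.run(_stream_example())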
|
1447 |
+
|
1448 |
+
async def infer_batch(
|
1449 |
+
self,
|
1450 |
+
*inputs,
|
1451 |
+
parameters: Optional[Dict[str, Union[str, int, bool]]] = None,
|
1452 |
+
headers: Optional[Dict[str, Union[str, int, bool]]] = None,
|
1453 |
+
**named_inputs,
|
1454 |
+
):
|
1455 |
+
"""Run asynchronous inference on batched data.
|
1456 |
+
|
1457 |
+
Typical usage:
|
1458 |
+
|
1459 |
+
```python
|
1460 |
+
async with AsyncioDecoupledModelClient("grpc://localhost", "MyModel") as client:
|
1461 |
+
async for result_dict in client.infer_batch(input1_sample, input2_sample):
|
1462 |
+
print(result_dict["output_name"])
|
1463 |
+
```
|
1464 |
+
|
1465 |
+
Inference inputs can be provided either as positional or keyword arguments:
|
1466 |
+
|
1467 |
+
```python
|
1468 |
+
results_iterator = client.infer_batch(input1, input2)
|
1469 |
+
results_iterator = client.infer_batch(a=input1, b=input2)
|
1470 |
+
```
|
1471 |
+
|
1472 |
+
Mixing of argument passing conventions is not supported and will raise PyTritonClientValueError.
|
1473 |
+
|
1474 |
+
Args:
|
1475 |
+
*inputs: inference inputs provided as positional arguments.
|
1476 |
+
parameters: custom inference parameters.
|
1477 |
+
headers: custom inference headers.
|
1478 |
+
**named_inputs: inference inputs provided as named arguments.
|
1479 |
+
|
1480 |
+
Returns:
|
1481 |
+
Asynchronous generator, which generates dictionaries with partial inference results, where dictionary keys are output names.
|
1482 |
+
|
1483 |
+
Raises:
|
1484 |
+
PyTritonClientValueError: if mixing of positional and named arguments passing detected.
|
1485 |
+
PyTritonClientTimeoutError:
|
1486 |
+
in case of first method call, `lazy_init` argument is False
|
1487 |
+
and wait time for server and model being ready exceeds `init_timeout_s`
|
1488 |
+
or inference time exceeds `timeout_s`.
|
1489 |
+
PyTritonClientModelDoesntSupportBatchingError: if model doesn't support batching.
|
1490 |
+
PyTritonClientModelUnavailableError: If model with given name (and version) is unavailable.
|
1491 |
+
PyTritonClientInferenceServerError: If error occurred on inference callable or Triton Inference Server side.
|
1492 |
+
"""
|
1493 |
+
_verify_inputs_args(inputs, named_inputs)
|
1494 |
+
_verify_parameters(parameters)
|
1495 |
+
_verify_parameters(headers)
|
1496 |
+
|
1497 |
+
_LOGGER.debug(f"Running inference for {self._model_name}")
|
1498 |
+
model_config = await self.model_config
|
1499 |
+
_LOGGER.debug(f"Model config for {self._model_name} obtained")
|
1500 |
+
|
1501 |
+
model_supports_batching = model_config.max_batch_size > 0
|
1502 |
+
if not model_supports_batching:
|
1503 |
+
_LOGGER.error(f"Model {model_config.model_name} doesn't support batching")
|
1504 |
+
raise PyTritonClientModelDoesntSupportBatchingError(
|
1505 |
+
f"Model {model_config.model_name} doesn't support batching - use infer_sample method instead"
|
1506 |
+
)
|
1507 |
+
|
1508 |
+
_LOGGER.debug(f"Running _infer for {self._model_name}")
|
1509 |
+
result = self._infer(inputs or named_inputs, parameters, headers)
|
1510 |
+
_LOGGER.debug(f"_infer for {self._model_name} finished")
|
1511 |
+
async for item in result:
|
1512 |
+
yield item
|
1513 |
+
|
1514 |
+
async def _execute_infer(self, model_config, inputs_wrapped, outputs_wrapped, parameters, headers) -> Any:
|
1515 |
+
# stream_infer silently consumes all errors raised inside async_request_iterator and raises CancelledError
|
1516 |
+
error_raised_inside_async_request_iterator = set()
|
1517 |
+
try:
|
1518 |
+
_LOGGER.debug(f"Sending InferRequest for {self._model_name}")
|
1519 |
+
kwargs = self._get_infer_extra_args()
|
1520 |
+
|
1521 |
+
async def async_request_iterator(errors):
|
1522 |
+
_LOGGER.debug(f"Begin creating InferRequestHeader for {self._model_name}")
|
1523 |
+
try:
|
1524 |
+
yield {
|
1525 |
+
"model_name": self._model_name,
|
1526 |
+
"inputs": inputs_wrapped,
|
1527 |
+
"outputs": outputs_wrapped,
|
1528 |
+
"request_id": self._next_request_id,
|
1529 |
+
"sequence_id": 0,
|
1530 |
+
"sequence_start": True,
|
1531 |
+
"sequence_end": True,
|
1532 |
+
}
|
1533 |
+
except Exception as e:
|
1534 |
+
_LOGGER.error(f"Error occurred while creating InferRequestHeader for {self._model_name}")
|
1535 |
+
errors.add(e)
|
1536 |
+
raise e
|
1537 |
+
_LOGGER.debug(f"End creating InferRequestHeader for {self._model_name}")
|
1538 |
+
|
1539 |
+
response_iterator = self._infer_client.stream_infer(
|
1540 |
+
inputs_iterator=async_request_iterator(error_raised_inside_async_request_iterator),
|
1541 |
+
headers=headers,
|
1542 |
+
**kwargs,
|
1543 |
+
)
|
1544 |
+
_LOGGER.debug(f"End preparing InferRequest for {self._model_name}")
|
1545 |
+
while True:
|
1546 |
+
try:
|
1547 |
+
try:
|
1548 |
+
response = await asyncio.wait_for(
|
1549 |
+
response_iterator.__anext__(),
|
1550 |
+
self._inference_timeout_s,
|
1551 |
+
)
|
1552 |
+
except asyncio.TimeoutError as e:
|
1553 |
+
message = f"Timeout while waiting for model {self._model_name} to return next response {self._inference_timeout_s}s"
|
1554 |
+
_LOGGER.error(message)
|
1555 |
+
raise PyTritonClientTimeoutError(message) from e
|
1556 |
+
result, error = response
|
1557 |
+
_LOGGER.debug(f"Received InferResponse for {self._model_name}")
|
1558 |
+
if error is not None:
|
1559 |
+
raise error
|
1560 |
+
else:
|
1561 |
+
partial_output = {
|
1562 |
+
output_spec.name: result.as_numpy(output_spec.name) for output_spec in model_config.outputs
|
1563 |
+
}
|
1564 |
+
yield partial_output
|
1565 |
+
except StopAsyncIteration:
|
1566 |
+
break
|
1567 |
+
_LOGGER.debug(f"End receiving InferResponse for {self._model_name}")
|
1568 |
+
|
1569 |
+
except asyncio.exceptions.TimeoutError as e:
|
1570 |
+
# HTTP aio client raises asyncio.exceptions.TimeoutError for timeout errors
|
1571 |
+
message = f"Timeout exceeded while running inference for {self._model_name}"
|
1572 |
+
_LOGGER.error(message)
|
1573 |
+
raise PyTritonClientTimeoutError(message) from e
|
1574 |
+
except tritonclient.utils.InferenceServerException as e:
|
1575 |
+
message = f"Error occurred on Triton Inference Server side:\n {e.message()}"
|
1576 |
+
_LOGGER.error(message)
|
1577 |
+
if "Deadline Exceeded" in e.message():
|
1578 |
+
# GRPC aio client raises InferenceServerException with message "Deadline Exceeded"
|
1579 |
+
# for timeout errors
|
1580 |
+
raise PyTritonClientTimeoutError(message) from e
|
1581 |
+
else:
|
1582 |
+
raise PyTritonClientInferenceServerError(message) from e
|
1583 |
+
except asyncio.exceptions.CancelledError as e:
|
1584 |
+
_LOGGER.error(f"CancelledError occurred while streaming inference for {self._model_name}")
|
1585 |
+
# stream_infer silently consumes all errors raised inside async_request_iterator and raises CancelledError
|
1586 |
+
if len(error_raised_inside_async_request_iterator) > 0:
|
1587 |
+
_LOGGER.error(f"Re-raising error raised inside async_request_iterator for {self._model_name} ")
|
1588 |
+
raise error_raised_inside_async_request_iterator.pop() from None
|
1589 |
+
else:
|
1590 |
+
raise e
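# Generic sketch of the workaround used above, independent of tritonclient: when a consumer
# swallows exceptions raised inside an async generator and surfaces only CancelledError,
# the original error is stashed in a shared container and re-raised by the caller. All names
# below are illustrative.
import asyncio

async def _swallowing_consumer(agen):
    # stands in for stream_infer: it hides the generator's own exception behind CancelledError
    try:
        async for _ in agen:
            pass
    except Exception:
        raise asyncio.CancelledError

async def _caller():
    captured = set()

    async def producer(errors):
        try:
            raise ValueError("boom")
            yield  # unreachable, but makes this function an async generator
        except Exception as e:
            errors.add(e)
            raise

    try:
        await _swallowing_consumer(producer(captured))
    except asyncio.CancelledError:
        if captured:
            raise captured.pop() from None  # surface the real error, like _execute_infer does
        raise

# asyncio.run(_caller())  # raises ValueError("boom") instead of a bare CancelledError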
|
1591 |
+
|
1592 |
+
async def _infer(self, inputs: _IOType, parameters, headers):
|
1593 |
+
if self._model_ready:
|
1594 |
+
_LOGGER.debug(f"Waiting for model {self._model_name} config")
|
1595 |
+
await self._wait_and_init_model_config(self._init_timeout_s)
|
1596 |
+
_LOGGER.debug(f"Model wait finished for {self._model_name}")
|
1597 |
+
|
1598 |
+
_LOGGER.debug(f"Obtaining config for {self._model_name}")
|
1599 |
+
model_config = await self.model_config
|
1600 |
+
_LOGGER.debug(f"Model config for {self._model_name} obtained")
|
1601 |
+
if not model_config.decoupled:
|
1602 |
+
raise PyTritonClientInferenceServerError("Model config is coupled. Use AsyncioModelClient instead.")
|
1603 |
+
|
1604 |
+
if isinstance(inputs, Tuple):
|
1605 |
+
inputs = {input_spec.name: input_data for input_spec, input_data in zip(model_config.inputs, inputs)}
|
1606 |
+
|
1607 |
+
inputs_wrapped = []
|
1608 |
+
for input_name, input_data in inputs.items():
|
1609 |
+
if isinstance(input_data, np.ndarray):
|
1610 |
+
self._validate_input(input_name, input_data)
|
1611 |
+
triton_dtype = tritonclient.utils.np_to_triton_dtype(input_data.dtype)
|
1612 |
+
infer_input = self._triton_client_lib.InferInput(input_name, input_data.shape, triton_dtype)
|
1613 |
+
infer_input.set_data_from_numpy(input_data)
|
1614 |
+
input_wrapped = infer_input
|
1615 |
+
inputs_wrapped.append(input_wrapped)
|
1616 |
+
else:
|
1617 |
+
raise PyTritonClientValueError(
|
1618 |
+
f"Input {input_name} is not a numpy array. Got {type(input_data)} instead."
|
1619 |
+
)
|
1620 |
+
|
1621 |
+
outputs_wrapped = [
|
1622 |
+
self._triton_client_lib.InferRequestedOutput(output_spec.name) for output_spec in model_config.outputs
|
1623 |
+
]
|
1624 |
+
result = self._execute_infer(model_config, inputs_wrapped, outputs_wrapped, parameters, headers)
|
1625 |
+
async for item in result:
|
1626 |
+
yield item
|
1627 |
+
|
1628 |
+
def _get_infer_extra_args(self):
|
1629 |
+
if self._triton_url.scheme == "http":
|
1630 |
+
raise PyTritonClientValueError("AsyncioDecoupledModelClient is only supported for grpc protocol")
|
1631 |
+
warnings.warn(
|
1632 |
+
f"tritonclient.aio.grpc doesn't support client_timeout parameter {self._inference_timeout_s} for infer_stream",
|
1633 |
+
NotSupportedTimeoutWarning,
|
1634 |
+
stacklevel=1,
|
1635 |
+
)
|
1636 |
+
return {}
|
1637 |
+
|
1638 |
+
|
1639 |
+
@contextlib.contextmanager
|
1640 |
+
def _hub_context():
|
1641 |
+
hub = gevent.get_hub()
|
1642 |
+
try:
|
1643 |
+
yield hub
|
1644 |
+
finally:
|
1645 |
+
hub.destroy()
|
1646 |
+
|
1647 |
+
|
1648 |
+
_INIT = "init"
|
1649 |
+
_WAIT_FOR_MODEL = "wait_for_model"
|
1650 |
+
_MODEL_CONFIG = "model_config"
|
1651 |
+
_INFER_BATCH = "infer_batch"
|
1652 |
+
_INFER_SAMPLE = "infer_sample"
|
1653 |
+
_CLOSE = "close"
|
1654 |
+
|
1655 |
+
|
1656 |
+
class FuturesModelClient:
|
1657 |
+
"""A client for interacting with a model deployed on the Triton Inference Server using concurrent.futures.
|
1658 |
+
|
1659 |
+
This client allows asynchronous inference requests using a thread pool executor. It can be used to perform inference
|
1660 |
+
on a model by providing input data and receiving the corresponding output data. The client can be used in a `with`
|
1661 |
+
statement to ensure proper resource management.
|
1662 |
+
|
1663 |
+
Example usage with context manager:
|
1664 |
+
|
1665 |
+
```python
|
1666 |
+
with FuturesModelClient("localhost", "MyModel") as client:
|
1667 |
+
result_future = client.infer_sample(input1=input1_data, input2=input2_data)
|
1668 |
+
# do something else
|
1669 |
+
print(result_future.result())
|
1670 |
+
```
|
1671 |
+
|
1672 |
+
Usage without context manager:
|
1673 |
+
|
1674 |
+
```python
|
1675 |
+
client = FuturesModelClient("localhost", "MyModel")
|
1676 |
+
result_future = client.infer_sample(input1=input1_data, input2=input2_data)
|
1677 |
+
# do something else
|
1678 |
+
print(result_future.result())
|
1679 |
+
client.close()
|
1680 |
+
```
|
1681 |
+
"""
|
1682 |
+
|
1683 |
+
def __init__(
|
1684 |
+
self,
|
1685 |
+
url: str,
|
1686 |
+
model_name: str,
|
1687 |
+
model_version: Optional[str] = None,
|
1688 |
+
*,
|
1689 |
+
max_workers: int = 128,
|
1690 |
+
max_queue_size: int = 128,
|
1691 |
+
non_blocking: bool = False,
|
1692 |
+
init_timeout_s: Optional[float] = None,
|
1693 |
+
inference_timeout_s: Optional[float] = None,
|
1694 |
+
):
|
1695 |
+
"""Initializes the FuturesModelClient for a given model.
|
1696 |
+
|
1697 |
+
Args:
|
1698 |
+
url: The Triton Inference Server url, e.g. `grpc://localhost:8001`.
|
1699 |
+
model_name: The name of the model to interact with.
|
1700 |
+
model_version: The version of the model to interact with. If None, the latest version will be used.
|
1701 |
+
max_workers: The maximum number of threads that can be used to execute the given calls. If None, there is no limit on the number of threads.
|
1702 |
+
max_queue_size: The maximum number of requests that can be queued. If None, there is no limit on the number of requests.
|
1703 |
+
non_blocking: If True, the client will raise a PyTritonClientQueueFullError if the queue is full. If False, the client will block until the queue is not full.
|
1704 |
+
init_timeout_s: Timeout in seconds for the server and model to become ready. If not passed, the default 60 seconds timeout will be used.
|
1705 |
+
inference_timeout_s: Timeout in seconds for a single model inference request. If not passed, the default 60 seconds timeout will be used.
|
1706 |
+
"""
|
1707 |
+
self._url = url
|
1708 |
+
self._model_name = model_name
|
1709 |
+
self._model_version = model_version
|
1710 |
+
self._threads = []
|
1711 |
+
self._max_workers = max_workers
|
1712 |
+
self._max_queue_size = max_queue_size
|
1713 |
+
self._non_blocking = non_blocking
|
1714 |
+
|
1715 |
+
if self._max_workers is not None and self._max_workers <= 0:
|
1716 |
+
raise ValueError("max_workers must be greater than 0")
|
1717 |
+
if self._max_queue_size is not None and self._max_queue_size <= 0:
|
1718 |
+
raise ValueError("max_queue_size must be greater than 0")
|
1719 |
+
|
1720 |
+
kwargs = {}
|
1721 |
+
if self._max_queue_size is not None:
|
1722 |
+
kwargs["maxsize"] = self._max_queue_size
|
1723 |
+
self._queue = Queue(**kwargs)
|
1724 |
+
self._queue.put((_INIT, None, None))
|
1725 |
+
self._init_timeout_s = _DEFAULT_FUTURES_INIT_TIMEOUT_S if init_timeout_s is None else init_timeout_s
|
1726 |
+
self._inference_timeout_s = inference_timeout_s
|
1727 |
+
self._closed = False
|
1728 |
+
self._lock = Lock()
|
1729 |
+
self._existing_client = None
|
1730 |
+
|
1731 |
+
def __enter__(self):
|
1732 |
+
"""Create context for using FuturesModelClient as a context manager."""
|
1733 |
+
return self
|
1734 |
+
|
1735 |
+
def __exit__(self, exc_type, exc_value, traceback):
|
1736 |
+
"""Close resources used by FuturesModelClient instance when exiting from the context."""
|
1737 |
+
self.close()
|
1738 |
+
|
1739 |
+
def close(self, wait=True):
|
1740 |
+
"""Close resources used by FuturesModelClient.
|
1741 |
+
|
1742 |
+
This method closes the resources used by the FuturesModelClient instance, including the Triton Inference Server connections.
|
1743 |
+
Once this method is called, the FuturesModelClient instance should not be used again.
|
1744 |
+
|
1745 |
+
Args:
|
1746 |
+
wait: If True, then shutdown will not return until all running futures have finished executing.
|
1747 |
+
"""
|
1748 |
+
if self._closed:
|
1749 |
+
return
|
1750 |
+
_LOGGER.debug("Closing FuturesModelClient.")
|
1751 |
+
|
1752 |
+
self._closed = True
|
1753 |
+
for _ in range(len(self._threads)):
|
1754 |
+
self._queue.put((_CLOSE, None, None))
|
1755 |
+
|
1756 |
+
if wait:
|
1757 |
+
_LOGGER.debug("Waiting for futures to finish.")
|
1758 |
+
for thread in self._threads:
|
1759 |
+
thread.join()
|
1760 |
+
|
1761 |
+
def wait_for_model(self, timeout_s: float) -> Future:
|
1762 |
+
"""Returns a Future object which result will be None when the model is ready.
|
1763 |
+
|
1764 |
+
Typical usage:
|
1765 |
+
|
1766 |
+
```python
|
1767 |
+
with FuturesModelClient("localhost", "BERT") as client
|
1768 |
+
future = client.wait_for_model(300.)
|
1769 |
+
# do something else
|
1770 |
+
future.result() # wait rest of timeout_s time
|
1771 |
+
# till return None if model is ready
|
1772 |
+
# or raise PyTritonClientTimeoutError
|
1773 |
+
```
|
1774 |
+
|
1775 |
+
Args:
|
1776 |
+
timeout_s: The maximum amount of time to wait for the model to be ready, in seconds.
|
1777 |
+
|
1778 |
+
Returns:
|
1779 |
+
A Future object whose result is None when the model is ready.
|
1780 |
+
"""
|
1781 |
+
return self._execute(
|
1782 |
+
name=_WAIT_FOR_MODEL,
|
1783 |
+
request=timeout_s,
|
1784 |
+
)
|
1785 |
+
|
1786 |
+
def model_config(self) -> Future:
|
1787 |
+
"""Obtain the configuration of the model deployed on the Triton Inference Server.
|
1788 |
+
|
1789 |
+
This method returns a Future object that will contain the TritonModelConfig object when it is ready.
|
1790 |
+
Client will wait init_timeout_s for the server to get into readiness state before obtaining the model configuration.
|
1791 |
+
|
1792 |
+
Returns:
|
1793 |
+
A Future object that will contain the TritonModelConfig object when it is ready.
|
1794 |
+
|
1795 |
+
Raises:
|
1796 |
+
PyTritonClientClosedError: If the FuturesModelClient is closed.
|
1797 |
+
"""
|
1798 |
+
return self._execute(name=_MODEL_CONFIG)
|
1799 |
+
|
1800 |
+
def infer_sample(
|
1801 |
+
self,
|
1802 |
+
*inputs,
|
1803 |
+
parameters: Optional[Dict[str, Union[str, int, bool]]] = None,
|
1804 |
+
headers: Optional[Dict[str, Union[str, int, bool]]] = None,
|
1805 |
+
**named_inputs,
|
1806 |
+
) -> Future:
|
1807 |
+
"""Run asynchronous inference on a single data sample and return a Future object.
|
1808 |
+
|
1809 |
+
This method allows the user to perform inference on a single data sample by providing input data and receiving the
|
1810 |
+
corresponding output data. The method returns a Future object that wraps a dictionary of inference results, where dictionary keys are output names.
|
1811 |
+
|
1812 |
+
Example usage:
|
1813 |
+
|
1814 |
+
```python
|
1815 |
+
with FuturesModelClient("localhost", "BERT") as client:
|
1816 |
+
result_future = client.infer_sample(input1=input1_data, input2=input2_data)
|
1817 |
+
# do something else
|
1818 |
+
print(result_future.result())
|
1819 |
+
```
|
1820 |
+
|
1821 |
+
Inference inputs can be provided either as positional or keyword arguments:
|
1822 |
+
|
1823 |
+
```python
|
1824 |
+
future = client.infer_sample(input1, input2)
|
1825 |
+
future = client.infer_sample(a=input1, b=input2)
|
1826 |
+
```
|
1827 |
+
|
1828 |
+
Args:
|
1829 |
+
*inputs: Inference inputs provided as positional arguments.
|
1830 |
+
parameters: Optional dictionary of inference parameters.
|
1831 |
+
headers: Optional dictionary of HTTP headers for the inference request.
|
1832 |
+
**named_inputs: Inference inputs provided as named arguments.
|
1833 |
+
|
1834 |
+
Returns:
|
1835 |
+
A Future object wrapping a dictionary of inference results, where dictionary keys are output names.
|
1836 |
+
|
1837 |
+
Raises:
|
1838 |
+
PyTritonClientClosedError: If the FuturesModelClient is closed.
|
1839 |
+
"""
|
1840 |
+
return self._execute(
|
1841 |
+
name=_INFER_SAMPLE,
|
1842 |
+
request=(inputs, parameters, headers, named_inputs),
|
1843 |
+
)
|
1844 |
+
|
1845 |
+
def infer_batch(
|
1846 |
+
self,
|
1847 |
+
*inputs,
|
1848 |
+
parameters: Optional[Dict[str, Union[str, int, bool]]] = None,
|
1849 |
+
headers: Optional[Dict[str, Union[str, int, bool]]] = None,
|
1850 |
+
**named_inputs,
|
1851 |
+
) -> Future:
|
1852 |
+
"""Run asynchronous inference on batched data and return a Future object.
|
1853 |
+
|
1854 |
+
This method allows the user to perform inference on batched data by providing input data and receiving the corresponding output data.
|
1855 |
+
The method returns a Future object that wraps a dictionary of inference results, where dictionary keys are output names.
|
1856 |
+
|
1857 |
+
Example usage:
|
1858 |
+
|
1859 |
+
```python
|
1860 |
+
with FuturesModelClient("localhost", "BERT") as client:
|
1861 |
+
future = client.infer_batch(input1_sample, input2_sample)
|
1862 |
+
# do something else
|
1863 |
+
print(future.result())
|
1864 |
+
```
|
1865 |
+
|
1866 |
+
Inference inputs can be provided either as positional or keyword arguments:
|
1867 |
+
|
1868 |
+
```python
|
1869 |
+
future = client.infer_batch(input1, input2)
|
1870 |
+
future = client.infer_batch(a=input1, b=input2)
|
1871 |
+
```
|
1872 |
+
|
1873 |
+
Mixing of argument passing conventions is not supported and will raise PyTritonClientValueError.
|
1874 |
+
|
1875 |
+
Args:
|
1876 |
+
*inputs: Inference inputs provided as positional arguments.
|
1877 |
+
parameters: Optional dictionary of inference parameters.
|
1878 |
+
headers: Optional dictionary of HTTP headers for the inference request.
|
1879 |
+
**named_inputs: Inference inputs provided as named arguments.
|
1880 |
+
|
1881 |
+
Returns:
|
1882 |
+
A Future object wrapping a dictionary of inference results, where dictionary keys are output names.
|
1883 |
+
|
1884 |
+
Raises:
|
1885 |
+
PyTritonClientClosedError: If the FuturesModelClient is closed.
|
1886 |
+
"""
|
1887 |
+
return self._execute(name=_INFER_BATCH, request=(inputs, parameters, headers, named_inputs))
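# Illustrative sketch: submitting several requests and collecting the futures as they
# complete. The server address, model name and input layout are placeholders, and the
# import path for FuturesModelClient is assumed to be the public pytriton.client package.
import numpy as np
from concurrent.futures import as_completed
from pytriton.client import FuturesModelClient

def _futures_example():
    with FuturesModelClient("grpc://localhost:8001", "MyModel") as client:
        futures = [
            client.infer_sample(np.random.rand(10).astype(np.float32)) for _ in range(4)
        ]
        for future in as_completed(futures):
            result_dict = future.result()  # re-raises here if the worker set an exception
            print(sorted(result_dict.keys()))

# _futures_example()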
|
1888 |
+
|
1889 |
+
def _execute(self, name, request=None):
|
1890 |
+
if self._closed:
|
1891 |
+
raise PyTritonClientClosedError("FutureModelClient is already closed")
|
1892 |
+
self._extend_thread_pool()
|
1893 |
+
future = Future()
|
1894 |
+
if self._non_blocking:
|
1895 |
+
try:
|
1896 |
+
self._queue.put_nowait((future, request, name))
|
1897 |
+
except Full as e:
|
1898 |
+
raise PyTritonClientQueueFullError("Queue is full") from e
|
1899 |
+
else:
|
1900 |
+
kwargs = {}
|
1901 |
+
if self._inference_timeout_s is not None:
|
1902 |
+
kwargs["timeout"] = self._inference_timeout_s
|
1903 |
+
try:
|
1904 |
+
self._queue.put((future, request, name), **kwargs)
|
1905 |
+
except Full as e:
|
1906 |
+
raise PyTritonClientQueueFullError("Queue is full") from e
|
1907 |
+
return future
|
1908 |
+
|
1909 |
+
def _extend_thread_pool(self):
|
1910 |
+
if self._closed:
|
1911 |
+
return
|
1912 |
+
|
1913 |
+
with self._lock:
|
1914 |
+
if not self._queue.empty() and (self._max_workers is None or len(self._threads) < self._max_workers):
|
1915 |
+
_LOGGER.debug("Create new thread")
|
1916 |
+
thread = Thread(target=self._worker)
|
1917 |
+
self._threads.append(thread)
|
1918 |
+
thread.start()
|
1919 |
+
else:
|
1920 |
+
_LOGGER.debug("No need to create new thread")
|
1921 |
+
|
1922 |
+
def _client_request_executor(self, client, request, name):
|
1923 |
+
_LOGGER.debug(f"Running {name} for {self._model_name}")
|
1924 |
+
if name == _INFER_SAMPLE:
|
1925 |
+
inputs, parameters, headers, named_inputs = request
|
1926 |
+
result = client.infer_sample(
|
1927 |
+
*inputs,
|
1928 |
+
parameters=parameters,
|
1929 |
+
headers=headers,
|
1930 |
+
**named_inputs,
|
1931 |
+
)
|
1932 |
+
elif name == _INFER_BATCH:
|
1933 |
+
inputs, parameters, headers, named_inputs = request
|
1934 |
+
result = client.infer_batch(
|
1935 |
+
*inputs,
|
1936 |
+
parameters=parameters,
|
1937 |
+
headers=headers,
|
1938 |
+
**named_inputs,
|
1939 |
+
)
|
1940 |
+
elif name == _MODEL_CONFIG:
|
1941 |
+
result = client.model_config
|
1942 |
+
elif name == _WAIT_FOR_MODEL:
|
1943 |
+
timeout_s = request
|
1944 |
+
result = client.wait_for_model(timeout_s)
|
1945 |
+
else:
|
1946 |
+
raise PyTritonClientValueError(f"Unknown request name {name}")
|
1947 |
+
self._set_existing_client(client)
|
1948 |
+
return result
|
1949 |
+
|
1950 |
+
def _create_client(self, lazy_init):
|
1951 |
+
_LOGGER.debug(f"Creating ModelClient lazy_init={lazy_init}")
|
1952 |
+
return ModelClient(
|
1953 |
+
self._url,
|
1954 |
+
self._model_name,
|
1955 |
+
self._model_version,
|
1956 |
+
lazy_init=lazy_init,
|
1957 |
+
init_timeout_s=self._init_timeout_s,
|
1958 |
+
inference_timeout_s=self._inference_timeout_s,
|
1959 |
+
)
|
1960 |
+
|
1961 |
+
def _set_existing_client(self, client):
|
1962 |
+
if client._model_config is not None:
|
1963 |
+
with self._lock:
|
1964 |
+
if self._existing_client is None:
|
1965 |
+
_LOGGER.debug("Setting existing client")
|
1966 |
+
self._existing_client = client
|
1967 |
+
|
1968 |
+
def _remove_existing_client(self, client):
|
1969 |
+
if client is not None:
|
1970 |
+
with self._lock:
|
1971 |
+
if self._existing_client is not None:
|
1972 |
+
if self._existing_client is client:
|
1973 |
+
_LOGGER.debug("Resetting existing client")
|
1974 |
+
self._existing_client = None
|
1975 |
+
|
1976 |
+
def _worker(self):
|
1977 |
+
_LOGGER.debug("Starting worker thread")
|
1978 |
+
client = None
|
1979 |
+
# Workaround for AttributeError: '_Threadlocal' object has no attribute 'hub'
|
1980 |
+
# gevent/_hub_local.py", line 77, in gevent._gevent_c_hub_local.get_hub_noargs
|
1981 |
+
with _hub_context():
|
1982 |
+
while True:
|
1983 |
+
future, request, name = self._queue.get()
|
1984 |
+
if future == _CLOSE:
|
1985 |
+
_LOGGER.debug("Closing thread")
|
1986 |
+
self._queue.task_done()
|
1987 |
+
break
|
1988 |
+
if future == _INIT:
|
1989 |
+
with self._lock:
|
1990 |
+
if self._existing_client is None:
|
1991 |
+
try:
|
1992 |
+
_LOGGER.debug("Initial client creation")
|
1993 |
+
client = self._create_client(False)
|
1994 |
+
_LOGGER.debug("Setting existing client")
|
1995 |
+
self._existing_client = client
|
1996 |
+
except Exception as e:
|
1997 |
+
_LOGGER.warning(f"Error {e} occurred during init for {self._model_name}")
|
1998 |
+
continue
|
1999 |
+
try:
|
2000 |
+
if client is None:
|
2001 |
+
with self._lock:
|
2002 |
+
if self._existing_client is not None:
|
2003 |
+
_LOGGER.debug("Creating new client from existing client")
|
2004 |
+
client = ModelClient.from_existing_client(self._existing_client)
|
2005 |
+
if client is None:
|
2006 |
+
_LOGGER.debug("Creating new client")
|
2007 |
+
client = self._create_client(name == _WAIT_FOR_MODEL)
|
2008 |
+
with client:
|
2009 |
+
self._set_existing_client(client)
|
2010 |
+
while True:
|
2011 |
+
try:
|
2012 |
+
result = self._client_request_executor(client, request, name)
|
2013 |
+
_LOGGER.debug(f"Finished {name} for {self._model_name}")
|
2014 |
+
future.set_result(result)
|
2015 |
+
self._queue.task_done()
|
2016 |
+
except Exception as e:
|
2017 |
+
_LOGGER.error(f"Error {e} occurred during {name} for {self._model_name}")
|
2018 |
+
future.set_exception(e)
|
2019 |
+
self._queue.task_done()
|
2020 |
+
break
|
2021 |
+
future, request, name = self._queue.get()
|
2022 |
+
if future == _CLOSE:
|
2023 |
+
_LOGGER.debug("Closing thread")
|
2024 |
+
self._queue.task_done()
|
2025 |
+
return
|
2026 |
+
except Exception as e:
|
2027 |
+
_LOGGER.error(f"Error {e} occurred during {name} for {self._model_name}")
|
2028 |
+
future.set_exception(e)
|
2029 |
+
self._queue.task_done()
|
2030 |
+
finally:
|
2031 |
+
self._remove_existing_client(client)
|
2032 |
+
client = None
|
2033 |
+
_LOGGER.debug("Finishing worker thread")
|
stf/stf-api-alternative/pytriton/build/lib/pytriton/client/exceptions.py
ADDED
@@ -0,0 +1,92 @@
1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
"""Exceptions thrown in pytriton.client module."""
|
15 |
+
|
16 |
+
|
17 |
+
class PyTritonClientError(Exception):
|
18 |
+
"""Generic pytriton client exception."""
|
19 |
+
|
20 |
+
def __init__(self, message: str):
|
21 |
+
"""Initialize exception with message.
|
22 |
+
|
23 |
+
Args:
|
24 |
+
message: Error message
|
25 |
+
"""
|
26 |
+
self._message = message
|
27 |
+
|
28 |
+
def __str__(self) -> str:
|
29 |
+
"""String representation of error.
|
30 |
+
|
31 |
+
Returns:
|
32 |
+
Message content
|
33 |
+
"""
|
34 |
+
return self._message
|
35 |
+
|
36 |
+
@property
|
37 |
+
def message(self):
|
38 |
+
"""Get the exception message.
|
39 |
+
|
40 |
+
Returns:
|
41 |
+
The message associated with this exception, or None if no message.
|
42 |
+
|
43 |
+
"""
|
44 |
+
return self._message
|
45 |
+
|
46 |
+
|
47 |
+
class PyTritonClientValueError(PyTritonClientError):
|
48 |
+
"""Generic error raised in case of incorrect values are provided into API."""
|
49 |
+
|
50 |
+
pass
|
51 |
+
|
52 |
+
|
53 |
+
class PyTritonClientInvalidUrlError(PyTritonClientValueError):
|
54 |
+
"""Error raised when provided Triton Inference Server url is invalid."""
|
55 |
+
|
56 |
+
pass
|
57 |
+
|
58 |
+
|
59 |
+
class PyTritonClientTimeoutError(PyTritonClientError):
|
60 |
+
"""Timeout occurred during communication with the Triton Inference Server."""
|
61 |
+
|
62 |
+
pass
|
63 |
+
|
64 |
+
|
65 |
+
class PyTritonClientModelUnavailableError(PyTritonClientError):
|
66 |
+
"""Model with given name and version is unavailable on the given Triton Inference Server."""
|
67 |
+
|
68 |
+
pass
|
69 |
+
|
70 |
+
|
71 |
+
class PyTritonClientClosedError(PyTritonClientError):
|
72 |
+
"""Error raised in case of trying to use closed client."""
|
73 |
+
|
74 |
+
pass
|
75 |
+
|
76 |
+
|
77 |
+
class PyTritonClientModelDoesntSupportBatchingError(PyTritonClientError):
|
78 |
+
"""Error raised in case of trying to infer batch on model not supporting batching."""
|
79 |
+
|
80 |
+
pass
|
81 |
+
|
82 |
+
|
83 |
+
class PyTritonClientInferenceServerError(PyTritonClientError):
|
84 |
+
"""Error raised in case of error on inference callable or Triton Inference Server side."""
|
85 |
+
|
86 |
+
pass
|
87 |
+
|
88 |
+
|
89 |
+
class PyTritonClientQueueFullError(PyTritonClientError):
|
90 |
+
"""Error raised in case of trying to push request to full queue."""
|
91 |
+
|
92 |
+
pass
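# Illustrative sketch of handling this exception hierarchy around a blocking client call.
# The ModelClient import path, server address and model name are assumptions based on the
# sibling client module.
import numpy as np
from pytriton.client import ModelClient
from pytriton.client.exceptions import (
    PyTritonClientInferenceServerError,
    PyTritonClientModelUnavailableError,
    PyTritonClientTimeoutError,
)

def _guarded_infer():
    try:
        with ModelClient("grpc://localhost:8001", "MyModel", init_timeout_s=60.0) as client:
            return client.infer_sample(np.zeros((10,), dtype=np.float32))
    except PyTritonClientTimeoutError:
        print("server or model not ready in time, or inference timed out")
    except PyTritonClientModelUnavailableError:
        print("model name or version not found on the server")
    except PyTritonClientInferenceServerError as e:
        print(f"inference failed on the server side: {e.message}")

# _guarded_infer()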
|
stf/stf-api-alternative/pytriton/build/lib/pytriton/client/utils.py
ADDED
@@ -0,0 +1,384 @@
1 |
+
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
"""Utility module supporting model clients."""
|
15 |
+
|
16 |
+
import dataclasses
|
17 |
+
import enum
|
18 |
+
import logging
|
19 |
+
import socket
|
20 |
+
import sys
|
21 |
+
import time
|
22 |
+
import urllib
|
23 |
+
import warnings
|
24 |
+
from typing import Optional, Union
|
25 |
+
|
26 |
+
import tritonclient.grpc
|
27 |
+
import tritonclient.http
|
28 |
+
import tritonclient.http.aio
|
29 |
+
from grpc import RpcError
|
30 |
+
from tritonclient.utils import InferenceServerException
|
31 |
+
|
32 |
+
from pytriton.client.exceptions import PyTritonClientInvalidUrlError, PyTritonClientTimeoutError
|
33 |
+
from pytriton.client.warnings import NotSupportedTimeoutWarning
|
34 |
+
from pytriton.constants import DEFAULT_GRPC_PORT, DEFAULT_HTTP_PORT
|
35 |
+
from pytriton.model_config.parser import ModelConfigParser
|
36 |
+
|
37 |
+
_LOGGER = logging.getLogger(__name__)
|
38 |
+
|
39 |
+
_TritonSyncClientType = Union[tritonclient.grpc.InferenceServerClient, tritonclient.http.InferenceServerClient]
|
40 |
+
|
41 |
+
_DEFAULT_NETWORK_TIMEOUT_S = 60.0 # 1min
|
42 |
+
_DEFAULT_WAIT_FOR_SERVER_READY_TIMEOUT_S = 60.0 # 1min
|
43 |
+
_DEFAULT_WAIT_FOR_MODEL_TIMEOUT_S = 300.0 # 5min
|
44 |
+
|
45 |
+
LATEST_MODEL_VERSION = "<latest>"
|
46 |
+
|
47 |
+
|
48 |
+
# Special value for model_version argument. If model_version is None, the latest version of the model is returned.
|
49 |
+
|
50 |
+
|
51 |
+
class ModelState(enum.Enum):
|
52 |
+
"""Describe model state in Triton.
|
53 |
+
|
54 |
+
Attributes:
|
55 |
+
LOADING: Loading of model
|
56 |
+
UNLOADING: Unloading of model
|
57 |
+
UNAVAILABLE: Model is missing or could not be loaded
|
58 |
+
READY: Model is ready for inference
|
59 |
+
"""
|
60 |
+
|
61 |
+
LOADING = "LOADING"
|
62 |
+
UNLOADING = "UNLOADING"
|
63 |
+
UNAVAILABLE = "UNAVAILABLE"
|
64 |
+
READY = "READY"
|
65 |
+
|
66 |
+
|
67 |
+
def parse_http_response(models):
|
68 |
+
"""Parse model repository index response from Triton Inference Server for HTTP."""
|
69 |
+
models_states = {}
|
70 |
+
_LOGGER.debug("Parsing model repository index entries:")
|
71 |
+
for model in models:
|
72 |
+
_LOGGER.debug(f" name={model.get('name')} version={model.get('version')} state={model.get('state')}")
|
73 |
+
if not model.get("version"):
|
74 |
+
continue
|
75 |
+
|
76 |
+
model_state = ModelState(model["state"]) if model.get("state") else ModelState.LOADING
|
77 |
+
models_states[(model["name"], model["version"])] = model_state
|
78 |
+
|
79 |
+
return models_states
|
80 |
+
|
81 |
+
|
82 |
+
def parse_grpc_response(models):
|
83 |
+
"""Parse model repository index response from Triton Inference Server for GRCP."""
|
84 |
+
models_states = {}
|
85 |
+
_LOGGER.debug("Parsing model repository index entries:")
|
86 |
+
for model in models:
|
87 |
+
_LOGGER.debug(f" name={model.name} version={model.version} state={model.state}")
|
88 |
+
if not model.version:
|
89 |
+
continue
|
90 |
+
|
91 |
+
model_state = ModelState(model.state) if model.state else ModelState.LOADING
|
92 |
+
models_states[(model.name, model.version)] = model_state
|
93 |
+
|
94 |
+
return models_states
|
95 |
+
|
96 |
+
|
97 |
+
def get_model_state(
|
98 |
+
client: _TritonSyncClientType,
|
99 |
+
model_name: str,
|
100 |
+
model_version: Optional[str] = None,
|
101 |
+
) -> ModelState:
|
102 |
+
"""Obtains state of the model deployed in Triton Inference Server.
|
103 |
+
|
104 |
+
Args:
|
105 |
+
client: Triton Inference Server client to use for communication
|
106 |
+
model_name: name of the model which state we're requesting.
|
107 |
+
model_version:
|
108 |
+
version of the model which state we're requesting.
|
109 |
+
If model_version is None state of latest model is returned.
|
110 |
+
The latest versions of the model are the numerically greatest version numbers.
|
111 |
+
|
112 |
+
Returns:
|
113 |
+
Model state. ModelState.UNAVAILABLE is returned if the model with the given name and version is not found.
|
114 |
+
|
115 |
+
"""
|
116 |
+
repository_index = client.get_model_repository_index()
|
117 |
+
if isinstance(repository_index, list):
|
118 |
+
models_states = parse_http_response(models=repository_index)
|
119 |
+
else:
|
120 |
+
models_states = parse_grpc_response(models=repository_index.models)
|
121 |
+
|
122 |
+
if model_version is None:
|
123 |
+
requested_model_states = {
|
124 |
+
version: state for (name, version), state in models_states.items() if name == model_name
|
125 |
+
}
|
126 |
+
if not requested_model_states:
|
127 |
+
return ModelState.UNAVAILABLE
|
128 |
+
else:
|
129 |
+
requested_model_states = sorted(requested_model_states.items(), key=lambda item: int(item[0]))
|
130 |
+
_latest_version, latest_version_state = requested_model_states[-1]
|
131 |
+
return latest_version_state
|
132 |
+
else:
|
133 |
+
state = models_states.get((model_name, model_version), ModelState.UNAVAILABLE)
|
134 |
+
return state
|
135 |
+
|
136 |
+
|
137 |
+
def get_model_config(
|
138 |
+
client: _TritonSyncClientType,
|
139 |
+
model_name: str,
|
140 |
+
model_version: Optional[str] = None,
|
141 |
+
timeout_s: Optional[float] = None,
|
142 |
+
):
|
143 |
+
"""Obtain configuration of model deployed on the Triton Inference Server.
|
144 |
+
|
145 |
+
Function waits for server readiness.
|
146 |
+
|
147 |
+
Typical use:
|
148 |
+
|
149 |
+
client = tritonclient.grpc.InferenceServerClient("localhost:8001")
|
150 |
+
model_config = get_model_config(client, "MyModel", "1", 60.0)
|
151 |
+
model_config = get_model_config(client, "MyModel")
|
152 |
+
|
153 |
+
Args:
|
154 |
+
client: Triton Inference Server client to use for communication
|
155 |
+
model_name: name of the model which configuration we're requesting.
|
156 |
+
model_version:
|
157 |
+
version of the model which configuration we're requesting.
|
158 |
+
If model_version is None configuration of the latest model is returned.
|
159 |
+
The latest versions of the model are the numerically greatest version numbers.
|
160 |
+
timeout_s: timeout for obtaining the model configuration. Default value is 300.0 s.
|
161 |
+
|
162 |
+
Returns:
|
163 |
+
Configuration of requested model.
|
164 |
+
|
165 |
+
Raises:
|
166 |
+
PyTritonClientTimeoutError: If obtaining the model configuration didn't finish before the given timeout.
|
167 |
+
PyTritonClientModelUnavailableError: If model with given name (and version) is unavailable.
|
168 |
+
"""
|
169 |
+
wait_for_model_ready(client, model_name=model_name, model_version=model_version, timeout_s=timeout_s)
|
170 |
+
|
171 |
+
model_version = model_version or ""
|
172 |
+
|
173 |
+
_LOGGER.debug(f"Obtaining model {model_name} config")
|
174 |
+
if isinstance(client, tritonclient.grpc.InferenceServerClient):
|
175 |
+
response = client.get_model_config(model_name, model_version, as_json=True)
|
176 |
+
model_config = response["config"]
|
177 |
+
else:
|
178 |
+
model_config = client.get_model_config(model_name, model_version)
|
179 |
+
model_config = ModelConfigParser.from_dict(model_config)
|
180 |
+
_LOGGER.debug(f"Model config: {model_config}")
|
181 |
+
return model_config
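# Illustrative sketch (standalone): obtaining a model configuration with a synchronous GRPC
# client; the server address and model name are placeholders.
import tritonclient.grpc

def _model_config_example():
    client = tritonclient.grpc.InferenceServerClient("localhost:8001")
    config = get_model_config(client, "MyModel", timeout_s=60.0)
    return config.max_batch_size

# _model_config_example()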
|
182 |
+
|
183 |
+
|
184 |
+
def _warn_on_too_big_network_timeout(client: _TritonSyncClientType, timeout_s: float):
|
185 |
+
if isinstance(client, tritonclient.http.InferenceServerClient):
|
186 |
+
connection_pool = client._client_stub._connection_pool
|
187 |
+
network_reldiff_s = (connection_pool.network_timeout - timeout_s) / timeout_s
|
188 |
+
connection_reldiff_s = (connection_pool.connection_timeout - timeout_s) / timeout_s
|
189 |
+
rtol = 0.001
|
190 |
+
if network_reldiff_s > rtol or connection_reldiff_s > rtol:
|
191 |
+
warnings.warn(
|
192 |
+
"Client network and/or connection timeout is smaller than requested timeout_s. This may cause unexpected behavior. "
|
193 |
+
f"network_timeout={connection_pool.network_timeout} "
|
194 |
+
f"connection_timeout={connection_pool.connection_timeout} "
|
195 |
+
f"timeout_s={timeout_s}",
|
196 |
+
NotSupportedTimeoutWarning,
|
197 |
+
stacklevel=1,
|
198 |
+
)
|
199 |
+
|
200 |
+
|
201 |
+
def wait_for_server_ready(
|
202 |
+
client: _TritonSyncClientType,
|
203 |
+
timeout_s: Optional[float] = None,
|
204 |
+
):
|
205 |
+
"""Waits for Triton Inference Server to be ready.
|
206 |
+
|
207 |
+
Typical use:
|
208 |
+
|
209 |
+
client = tritonclient.http.InferenceServerClient("localhost:8000")
|
210 |
+
wait_for_server_ready(client, timeout_s=600.0)
|
211 |
+
|
212 |
+
Args:
|
213 |
+
client: Triton Inference Server client to use for communication
|
214 |
+
timeout_s: timeout for the server to get into readiness state. Default value is 60.0 s.
|
215 |
+
|
216 |
+
Raises:
|
217 |
+
PyTritonClientTimeoutError: If waiting for the server readiness didn't finish before the given timeout.
|
218 |
+
"""
|
219 |
+
timeout_s = timeout_s if timeout_s is not None else _DEFAULT_WAIT_FOR_SERVER_READY_TIMEOUT_S
|
220 |
+
should_finish_before_s = time.time() + timeout_s
|
221 |
+
_warn_on_too_big_network_timeout(client, timeout_s)
|
222 |
+
|
223 |
+
def _is_server_ready():
|
224 |
+
try:
|
225 |
+
return client.is_server_ready() and client.is_server_live()
|
226 |
+
except InferenceServerException:
|
227 |
+
return False
|
228 |
+
except (RpcError, ConnectionError, socket.gaierror):  # GRPC and HTTP clients raise these errors
|
229 |
+
return False
|
230 |
+
except Exception as e:
|
231 |
+
_LOGGER.exception(f"Exception while checking server readiness: {e}")
|
232 |
+
raise e
|
233 |
+
|
234 |
+
timeout_s = max(0.0, should_finish_before_s - time.time())
|
235 |
+
_LOGGER.debug(f"Waiting for server to be ready (timeout={timeout_s})")
|
236 |
+
is_server_ready = _is_server_ready()
|
237 |
+
while not is_server_ready:
|
238 |
+
time.sleep(min(1.0, timeout_s))
|
239 |
+
is_server_ready = _is_server_ready()
|
240 |
+
if not is_server_ready and time.time() >= should_finish_before_s:
|
241 |
+
raise PyTritonClientTimeoutError("Waiting for server to be ready timed out.")
|
242 |
+
|
243 |
+
|
244 |
+
def wait_for_model_ready(
|
245 |
+
client: _TritonSyncClientType,
|
246 |
+
model_name: str,
|
247 |
+
model_version: Optional[str] = None,
|
248 |
+
timeout_s: Optional[float] = None,
|
249 |
+
):
|
250 |
+
"""Wait for Triton Inference Server to be ready.
|
251 |
+
|
252 |
+
Args:
|
253 |
+
client: Triton Inference Server client to use for communication.
|
254 |
+
model_name: name of the model to wait for readiness.
|
255 |
+
model_version:
|
256 |
+
version of the model to wait for readiness.
|
257 |
+
If model_version is None waiting for latest version of the model.
|
258 |
+
The latest versions of the model are the numerically greatest version numbers.
|
259 |
+
timeout_s: timeout to server and model get into readiness state. Default value is 300.0 s.
|
260 |
+
|
261 |
+
Raises:
|
262 |
+
PyTritonClientTimeoutError: If server readiness didn't finish before given timeout.
|
263 |
+
"""
|
264 |
+
model_version = model_version or ""
|
265 |
+
model_version_msg = model_version or LATEST_MODEL_VERSION
|
266 |
+
timeout_s = timeout_s if timeout_s is not None else _DEFAULT_WAIT_FOR_MODEL_TIMEOUT_S
|
267 |
+
should_finish_before_s = time.time() + timeout_s
|
268 |
+
|
269 |
+
wait_for_server_ready(client, timeout_s=timeout_s)
|
270 |
+
timeout_s = max(0.0, should_finish_before_s - time.time())
|
271 |
+
_LOGGER.debug(f"Waiting for model {model_name}/{model_version_msg} to be ready (timeout={timeout_s})")
|
272 |
+
is_model_ready = client.is_model_ready(model_name, model_version)
|
273 |
+
while not is_model_ready:
|
274 |
+
time.sleep(min(1.0, timeout_s))
|
275 |
+
is_model_ready = client.is_model_ready(model_name, model_version)
|
276 |
+
|
277 |
+
if not is_model_ready and time.time() >= should_finish_before_s:
|
278 |
+
raise PyTritonClientTimeoutError(
|
279 |
+
f"Waiting for model {model_name}/{model_version_msg} to be ready timed out."
|
280 |
+
)
|
281 |
+
|
282 |
+
|
283 |
+
def create_client_from_url(url: str, network_timeout_s: Optional[float] = None) -> _TritonSyncClientType: # type: ignore
|
284 |
+
"""Create Triton Inference Server client.
|
285 |
+
|
286 |
+
Args:
|
287 |
+
url: url of the server to connect to.
|
288 |
+
If url doesn't contain scheme (e.g. "localhost:8001") http scheme is added.
|
289 |
+
If url doesn't contain port (e.g. "localhost") default port for given scheme is added.
|
290 |
+
network_timeout_s: timeout for client commands. Default value is 60.0 s.
|
291 |
+
|
292 |
+
Returns:
|
293 |
+
Triton Inference Server client.
|
294 |
+
|
295 |
+
Raises:
|
296 |
+
PyTritonClientInvalidUrlError: If provided Triton Inference Server url is invalid.
|
297 |
+
"""
|
298 |
+
url = TritonUrl.from_url(url)
|
299 |
+
triton_client_lib = {"grpc": tritonclient.grpc, "http": tritonclient.http}[url.scheme]
|
300 |
+
|
301 |
+
if url.scheme == "grpc":
|
302 |
+
# by default grpc client has very large number of timeout, thus we want to make it equal to http client timeout
|
303 |
+
network_timeout_s = _DEFAULT_NETWORK_TIMEOUT_S if network_timeout_s is None else network_timeout_s
|
304 |
+
warnings.warn(
|
305 |
+
f"tritonclient.grpc doesn't support timeout for other commands than infer. Ignoring network_timeout: {network_timeout_s}.",
|
306 |
+
NotSupportedTimeoutWarning,
|
307 |
+
stacklevel=1,
|
308 |
+
)
|
309 |
+
|
310 |
+
triton_client_init_kwargs = {}
|
311 |
+
if network_timeout_s is not None:
|
312 |
+
triton_client_init_kwargs.update(
|
313 |
+
**{
|
314 |
+
"grpc": {},
|
315 |
+
"http": {"connection_timeout": network_timeout_s, "network_timeout": network_timeout_s},
|
316 |
+
}[url.scheme]
|
317 |
+
)
|
318 |
+
|
319 |
+
_LOGGER.debug(f"Creating InferenceServerClient for {url.with_scheme} with {triton_client_init_kwargs}")
|
320 |
+
return triton_client_lib.InferenceServerClient(url.without_scheme, **triton_client_init_kwargs)
|
321 |
+
|
322 |
+
|
323 |
+
@dataclasses.dataclass
|
324 |
+
class TritonUrl:
|
325 |
+
"""TritonUrl class for parsing Triton Inference Server url.
|
326 |
+
|
327 |
+
Attributes:
|
328 |
+
scheme: scheme of the url (http or grpc)
|
329 |
+
hostname: hostname of the url
|
330 |
+
port: port of the url
|
331 |
+
|
332 |
+
Examples:
|
333 |
+
triton_url = TritonUrl.from_url("localhost:8000")
|
334 |
+
triton_url.with_scheme
|
335 |
+
>>> "http://localhost:8000"
|
336 |
+
triton_url.without_scheme
|
337 |
+
>>> "localhost:8000"
|
338 |
+
triton_url.scheme, triton_url.hostname, triton_url.port
|
339 |
+
>>> ("http", "localhost", 8000)
|
340 |
+
"""
|
341 |
+
|
342 |
+
scheme: str
|
343 |
+
hostname: str
|
344 |
+
port: int
|
345 |
+
|
346 |
+
@classmethod
|
347 |
+
def from_url(cls, url):
|
348 |
+
"""Parse triton url and create TritonUrl instance.
|
349 |
+
|
350 |
+
Returns:
|
351 |
+
TritonUrl object with scheme, hostname and port.
|
352 |
+
"""
|
353 |
+
if not isinstance(url, str):
|
354 |
+
raise PyTritonClientInvalidUrlError(f"Invalid url {url}. Url must be a string.")
|
355 |
+
try:
|
356 |
+
parsed_url = urllib.parse.urlparse(url)
|
357 |
+
# change in py3.9+
|
358 |
+
# https://github.com/python/cpython/commit/5a88d50ff013a64fbdb25b877c87644a9034c969
|
359 |
+
if sys.version_info < (3, 9) and not parsed_url.scheme and "://" in parsed_url.path:
|
360 |
+
raise ValueError(f"Invalid url {url}. Only grpc and http are supported.")
|
361 |
+
if (not parsed_url.scheme and "://" not in parsed_url.path) or (
|
362 |
+
sys.version_info >= (3, 9) and parsed_url.scheme and not parsed_url.netloc
|
363 |
+
):
|
364 |
+
_LOGGER.debug(f"Adding http scheme to {url}")
|
365 |
+
parsed_url = urllib.parse.urlparse(f"http://{url}")
|
366 |
+
|
367 |
+
scheme = parsed_url.scheme.lower()
|
368 |
+
if scheme not in ["grpc", "http"]:
|
369 |
+
raise ValueError(f"Invalid scheme {scheme}. Only grpc and http are supported.")
|
370 |
+
|
371 |
+
port = parsed_url.port or {"grpc": DEFAULT_GRPC_PORT, "http": DEFAULT_HTTP_PORT}[scheme]
|
372 |
+
except ValueError as e:
|
373 |
+
raise PyTritonClientInvalidUrlError(f"Invalid url {url}") from e
|
374 |
+
return cls(scheme, parsed_url.hostname, port)
|
375 |
+
|
376 |
+
@property
|
377 |
+
def with_scheme(self):
|
378 |
+
"""Get Triton Inference Server url with scheme."""
|
379 |
+
return f"{self.scheme}://{self.hostname}:{self.port}"
|
380 |
+
|
381 |
+
@property
|
382 |
+
def without_scheme(self):
|
383 |
+
"""Get Triton Inference Server url without scheme."""
|
384 |
+
return f"{self.hostname}:{self.port}"
|
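
Not part of the diff above: a minimal usage sketch of the helpers in this module, assuming a Triton server is already reachable at localhost:8000 and a hypothetical model named "add_sub" is loaded.

```python
# Minimal sketch (assumptions: server at localhost:8000, hypothetical model "add_sub").
import tritonclient.http

client = tritonclient.http.InferenceServerClient("localhost:8000")
wait_for_server_ready(client, timeout_s=60.0)
wait_for_model_ready(client, model_name="add_sub", timeout_s=120.0)
config = get_model_config(client, model_name="add_sub")

# TritonUrl normalizes bare "host:port" strings to a scheme-qualified form.
url = TritonUrl.from_url("localhost:8000")
assert url.with_scheme == "http://localhost:8000"
assert (url.scheme, url.hostname, url.port) == ("http", "localhost", 8000)
```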
stf/stf-api-alternative/pytriton/build/lib/pytriton/client/warnings.py
ADDED
@@ -0,0 +1,26 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Warnings for pytriton module."""


class PyTritonWarning(UserWarning):
    """Base warning for pytriton module."""

    pass


class NotSupportedTimeoutWarning(PyTritonWarning):
    """A warning for client, which doesn't support timeout configuration."""

    pass
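
A side note, not in the diff: these classes are ordinary Python warnings, so user code can filter them with the standard `warnings` machinery, for example:

```python
import warnings

# Suppress pytriton's timeout-capability warnings in application code.
warnings.simplefilter("ignore", category=NotSupportedTimeoutWarning)
```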
stf/stf-api-alternative/pytriton/build/lib/pytriton/constants.py
ADDED
@@ -0,0 +1,31 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# noqa: D104
"""Constants for pytriton."""

import os
import pathlib

DEFAULT_HTTP_PORT = 8000
DEFAULT_GRPC_PORT = 8001
DEFAULT_METRICS_PORT = 8002
TRITON_LOCAL_IP = "127.0.0.1"
TRITON_CONTEXT_FIELD_NAME = "triton_context"
TRITON_PYTHON_BACKEND_INTERPRETER_DIRNAME = "python_backend_interpreter"
DEFAULT_TRITON_STARTUP_TIMEOUT_S = 120
CREATE_TRITON_CLIENT_TIMEOUT_S = 10

__DEFAULT_PYTRITON_HOME = os.path.join(os.getenv("XDG_CACHE_HOME", "$HOME/.cache"), "pytriton")
__PYTRITON_HOME = os.path.expanduser(os.path.expandvars(os.getenv("PYTRITON_HOME", __DEFAULT_PYTRITON_HOME)))
PYTRITON_HOME = pathlib.Path(__PYTRITON_HOME).resolve().absolute()
stf/stf-api-alternative/pytriton/build/lib/pytriton/decorators.py
ADDED
@@ -0,0 +1,678 @@
1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
"""Inference callable decorators."""
|
15 |
+
|
16 |
+
import collections
|
17 |
+
import dataclasses
|
18 |
+
import inspect
|
19 |
+
import itertools
|
20 |
+
import operator
|
21 |
+
import typing
|
22 |
+
from bisect import bisect_left
|
23 |
+
from collections.abc import MutableMapping
|
24 |
+
from typing import Callable, Dict, List, NamedTuple, Optional, Tuple, Union
|
25 |
+
|
26 |
+
import numpy as np
|
27 |
+
import wrapt
|
28 |
+
|
29 |
+
from pytriton.constants import TRITON_CONTEXT_FIELD_NAME
|
30 |
+
from pytriton.exceptions import PyTritonBadParameterError, PyTritonRuntimeError, PyTritonValidationError
|
31 |
+
from pytriton.model_config.triton_model_config import TritonModelConfig
|
32 |
+
from pytriton.proxy.data import _serialize_byte_tensor
|
33 |
+
from pytriton.proxy.telemetry import start_span_from_span
|
34 |
+
|
35 |
+
|
36 |
+
class _WrappedWithWrapper(NamedTuple):
|
37 |
+
wrapped: Optional[Callable]
|
38 |
+
wrapper: Optional[Callable]
|
39 |
+
|
40 |
+
|
41 |
+
InputNames = typing.List[str]
|
42 |
+
InferenceRequest = typing.Dict[str, np.ndarray]
|
43 |
+
InferenceRequests = typing.Union[typing.List[InferenceRequest], typing.Tuple[InferenceRequest, ...]]
|
44 |
+
InferenceResult = typing.Dict[str, np.ndarray]
|
45 |
+
InferenceResults = typing.Union[typing.List[InferenceResult], typing.Tuple[InferenceResult, ...]]
|
46 |
+
|
47 |
+
|
48 |
+
def get_inference_request_batch_size(inference_request: InferenceRequest) -> int:
|
49 |
+
"""Get batch size from triton request.
|
50 |
+
|
51 |
+
Args:
|
52 |
+
inference_request (InferenceRequest): Triton request.
|
53 |
+
|
54 |
+
Returns:
|
55 |
+
int: Batch size.
|
56 |
+
"""
|
57 |
+
first_input_value = next(iter(inference_request.values()))
|
58 |
+
batch_size, *_dims = first_input_value.shape
|
59 |
+
return batch_size
|
60 |
+
|
61 |
+
|
62 |
+
def _get_wrapt_stack(wrapped) -> List[_WrappedWithWrapper]:
|
63 |
+
"""Returns stack of wrapped functions with wrappers applied to inference callable."""
|
64 |
+
stack = []
|
65 |
+
infer_callable = wrapped
|
66 |
+
while infer_callable is not None:
|
67 |
+
stack.append(_WrappedWithWrapper(infer_callable, getattr(infer_callable, "_self_wrapper", None)))
|
68 |
+
infer_callable = getattr(infer_callable, "__wrapped__", None)
|
69 |
+
|
70 |
+
return stack
|
71 |
+
|
72 |
+
|
73 |
+
class ModelConfigDict(MutableMapping):
|
74 |
+
"""Dictionary for storing model configs for inference callable."""
|
75 |
+
|
76 |
+
def __init__(self):
|
77 |
+
"""Create ModelConfigDict object."""
|
78 |
+
self._data: Dict[str, TritonModelConfig] = {}
|
79 |
+
self._keys: List[Callable] = []
|
80 |
+
|
81 |
+
def __getitem__(self, infer_callable: Callable) -> TritonModelConfig:
|
82 |
+
"""Get model config for inference callable."""
|
83 |
+
key = self._get_model_config_key(infer_callable)
|
84 |
+
return self._data[key]
|
85 |
+
|
86 |
+
def __setitem__(self, infer_callable: Callable, item: TritonModelConfig):
|
87 |
+
"""Set model config for inference callable."""
|
88 |
+
self._keys.append(infer_callable)
|
89 |
+
key = self._get_model_config_key(infer_callable)
|
90 |
+
self._data[key] = item
|
91 |
+
|
92 |
+
def __delitem__(self, infer_callable: Callable):
|
93 |
+
"""Delete model config for inference callable."""
|
94 |
+
key = self._get_model_config_key(infer_callable)
|
95 |
+
del self._data[key]
|
96 |
+
|
97 |
+
def __len__(self):
|
98 |
+
"""Get number of inference callable keys."""
|
99 |
+
return len(self._data)
|
100 |
+
|
101 |
+
def __iter__(self):
|
102 |
+
"""Iterate over inference callable keys."""
|
103 |
+
return iter(self._keys)
|
104 |
+
|
105 |
+
@staticmethod
|
106 |
+
def _get_model_config_key(infer_callable: Callable) -> str:
|
107 |
+
"""Prepares TritonModelConfig dictionary key for function/callable."""
|
108 |
+
dict_key = infer_callable
|
109 |
+
if inspect.ismethod(dict_key) and dict_key.__name__ == "__call__":
|
110 |
+
dict_key = dict_key.__self__
|
111 |
+
return str(dict_key)
|
112 |
+
|
113 |
+
|
114 |
+
@dataclasses.dataclass
|
115 |
+
class TritonContext:
|
116 |
+
"""Triton context definition class."""
|
117 |
+
|
118 |
+
model_configs: ModelConfigDict = dataclasses.field(default_factory=ModelConfigDict)
|
119 |
+
|
120 |
+
|
121 |
+
def get_triton_context(wrapped, instance) -> TritonContext:
|
122 |
+
"""Retrieves triton context from callable.
|
123 |
+
|
124 |
+
It is used in @triton_context to get triton context registered by triton binding in inference callable.
|
125 |
+
If you use @triton_context decorator you do not need this function.
|
126 |
+
"""
|
127 |
+
caller = instance or wrapped
|
128 |
+
if not hasattr(caller, "__triton_context__"):
|
129 |
+
raise PyTritonValidationError("Wrapped function or object must bound with triton to get __triton_context__")
|
130 |
+
return caller.__triton_context__
|
131 |
+
|
132 |
+
|
133 |
+
def get_model_config(wrapped, instance) -> TritonModelConfig:
|
134 |
+
"""Retrieves instance of TritonModelConfig from callable.
|
135 |
+
|
136 |
+
It is internally used in convert_output function to get output list from model.
|
137 |
+
You can use this in custom decorators if you need access to model_config information.
|
138 |
+
If you use @triton_context decorator you do not need this function (you can get model_config directly
|
139 |
+
from triton_context passing function/callable to dictionary getter).
|
140 |
+
"""
|
141 |
+
return get_triton_context(wrapped, instance).model_configs[wrapped]
|
142 |
+
|
143 |
+
|
144 |
+
def convert_output(
|
145 |
+
outputs: Union[Dict, List, Tuple], wrapped=None, instance=None, model_config: Optional[TritonModelConfig] = None
|
146 |
+
):
|
147 |
+
"""Converts output from tuple ot list to dictionary.
|
148 |
+
|
149 |
+
It is a utility function useful for mapping an output list into a dictionary of outputs.
|
150 |
+
Currently, it is used in the @sample and @batch decorators (we assume that the user can return a list or tuple of outputs
|
151 |
+
instead of a dictionary if this list matches the output list in the model config in size and order).
|
152 |
+
"""
|
153 |
+
if isinstance(outputs, dict):
|
154 |
+
return outputs
|
155 |
+
elif isinstance(outputs, (list, tuple)):
|
156 |
+
if model_config is None:
|
157 |
+
model_config = get_model_config(wrapped, instance)
|
158 |
+
if len(outputs) != len(model_config.outputs):
|
159 |
+
raise PyTritonValidationError("Outputs length different than config outputs length")
|
160 |
+
outputs = {config_output.name: output for config_output, output in zip(model_config.outputs, outputs)}
|
161 |
+
return outputs
|
162 |
+
else:
|
163 |
+
raise PyTritonValidationError(f"Unsupported output type {type(outputs)}.")
|
164 |
+
|
165 |
+
|
166 |
+
@wrapt.decorator
|
167 |
+
def sample(wrapped, instance, args, kwargs):
|
168 |
+
"""Decorator is used for non-batched inputs to convert from one element list of requests to request kwargs.
|
169 |
+
|
170 |
+
The decorator takes the first request and converts it into named inputs.
|
171 |
+
Useful with non-batching models: instead of a one-element list of requests, we get named inputs (`kwargs`).
|
172 |
+
"""
|
173 |
+
kwargs.update(args[0][0])
|
174 |
+
outputs = wrapped(*args[1:], **kwargs)
|
175 |
+
outputs = convert_output(outputs, wrapped, instance)
|
176 |
+
return [outputs]
|
177 |
+
|
178 |
+
|
179 |
+
@wrapt.decorator
|
180 |
+
def batch(wrapped, instance, args, kwargs):
|
181 |
+
"""Decorator for converting list of request dicts to dict of input batches.
|
182 |
+
|
183 |
+
Converts list of request dicts to dict of input batches.
|
184 |
+
It passes **kwargs to inference callable where each named input contains numpy array with batch of requests
|
185 |
+
received by Triton server.
|
186 |
+
We assume that each request has the same set of keys (you can use group_by_keys decorator before
|
187 |
+
using @batch decorator if your requests may have different set of keys).
|
188 |
+
|
189 |
+
Raises:
|
190 |
+
PyTritonValidationError: If the requests have different set of keys.
|
191 |
+
ValueError: If the output tensors have different than expected batch sizes. Expected batch size is
|
192 |
+
calculated as a sum of batch sizes of all requests.
|
193 |
+
"""
|
194 |
+
telemetry_name = "pytriton-batch-decorator-span"
|
195 |
+
|
196 |
+
req_list = args[0]
|
197 |
+
input_names = req_list[0].keys()
|
198 |
+
|
199 |
+
for req_dict2 in req_list[1:]:
|
200 |
+
if input_names != req_dict2.keys():
|
201 |
+
raise PyTritonValidationError("Cannot batch requests with different set of inputs keys")
|
202 |
+
|
203 |
+
inputs = {}
|
204 |
+
for model_input in input_names:
|
205 |
+
concatenated_input_data = np.concatenate([req[model_input] for req in req_list])
|
206 |
+
inputs[model_input] = concatenated_input_data
|
207 |
+
|
208 |
+
args = args[1:]
|
209 |
+
new_kwargs = dict(kwargs)
|
210 |
+
new_kwargs.update(inputs)
|
211 |
+
spans = [start_span_from_span(request.span, telemetry_name) for request in req_list if request.span is not None]
|
212 |
+
try:
|
213 |
+
outputs = wrapped(*args, **new_kwargs)
|
214 |
+
finally:
|
215 |
+
for span in spans:
|
216 |
+
span.end()
|
217 |
+
|
218 |
+
def _split_result(_result):
|
219 |
+
outputs = convert_output(_result, wrapped, instance)
|
220 |
+
output_names = outputs.keys()
|
221 |
+
|
222 |
+
requests_total_batch_size = sum(get_inference_request_batch_size(req) for req in req_list)
|
223 |
+
not_matching_tensors_shapes = {
|
224 |
+
output_name: output_tensor.shape
|
225 |
+
for output_name, output_tensor in outputs.items()
|
226 |
+
if output_tensor.shape[0] != requests_total_batch_size
|
227 |
+
}
|
228 |
+
if not_matching_tensors_shapes:
|
229 |
+
raise ValueError(
|
230 |
+
f"Received output tensors with different batch sizes: {', '.join(': '.join(map(str, item)) for item in not_matching_tensors_shapes.items())}. "
|
231 |
+
f"Expected batch size: {requests_total_batch_size}. "
|
232 |
+
)
|
233 |
+
|
234 |
+
out_list = []
|
235 |
+
start_idx = 0
|
236 |
+
for request in req_list:
|
237 |
+
# get batch_size of first input for each request - assume that all inputs have same batch_size
|
238 |
+
request_batch_size = get_inference_request_batch_size(request)
|
239 |
+
req_output_dict = {}
|
240 |
+
for _output_ind, output_name in enumerate(output_names):
|
241 |
+
req_output = outputs[output_name][start_idx : start_idx + request_batch_size, ...]
|
242 |
+
req_output_dict[output_name] = req_output
|
243 |
+
out_list.append(req_output_dict)
|
244 |
+
start_idx += request_batch_size
|
245 |
+
return out_list
|
246 |
+
|
247 |
+
if inspect.isgenerator(outputs):
|
248 |
+
return (_split_result(_result) for _result in outputs)
|
249 |
+
else:
|
250 |
+
return _split_result(outputs)
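
Not part of the diff: the @batch wrapper above can be read as "concatenate per-request inputs along the batch axis, call the wrapped function once, split the outputs back per request". A minimal, hedged sketch of a user-side callable under this decorator (names are illustrative and assume the decorators module context):

```python
# Hedged sketch (assumes the @batch decorator from this module and a pytriton-bound callable).
@batch
def identity_infer_fn(**inputs):
    # "x" arrives as one numpy array holding the concatenated batch of all queued requests.
    return {"y": inputs["x"]}

# Conceptually, two requests with batch sizes 2 and 1, e.g.
#   [{"x": np.zeros((2, 4))}, {"x": np.ones((1, 4))}]
# are merged into a single (3, 4) array for the call, and the (3, 4) output "y"
# is sliced back into per-request results of shapes (2, 4) and (1, 4).
```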
|
251 |
+
|
252 |
+
|
253 |
+
def group_by_values(*keys, pad_fn: typing.Optional[typing.Callable[[InferenceRequests], InferenceRequests]] = None):
|
254 |
+
"""Decorator for grouping requests by values of selected keys.
|
255 |
+
|
256 |
+
This function splits a batch into multiple sub-batches based on the specified keys values and
|
257 |
+
calls the decorated function with each sub-batch. This is particularly useful when working with models
|
258 |
+
that require dynamic parameters sent by the user.
|
259 |
+
|
260 |
+
For example, given an input of the form:
|
261 |
+
|
262 |
+
```python
|
263 |
+
{"sentences": [b"Sentence1", b"Sentence2", b"Sentence3"], "param1": [1, 1, 2], "param2": [1, 1, 1]}
|
264 |
+
```
|
265 |
+
|
266 |
+
Using @group_by_values("param1", "param2") will split the batch into two sub-batches:
|
267 |
+
|
268 |
+
```python
|
269 |
+
[
|
270 |
+
{"sentences": [b"Sentence1", b"Sentence2"], "param1": [1, 1], "param2": [1, 1]},
|
271 |
+
{"sentences": [b"Sentence3"], "param1": [2], "param2": [1]}
|
272 |
+
]
|
273 |
+
```
|
274 |
+
|
275 |
+
This decorator should be used after the @batch decorator.
|
276 |
+
|
277 |
+
Example usage:
|
278 |
+
```python
|
279 |
+
@batch
|
280 |
+
@group_by_values("param1", "param2")
|
281 |
+
def infer_fun(**inputs):
|
282 |
+
...
|
283 |
+
return outputs
|
284 |
+
```
|
285 |
+
|
286 |
+
Args:
|
287 |
+
*keys: List of keys to group by.
|
288 |
+
pad_fn: Optional function to pad the batch to the same size before merging again to a single batch.
|
289 |
+
|
290 |
+
Returns:
|
291 |
+
The decorator function.
|
292 |
+
"""
|
293 |
+
|
294 |
+
def value_to_key(value):
|
295 |
+
if isinstance(value, np.ndarray):
|
296 |
+
if value.dtype == np.object_ or value.dtype.type == np.bytes_:
|
297 |
+
return _serialize_byte_tensor(value)
|
298 |
+
else:
|
299 |
+
return value.tobytes()
|
300 |
+
return value
|
301 |
+
|
302 |
+
def _get_sort_key_for_sample(_request, _sample_idx: int):
|
303 |
+
return tuple(value_to_key(_request[_key][_sample_idx]) for _key in keys)
|
304 |
+
|
305 |
+
def _group_request(_request: InferenceRequest, _batch_size: int):
|
306 |
+
idx_inputs = [(sample_idx, _get_sort_key_for_sample(_request, sample_idx)) for sample_idx in range(_batch_size)]
|
307 |
+
idx_inputs.sort(key=operator.itemgetter(1))
|
308 |
+
for _, group in itertools.groupby(idx_inputs, key=operator.itemgetter(1)):
|
309 |
+
_samples_idxes, _ = zip(*group)
|
310 |
+
grouped_request = {input_name: value[_samples_idxes, ...] for input_name, value in _request.items()}
|
311 |
+
yield _samples_idxes, grouped_request
|
312 |
+
|
313 |
+
@wrapt.decorator
|
314 |
+
def _wrapper(wrapped, instance, args, kwargs):
|
315 |
+
wrappers_stack = [
|
316 |
+
callable_with_wrapper.wrapper
|
317 |
+
for callable_with_wrapper in _get_wrapt_stack(wrapped)
|
318 |
+
if callable_with_wrapper.wrapper is not None
|
319 |
+
]
|
320 |
+
if batch in wrappers_stack:
|
321 |
+
raise PyTritonRuntimeError("The @group_by_values decorator must be used after the @batch decorator.")
|
322 |
+
|
323 |
+
request = {k: v for k, v in kwargs.items() if k not in _SPECIAL_KEYS}
|
324 |
+
other_kwargs = {k: v for k, v in kwargs.items() if k in _SPECIAL_KEYS}
|
325 |
+
|
326 |
+
batch_size = get_inference_request_batch_size(request)
|
327 |
+
sample_indices_with_interim_result = []
|
328 |
+
for sample_indices, _grouped_sub_request in _group_request(request, batch_size):
|
329 |
+
interim_result = wrapped(*args, **_grouped_sub_request, **other_kwargs)
|
330 |
+
sample_indices_with_interim_result.append((sample_indices, interim_result))
|
331 |
+
|
332 |
+
if pad_fn is not None:
|
333 |
+
indices, results = tuple(map(tuple, zip(*sample_indices_with_interim_result)))
|
334 |
+
results = pad_fn(results)
|
335 |
+
sample_indices_with_interim_result = tuple(zip(indices, results))
|
336 |
+
|
337 |
+
_, first_result_data = sample_indices_with_interim_result[0]
|
338 |
+
result = {
|
339 |
+
output_name: np.zeros((batch_size,) + data.shape[1:], dtype=data.dtype)
|
340 |
+
for output_name, data in first_result_data.items()
|
341 |
+
}
|
342 |
+
for indices, results in sample_indices_with_interim_result:
|
343 |
+
for output_name, data in results.items():
|
344 |
+
result[output_name][indices, ...] = data
|
345 |
+
|
346 |
+
return result
|
347 |
+
|
348 |
+
return _wrapper
|
349 |
+
|
350 |
+
|
351 |
+
class ConstantPadder:
|
352 |
+
"""Padder that pads the given batches with a constant value."""
|
353 |
+
|
354 |
+
def __init__(self, pad_value=0):
|
355 |
+
"""Initialize the padder.
|
356 |
+
|
357 |
+
Args:
|
358 |
+
pad_value (int, optional): Padding value. Defaults to 0.
|
359 |
+
"""
|
360 |
+
self.pad_value = pad_value
|
361 |
+
|
362 |
+
def __call__(self, batches_list: InferenceResults) -> InferenceResults:
|
363 |
+
"""Pad the given batches with the specified value to pad size enabling further batching to single arrays.
|
364 |
+
|
365 |
+
Args:
|
366 |
+
batches_list (List[Dict[str, np.ndarray]]): List of batches to pad.
|
367 |
+
|
368 |
+
Returns:
|
369 |
+
List[Dict[str, np.ndarray]]: List of padded batches.
|
370 |
+
|
371 |
+
Raises:
|
372 |
+
PyTritonRuntimeError: If the input arrays for a given input name have different dtypes.
|
373 |
+
"""
|
374 |
+
|
375 |
+
def _get_padded_shape(_batches: List[np.ndarray]) -> Tuple[int, ...]:
|
376 |
+
"""Get the shape of the padded array without batch axis."""
|
377 |
+
return tuple(np.max([batch.shape[1:] for batch in _batches if batch is not None], axis=0))
|
378 |
+
|
379 |
+
def _get_padded_dtype(_batches: List[np.ndarray]) -> np.dtype:
|
380 |
+
dtypes = [batch.dtype for batch in _batches if batch is not None]
|
381 |
+
result_dtype = dtypes[0]
|
382 |
+
|
383 |
+
if not all(dtype.kind == result_dtype.kind for dtype in dtypes):
|
384 |
+
raise PyTritonRuntimeError("All input arrays for given input name must have the same dtype.")
|
385 |
+
|
386 |
+
# for bytes (encoded string) or unicode string need to obtain the max length
|
387 |
+
if result_dtype.kind in "SU":
|
388 |
+
order_and_kind = result_dtype.str[:2]
|
389 |
+
max_len = max([int(dtype.str[2:]) for dtype in dtypes])
|
390 |
+
result_dtype = f"{order_and_kind}{max_len}"
|
391 |
+
else:
|
392 |
+
if not all(dtype == result_dtype for dtype in dtypes):
|
393 |
+
raise PyTritonRuntimeError("All input arrays for given input name must have the same dtype.")
|
394 |
+
|
395 |
+
return np.dtype(result_dtype)
|
396 |
+
|
397 |
+
input_names = list(
|
398 |
+
collections.OrderedDict.fromkeys(input_name for batch in batches_list for input_name in batch.keys())
|
399 |
+
)
|
400 |
+
batches_by_name = {input_name: [batch.get(input_name) for batch in batches_list] for input_name in input_names}
|
401 |
+
for input_batches in batches_by_name.values():
|
402 |
+
result_shape, result_dtype = _get_padded_shape(input_batches), _get_padded_dtype(input_batches)
|
403 |
+
for batch_idx, batch in enumerate(input_batches):
|
404 |
+
if batch is not None:
|
405 |
+
input_batches[batch_idx] = np.pad(
|
406 |
+
batch,
|
407 |
+
[(0, 0)] + [(0, b - a) for a, b in zip(batch.shape[1:], result_shape)],
|
408 |
+
mode="constant",
|
409 |
+
constant_values=self.pad_value if result_dtype.kind not in ["S", "U", "O"] else b"",
|
410 |
+
).astype(result_dtype)
|
411 |
+
|
412 |
+
return [
|
413 |
+
{name: batches[batch_idx] for name, batches in batches_by_name.items() if batches[batch_idx] is not None}
|
414 |
+
for batch_idx in range(len(batches_list))
|
415 |
+
]
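
A hedged illustration (not from the diff) of what ConstantPadder does when used as `pad_fn` in @group_by_values: ragged per-group outputs are zero-padded to a common trailing shape so they can be merged back into one batch. The shapes below are made up for the example:

```python
import numpy as np

padder = ConstantPadder(pad_value=0)
groups = [
    {"logits": np.ones((2, 3), dtype=np.float32)},  # group with sequence length 3
    {"logits": np.ones((1, 5), dtype=np.float32)},  # group with sequence length 5
]
padded = padder(groups)
assert padded[0]["logits"].shape == (2, 5)  # right-padded with zeros to the longest shape
assert padded[1]["logits"].shape == (1, 5)
```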
|
416 |
+
|
417 |
+
|
418 |
+
@wrapt.decorator
|
419 |
+
def group_by_keys(wrapped, instance, args, kwargs):
|
420 |
+
"""Group by keys.
|
421 |
+
|
422 |
+
Decorator prepares groups of requests with the same set of keys and calls wrapped function
|
423 |
+
for each group separately (it is convenient to use this decorator before batching, because the batching decorator
|
424 |
+
requires consistent set of inputs as it stacks them into batches).
|
425 |
+
"""
|
426 |
+
inputs = args[0]
|
427 |
+
idx_inputs = [(idx, tuple(sorted(input.keys())), input) for idx, input in enumerate(inputs)]
|
428 |
+
idx_inputs.sort(key=operator.itemgetter(1))
|
429 |
+
idx_groups_res = []
|
430 |
+
for _, group in itertools.groupby(idx_inputs, key=operator.itemgetter(1)):
|
431 |
+
idx, _key, sample_list = zip(*group)
|
432 |
+
args = (list(sample_list),) + args[1:]
|
433 |
+
out = wrapped(*args, **kwargs)
|
434 |
+
idx_groups_res.extend(zip(idx, out))
|
435 |
+
|
436 |
+
idx_groups_res.sort(key=operator.itemgetter(0))
|
437 |
+
res_flat = [r[1] for r in idx_groups_res]
|
438 |
+
return res_flat
|
439 |
+
|
440 |
+
|
441 |
+
def fill_optionals(**defaults):
|
442 |
+
"""This decorator ensures that any missing inputs in requests are filled with default values specified by the user.
|
443 |
+
|
444 |
+
Default values should be NumPy arrays without batch axis.
|
445 |
+
|
446 |
+
If you plan to group requests, e.g. with the
|
447 |
+
[@group_by_keys][pytriton.decorators.group_by_keys] or
|
448 |
+
[@group_by_values][pytriton.decorators.group_by_values] decorators,
|
449 |
+
provide default values for optional parameters at the beginning of decorators stack.
|
450 |
+
The other decorators can then group requests into bigger batches resulting in a better model performance.
|
451 |
+
|
452 |
+
Typical use:
|
453 |
+
```python
|
454 |
+
@fill_optionals()
|
455 |
+
@group_by_keys()
|
456 |
+
@batch
|
457 |
+
def infer_fun(**inputs):
|
458 |
+
...
|
459 |
+
return outputs
|
460 |
+
```
|
461 |
+
|
462 |
+
Args:
|
463 |
+
defaults: keyword arguments containing default values for missing inputs
|
464 |
+
|
465 |
+
|
466 |
+
If you have default values for some optional parameter it is good idea to provide them at the very beginning,
|
467 |
+
so the other decorators (e.g. @group_by_keys) can make bigger consistent groups.
|
468 |
+
"""
|
469 |
+
|
470 |
+
def _verify_defaults(model_config: TritonModelConfig):
|
471 |
+
inputs = {spec.name: spec for spec in model_config.inputs}
|
472 |
+
not_matching_default_names = sorted(set(defaults) - set(inputs))
|
473 |
+
if not_matching_default_names:
|
474 |
+
raise PyTritonBadParameterError(f"Could not found {', '.join(not_matching_default_names)} inputs")
|
475 |
+
|
476 |
+
non_numpy_items = {k: v for k, v in defaults.items() if not isinstance(v, np.ndarray)}
|
477 |
+
if non_numpy_items:
|
478 |
+
raise PyTritonBadParameterError(
|
479 |
+
f"Could not use {', '.join([f'{k}={v}' for k, v in non_numpy_items.items()])} defaults "
|
480 |
+
"as they are not NumPy arrays"
|
481 |
+
)
|
482 |
+
|
483 |
+
not_matching_dtypes = {k: (v.dtype, inputs[k].dtype) for k, v in defaults.items() if v.dtype != inputs[k].dtype}
|
484 |
+
if not_matching_dtypes:
|
485 |
+
non_matching_dtypes_str_list = [
|
486 |
+
f"{name}: dtype={have_dtype} expected_dtype={expected_dtype}"
|
487 |
+
for name, (have_dtype, expected_dtype) in not_matching_dtypes.items()
|
488 |
+
]
|
489 |
+
raise PyTritonBadParameterError(
|
490 |
+
f"Could not use {', '.join(non_matching_dtypes_str_list)} "
|
491 |
+
f"defaults as they have different than input signature dtypes"
|
492 |
+
)
|
493 |
+
|
494 |
+
def _shape_match(_have_shape, _expected_shape):
|
495 |
+
return len(_have_shape) == len(_expected_shape) and all(
|
496 |
+
e == -1 or h == e for h, e in zip(_have_shape, _expected_shape)
|
497 |
+
)
|
498 |
+
|
499 |
+
not_matching_shapes = {
|
500 |
+
k: (v.shape, inputs[k].shape) for k, v in defaults.items() if not _shape_match(v.shape, inputs[k].shape)
|
501 |
+
}
|
502 |
+
if not_matching_shapes:
|
503 |
+
non_matching_shapes_str_list = [
|
504 |
+
f"{name}: shape={have_shape} expected_shape={expected_shape}"
|
505 |
+
for name, (have_shape, expected_shape) in not_matching_shapes.items()
|
506 |
+
]
|
507 |
+
raise PyTritonBadParameterError(
|
508 |
+
f"Could not use {', '.join(non_matching_shapes_str_list)} "
|
509 |
+
f"defaults as they have different than input signature shapes"
|
510 |
+
)
|
511 |
+
|
512 |
+
@wrapt.decorator
|
513 |
+
def _wrapper(wrapped, instance, args, kwargs):
|
514 |
+
model_config = get_model_config(wrapped, instance)
|
515 |
+
_verify_defaults(model_config)
|
516 |
+
# verification if not after group wrappers is in group wrappers
|
517 |
+
|
518 |
+
(requests,) = args
|
519 |
+
|
520 |
+
model_supports_batching = model_config.batching
|
521 |
+
for request in requests:
|
522 |
+
batch_size = get_inference_request_batch_size(request) if model_supports_batching else None
|
523 |
+
for default_key, default_value in defaults.items():
|
524 |
+
if default_key in request:
|
525 |
+
continue
|
526 |
+
|
527 |
+
if model_supports_batching:
|
528 |
+
ones_reps = (1,) * default_value.ndim # repeat once default_value on each axis
|
529 |
+
axis_reps = (batch_size,) + ones_reps # ... except on batch axis. we repeat it batch_size times
|
530 |
+
default_value = np.tile(default_value, axis_reps)
|
531 |
+
|
532 |
+
request[default_key] = default_value
|
533 |
+
return wrapped(*args, **kwargs)
|
534 |
+
|
535 |
+
return _wrapper
|
536 |
+
|
537 |
+
|
538 |
+
@wrapt.decorator
|
539 |
+
def triton_context(wrapped, instance, args, kwargs):
|
540 |
+
"""Adds triton context.
|
541 |
+
|
542 |
+
It gives you additional argument passed to the function in **kwargs called 'triton_context'.
|
543 |
+
You can read model config from it and in the future possibly have some interaction with triton.
|
544 |
+
"""
|
545 |
+
kwargs[TRITON_CONTEXT_FIELD_NAME] = get_triton_context(wrapped, instance)
|
546 |
+
return wrapped(*args, **kwargs)
|
547 |
+
|
548 |
+
|
549 |
+
@wrapt.decorator
|
550 |
+
def pad_batch(wrapped, instance, args, kwargs):
|
551 |
+
"""Add padding to the inputs batches.
|
552 |
+
|
553 |
+
Decorator appends last rows to the inputs multiple times to get desired batch size (preferred batch size or
|
554 |
+
max batch size from model config whatever is closer to current input size).
|
555 |
+
"""
|
556 |
+
inputs = {k: v for k, v in kwargs.items() if k != "__triton_context__"}
|
557 |
+
first_input = next(iter(inputs.values()))
|
558 |
+
config = get_model_config(wrapped, instance)
|
559 |
+
batch_sizes = (
|
560 |
+
[]
|
561 |
+
if (config.batcher is None or config.batcher.preferred_batch_size is None)
|
562 |
+
else sorted(config.batcher.preferred_batch_size)
|
563 |
+
)
|
564 |
+
batch_sizes.append(config.max_batch_size)
|
565 |
+
batch_size = batch_sizes[bisect_left(batch_sizes, first_input.shape[0])]
|
566 |
+
|
567 |
+
new_inputs = {
|
568 |
+
input_name: np.repeat(
|
569 |
+
input_array,
|
570 |
+
np.concatenate([
|
571 |
+
np.ones(input_array.shape[0] - 1),
|
572 |
+
np.array([batch_size - input_array.shape[0] + 1]),
|
573 |
+
]).astype(np.int64),
|
574 |
+
axis=0,
|
575 |
+
)
|
576 |
+
for input_name, input_array in inputs.items()
|
577 |
+
}
|
578 |
+
|
579 |
+
kwargs.update(new_inputs)
|
580 |
+
return wrapped(*args, **kwargs)
|
581 |
+
|
582 |
+
|
583 |
+
_SPECIAL_KEYS = ["__triton_context__"]
|
584 |
+
|
585 |
+
|
586 |
+
def first_value(*keys: str, squeeze_single_values=True, strict: bool = True):
|
587 |
+
"""This decorator overwrites selected inputs with first element of the given input.
|
588 |
+
|
589 |
+
It can be used in two ways:
|
590 |
+
|
591 |
+
1. Wrapping a single request inference callable by chaining with @batch decorator:
|
592 |
+
```python
|
593 |
+
@batch
|
594 |
+
@first_value("temperature")
|
595 |
+
def infer_fn(**inputs):
|
596 |
+
...
|
597 |
+
return result
|
598 |
+
```
|
599 |
+
|
600 |
+
2. Wrapping a multiple requests inference callable:
|
601 |
+
```python
|
602 |
+
@first_value("temperature")
|
603 |
+
def infer_fn(requests):
|
604 |
+
...
|
605 |
+
return results
|
606 |
+
```
|
607 |
+
|
608 |
+
By default, the decorator squeezes single value arrays to scalars.
|
609 |
+
This behavior can be disabled by setting the `squeeze_single_values` flag to False.
|
610 |
+
|
611 |
+
By default, the decorator checks the equality of the values on the selected inputs.
|
612 |
+
This behavior can be disabled by setting the `strict` flag to False.
|
613 |
+
|
614 |
+
Wrapper can only be used with models that support batching.
|
615 |
+
|
616 |
+
Args:
|
617 |
+
keys: The input keys selected for conversion.
|
618 |
+
squeeze_single_values: squeeze single value ND array to scalar values. Defaults to True.
|
619 |
+
strict: enable checking if all values on single selected input of request are equal. Defaults to True.
|
620 |
+
|
621 |
+
Raises:
|
622 |
+
PyTritonRuntimeError: if not all values on a single selected input of the request are equal
|
623 |
+
and the strict flag is set to True, or if the decorator is used with a model that doesn't support batching.
|
624 |
+
PyTritonBadParameterError: if any of the keys passed to the decorator are not allowed.
|
625 |
+
"""
|
626 |
+
if any(k in _SPECIAL_KEYS for k in keys):
|
627 |
+
not_allowed_keys = [key for key in keys if key in _SPECIAL_KEYS]
|
628 |
+
raise PyTritonBadParameterError(
|
629 |
+
f"The keys {', '.join(not_allowed_keys)} are not allowed as keys for @first_value wrapper. "
|
630 |
+
f"The set of not allowed keys are {', '.join(_SPECIAL_KEYS)}"
|
631 |
+
)
|
632 |
+
|
633 |
+
@wrapt.decorator
|
634 |
+
def wrapper(wrapped, instance, args, kwargs):
|
635 |
+
model_config = get_model_config(wrapped, instance)
|
636 |
+
if not model_config.batching:
|
637 |
+
raise PyTritonRuntimeError("The @first_value decorator can only be used with models that support batching.")
|
638 |
+
|
639 |
+
def _replace_inputs_with_first_value(_request):
|
640 |
+
for input_name in keys:
|
641 |
+
if input_name not in _request:
|
642 |
+
continue
|
643 |
+
|
644 |
+
values = _request[input_name]
|
645 |
+
if strict:
|
646 |
+
# do not set axis for arrays with strings (object) or models not supporting batching
|
647 |
+
axis_of_uniqueness = None if values.dtype == object else 0
|
648 |
+
unique_values = np.unique(values, axis=axis_of_uniqueness)
|
649 |
+
if len(unique_values) > 1:
|
650 |
+
raise PyTritonRuntimeError(
|
651 |
+
f"The values on the {input_name!r} input are not equal. "
|
652 |
+
"To proceed, either disable strict mode in @first_value wrapper "
|
653 |
+
"or ensure that the values always are consistent. "
|
654 |
+
f"The current values of {input_name!r} are {_request[input_name]!r}."
|
655 |
+
)
|
656 |
+
|
657 |
+
_first_value = values[0]
|
658 |
+
if (
|
659 |
+
squeeze_single_values
|
660 |
+
and not np.isscalar(_first_value)
|
661 |
+
and all(dim == 1 for dim in _first_value.shape)
|
662 |
+
):
|
663 |
+
_dim_0_array = np.squeeze(_first_value)
|
664 |
+
_first_value = _dim_0_array[()] # obtain scalar from 0-dim array with numpy type
|
665 |
+
|
666 |
+
_request[input_name] = _first_value
|
667 |
+
return _request
|
668 |
+
|
669 |
+
inputs_names = set(kwargs) - set(_SPECIAL_KEYS)
|
670 |
+
if inputs_names:
|
671 |
+
kwargs = _replace_inputs_with_first_value(kwargs)
|
672 |
+
return wrapped(*args, **kwargs)
|
673 |
+
else:
|
674 |
+
requests, *other_args = args
|
675 |
+
requests = [_replace_inputs_with_first_value(request) for request in requests]
|
676 |
+
return wrapped(requests, *other_args, **kwargs)
|
677 |
+
|
678 |
+
return wrapper
|
stf/stf-api-alternative/pytriton/build/lib/pytriton/exceptions.py
ADDED
@@ -0,0 +1,80 @@
1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
"""PyTriton exceptions definition."""
|
15 |
+
|
16 |
+
|
17 |
+
class PyTritonError(Exception):
|
18 |
+
"""Generic PyTriton exception."""
|
19 |
+
|
20 |
+
def __init__(self, message: str):
|
21 |
+
"""Initialize exception with message.
|
22 |
+
|
23 |
+
Args:
|
24 |
+
message: Error message
|
25 |
+
"""
|
26 |
+
self._message = message
|
27 |
+
|
28 |
+
def __str__(self) -> str:
|
29 |
+
"""Return exception as a string.
|
30 |
+
|
31 |
+
Returns:
|
32 |
+
Message content
|
33 |
+
"""
|
34 |
+
return self._message
|
35 |
+
|
36 |
+
@property
|
37 |
+
def message(self):
|
38 |
+
"""Get the exception message.
|
39 |
+
|
40 |
+
Returns:
|
41 |
+
The message associated with this exception, or None if no message.
|
42 |
+
|
43 |
+
"""
|
44 |
+
return self._message
|
45 |
+
|
46 |
+
|
47 |
+
class PyTritonValidationError(PyTritonError):
|
48 |
+
"""PyTriton configuration validation exception."""
|
49 |
+
|
50 |
+
pass
|
51 |
+
|
52 |
+
|
53 |
+
class PyTritonInvalidOperationError(PyTritonError):
|
54 |
+
"""PyTriton invalid operation exception."""
|
55 |
+
|
56 |
+
pass
|
57 |
+
|
58 |
+
|
59 |
+
class PyTritonBadParameterError(PyTritonError):
|
60 |
+
"""PyTriton invalid parameter exception."""
|
61 |
+
|
62 |
+
pass
|
63 |
+
|
64 |
+
|
65 |
+
class PyTritonModelConfigError(PyTritonError):
|
66 |
+
"""PyTriton invalid model config exception."""
|
67 |
+
|
68 |
+
pass
|
69 |
+
|
70 |
+
|
71 |
+
class PyTritonUnrecoverableError(PyTritonError):
|
72 |
+
"""Unrecoverable error occurred in inference callable, thus no further inferences possible."""
|
73 |
+
|
74 |
+
pass
|
75 |
+
|
76 |
+
|
77 |
+
class PyTritonRuntimeError(PyTritonError):
|
78 |
+
"""Raised when an error is detected that doesn’t fall in any of the other categories."""
|
79 |
+
|
80 |
+
pass
|
stf/stf-api-alternative/pytriton/build/lib/pytriton/model_config/__init__.py
ADDED
@@ -0,0 +1,17 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# noqa: D104
from .common import DeviceKind, DynamicBatcher, QueuePolicy, TimeoutAction  # noqa: F401
from .model_config import ModelConfig  # noqa: F401
from .tensor import Tensor  # noqa: F401
stf/stf-api-alternative/pytriton/build/lib/pytriton/model_config/common.py
ADDED
@@ -0,0 +1,93 @@
1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
"""Common structures for internal and external ModelConfig."""
|
15 |
+
|
16 |
+
import dataclasses
|
17 |
+
import enum
|
18 |
+
from typing import Dict, Optional
|
19 |
+
|
20 |
+
|
21 |
+
class DeviceKind(enum.Enum):
|
22 |
+
"""Device kind for model deployment.
|
23 |
+
|
24 |
+
Args:
|
25 |
+
KIND_AUTO: Automatically select the device for model deployment.
|
26 |
+
KIND_CPU: Model is deployed on CPU.
|
27 |
+
KIND_GPU: Model is deployed on GPU.
|
28 |
+
"""
|
29 |
+
|
30 |
+
KIND_AUTO = "KIND_AUTO"
|
31 |
+
KIND_CPU = "KIND_CPU"
|
32 |
+
KIND_GPU = "KIND_GPU"
|
33 |
+
|
34 |
+
|
35 |
+
class TimeoutAction(enum.Enum):
|
36 |
+
"""Timeout action definition for timeout_action QueuePolicy field.
|
37 |
+
|
38 |
+
Args:
|
39 |
+
REJECT: Reject the request and return error message accordingly.
|
40 |
+
DELAY: Delay the request until all other requests at the same (or higher) priority levels
|
41 |
+
that have not reached their timeouts are processed.
|
42 |
+
"""
|
43 |
+
|
44 |
+
REJECT = "REJECT"
|
45 |
+
DELAY = "DELAY"
|
46 |
+
|
47 |
+
|
48 |
+
@dataclasses.dataclass
|
49 |
+
class QueuePolicy:
|
50 |
+
"""Model queue policy configuration.
|
51 |
+
|
52 |
+
More in Triton Inference Server [documentation]
|
53 |
+
[documentation]: https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto#L1037
|
54 |
+
|
55 |
+
Args:
|
56 |
+
timeout_action: The action applied to timed-out request.
|
57 |
+
default_timeout_microseconds: The default timeout for every request, in microseconds.
|
58 |
+
allow_timeout_override: Whether individual request can override the default timeout value.
|
59 |
+
max_queue_size: The maximum queue size for holding requests.
|
60 |
+
"""
|
61 |
+
|
62 |
+
timeout_action: TimeoutAction = TimeoutAction.REJECT
|
63 |
+
default_timeout_microseconds: int = 0
|
64 |
+
allow_timeout_override: bool = False
|
65 |
+
max_queue_size: int = 0
|
66 |
+
|
67 |
+
|
68 |
+
@dataclasses.dataclass
|
69 |
+
class DynamicBatcher:
|
70 |
+
"""Dynamic batcher configuration.
|
71 |
+
|
72 |
+
More in Triton Inference Server [documentation]
|
73 |
+
[documentation]: https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto#L1104
|
74 |
+
|
75 |
+
Args:
|
76 |
+
max_queue_delay_microseconds: The maximum time, in microseconds, a request will be delayed in
|
77 |
+
the scheduling queue to wait for additional requests for batching.
|
78 |
+
preferred_batch_size: Preferred batch sizes for dynamic batching.
|
79 |
+
preserve_ordering: Should the dynamic batcher preserve the ordering of responses to
|
80 |
+
match the order of requests received by the scheduler.
|
81 |
+
priority_levels: The number of priority levels to be enabled for the model.
|
82 |
+
default_priority_level: The priority level used for requests that don't specify their priority.
|
83 |
+
default_queue_policy: The default queue policy used for requests.
|
84 |
+
priority_queue_policy: Specify the queue policy for the priority level.
|
85 |
+
"""
|
86 |
+
|
87 |
+
max_queue_delay_microseconds: int = 0
|
88 |
+
preferred_batch_size: Optional[list] = None
|
89 |
+
preserve_ordering: bool = False
|
90 |
+
priority_levels: int = 0
|
91 |
+
default_priority_level: int = 0
|
92 |
+
default_queue_policy: Optional[QueuePolicy] = None
|
93 |
+
priority_queue_policy: Optional[Dict[int, QueuePolicy]] = None
|
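
For orientation (not part of the diff), a short sketch of how these dataclasses compose into a dynamic batching configuration; the concrete values are arbitrary:

```python
batcher = DynamicBatcher(
    max_queue_delay_microseconds=100,
    preferred_batch_size=[4, 8],
    default_queue_policy=QueuePolicy(
        timeout_action=TimeoutAction.DELAY,
        default_timeout_microseconds=5_000_000,
        allow_timeout_override=True,
        max_queue_size=64,
    ),
)
```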
stf/stf-api-alternative/pytriton/build/lib/pytriton/model_config/generator.py
ADDED
@@ -0,0 +1,284 @@
1 |
+
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
"""Generator class for creating Triton model config.
|
15 |
+
|
16 |
+
The class consumes the TritonModelConfig object as a constructor argument and produces the Triton model config in the form of
|
17 |
+
a dict or a file.
|
18 |
+
|
19 |
+
Typical usage example:
|
20 |
+
|
21 |
+
model_config = TritonModelConfig(model_name="simple")
|
22 |
+
generator = ModelConfigGenerator(model_config)
|
23 |
+
generator.to_file("/path/to/config.pbtxt")
|
24 |
+
"""
|
25 |
+
|
26 |
+
import json
|
27 |
+
import logging
|
28 |
+
import pathlib
|
29 |
+
from typing import Dict, Union
|
30 |
+
|
31 |
+
import numpy as np
|
32 |
+
from google.protobuf import json_format, text_format # pytype: disable=pyi-error
|
33 |
+
|
34 |
+
from pytriton.exceptions import PyTritonBadParameterError
|
35 |
+
|
36 |
+
from .triton_model_config import DynamicBatcher, TensorSpec, TritonModelConfig
|
37 |
+
|
38 |
+
try:
|
39 |
+
import tritonclient.grpc as grpc_client
|
40 |
+
from tritonclient import utils as client_utils # noqa: F401
|
41 |
+
except ImportError:
|
42 |
+
try:
|
43 |
+
import tritonclientutils as client_utils # noqa: F401
|
44 |
+
import tritongrpcclient as grpc_client
|
45 |
+
except ImportError:
|
46 |
+
client_utils = None
|
47 |
+
grpc_client = None
|
48 |
+
|
49 |
+
LOGGER = logging.getLogger(__name__)
|
50 |
+
|
51 |
+
|
52 |
+
class ModelConfigGenerator:
|
53 |
+
"""Generate the protobuf config from ModelConfig object."""
|
54 |
+
|
55 |
+
def __init__(self, config: TritonModelConfig):
|
56 |
+
"""Initialize generator.
|
57 |
+
|
58 |
+
Args:
|
59 |
+
config: model config object
|
60 |
+
"""
|
61 |
+
self._config = config
|
62 |
+
|
63 |
+
def to_file(self, config_path: Union[str, pathlib.Path]) -> str:
|
64 |
+
"""Serialize ModelConfig to prototxt and save to config_path directory.
|
65 |
+
|
66 |
+
Args:
|
67 |
+
config_path: path to configuration file
|
68 |
+
|
69 |
+
Returns:
|
70 |
+
A string with generated model configuration
|
71 |
+
"""
|
72 |
+
from tritonclient.grpc import model_config_pb2 # pytype: disable=import-error
|
73 |
+
|
74 |
+
# https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto
|
75 |
+
model_config = self.get_config()
|
76 |
+
LOGGER.debug(f"Generated Triton config:\n{json.dumps(model_config, indent=4)}")
|
77 |
+
|
78 |
+
config_payload = json_format.ParseDict(model_config, model_config_pb2.ModelConfig())
|
79 |
+
LOGGER.debug(f"Generated Triton config payload:\n{config_payload}")
|
80 |
+
|
81 |
+
config_path = pathlib.Path(config_path)
|
82 |
+
config_path.parent.mkdir(parents=True, exist_ok=True)
|
83 |
+
|
84 |
+
model_config_bytes = text_format.MessageToBytes(config_payload)
|
85 |
+
|
86 |
+
# WAR: triton requires max_batch_size = 0 to be explicit written
|
87 |
+
# while this is not stored in payload during MessageToBytes
|
88 |
+
if model_config["max_batch_size"] == 0:
|
89 |
+
model_config_bytes += b"max_batch_size: 0\n"
|
90 |
+
|
91 |
+
with config_path.open("wb") as cfg:
|
92 |
+
cfg.write(model_config_bytes)
|
93 |
+
|
94 |
+
LOGGER.debug(f"Generated config stored in {config_path}")
|
95 |
+
|
96 |
+
return config_payload
|
97 |
+
|
98 |
+
def get_config(self) -> Dict:
|
99 |
+
"""Create a Triton model config from ModelConfig object.
|
100 |
+
|
101 |
+
Returns:
|
102 |
+
Dict with model configuration data
|
103 |
+
"""
|
104 |
+
model_config = {"name": self._config.model_name, "backend": self._config.backend}
|
105 |
+
self._set_batching(model_config)
|
106 |
+
self._set_model_signature(model_config)
|
107 |
+
self._set_instance_group(model_config)
|
108 |
+
self._set_model_transaction_policy(model_config)
|
109 |
+
self._set_backend_parameters(model_config)
|
110 |
+
self._set_response_cache(model_config)
|
111 |
+
return model_config
|
112 |
+
|
113 |
+
def _set_batching(self, model_config: Dict) -> None:
|
114 |
+
"""Configure batching for model deployment on Triton Inference Server.
|
115 |
+
|
116 |
+
Args:
|
117 |
+
model_config: Dict with model config for Triton Inference Server
|
118 |
+
"""
|
119 |
+
if not self._config.batching:
|
120 |
+
model_config["max_batch_size"] = 0
|
121 |
+
LOGGER.debug("Batching for model is disabled. The `max_batch_size` field value set to 0.")
|
122 |
+
return
|
123 |
+
elif self._config.max_batch_size < 1:
|
124 |
+
raise PyTritonBadParameterError("The `max_batch_size` must be greater or equal to 1.")
|
125 |
+
|
126 |
+
model_config["max_batch_size"] = self._config.max_batch_size
|
127 |
+
if isinstance(self._config.batcher, DynamicBatcher):
|
128 |
+
dynamic_batching_config = {}
|
129 |
+
if self._config.batcher.max_queue_delay_microseconds > 0:
|
130 |
+
dynamic_batching_config["maxQueueDelayMicroseconds"] = int(
|
131 |
+
self._config.batcher.max_queue_delay_microseconds
|
132 |
+
)
|
133 |
+
|
134 |
+
if self._config.batcher.preferred_batch_size:
|
135 |
+
dynamic_batching_config["preferredBatchSize"] = [
|
136 |
+
int(bs) for bs in self._config.batcher.preferred_batch_size
|
137 |
+
]
|
138 |
+
|
139 |
+
if self._config.batcher.preserve_ordering:
|
140 |
+
dynamic_batching_config["preserveOrdering"] = self._config.batcher.preserve_ordering
|
141 |
+
|
142 |
+
if self._config.batcher.priority_levels:
|
143 |
+
dynamic_batching_config["priorityLevels"] = self._config.batcher.priority_levels
|
144 |
+
|
145 |
+
if self._config.batcher.default_priority_level:
|
146 |
+
if self._config.batcher.default_priority_level > self._config.batcher.priority_levels:
|
147 |
+
raise PyTritonBadParameterError(
|
148 |
+
"The `default_priority_level` must be between 1 and " f"{self._config.batcher.priority_levels}."
|
149 |
+
)
|
150 |
+
dynamic_batching_config["defaultPriorityLevel"] = self._config.batcher.default_priority_level
|
151 |
+
|
152 |
+
if self._config.batcher.default_queue_policy:
|
153 |
+
priority_queue_policy_config = {
|
154 |
+
"timeoutAction": self._config.batcher.default_queue_policy.timeout_action.value,
|
155 |
+
"defaultTimeoutMicroseconds": int(
|
156 |
+
self._config.batcher.default_queue_policy.default_timeout_microseconds
|
157 |
+
),
|
158 |
+
"allowTimeoutOverride": self._config.batcher.default_queue_policy.allow_timeout_override,
|
159 |
+
"maxQueueSize": int(self._config.batcher.default_queue_policy.max_queue_size),
|
160 |
+
}
|
161 |
+
dynamic_batching_config["defaultQueuePolicy"] = priority_queue_policy_config
|
162 |
+
|
163 |
+
if self._config.batcher.priority_queue_policy:
|
164 |
+
if not self._config.batcher.priority_levels:
|
165 |
+
raise PyTritonBadParameterError(
|
166 |
+
"Provide the `priority_levels` if you want to define `priority_queue_policy` "
|
167 |
+
"for Dynamic Batching."
|
168 |
+
)
|
169 |
+
|
170 |
+
priority_queue_policy_config = {}
|
171 |
+
for priority, queue_policy in self._config.batcher.priority_queue_policy.items():
|
172 |
+
if priority < 0 or priority > self._config.batcher.priority_levels:
|
173 |
+
raise PyTritonBadParameterError(
|
174 |
+
f"Invalid `priority`={priority} provided. The value must be between "
|
175 |
+
f"1 and {self._config.batcher.priority_levels}."
|
176 |
+
)
|
177 |
+
|
178 |
+
priority_queue_policy_config[priority] = {
|
179 |
+
"timeoutAction": queue_policy.timeout_action.value,
|
180 |
+
"defaultTimeoutMicroseconds": int(queue_policy.default_timeout_microseconds),
|
181 |
+
"allowTimeoutOverride": queue_policy.allow_timeout_override,
|
182 |
+
"maxQueueSize": int(queue_policy.max_queue_size),
|
183 |
+
}
|
184 |
+
|
185 |
+
dynamic_batching_config["priorityQueuePolicy"] = priority_queue_policy_config
|
186 |
+
|
187 |
+
model_config["dynamic_batching"] = dynamic_batching_config
|
188 |
+
else:
|
189 |
+
LOGGER.debug("Default batching used")
|
190 |
+
|
191 |
+
def _set_instance_group(self, model_config: Dict) -> None:
|
192 |
+
"""Configure instance group for model deployment on Triton Inference Server.
|
193 |
+
|
194 |
+
Args:
|
195 |
+
model_config: Dict with model config for Triton Inference Server
|
196 |
+
"""
|
197 |
+
instance_groups = []
|
198 |
+
for device_kind, count in self._config.instance_group.items():
|
199 |
+
instance_groups.append({
|
200 |
+
"count": count,
|
201 |
+
"kind": device_kind.value,
|
202 |
+
})
|
203 |
+
|
204 |
+
if instance_groups:
|
205 |
+
model_config["instance_group"] = instance_groups
|
206 |
+
|
207 |
+
def _set_model_transaction_policy(self, model_config: Dict) -> None:
|
208 |
+
"""Configure model transaction policy for model deployment on Triton Inference Server.
|
209 |
+
|
210 |
+
Args:
|
211 |
+
model_config: Dict with model config for Triton Inference Server
|
212 |
+
"""
|
213 |
+
if self._config.decoupled:
|
214 |
+
model_config["model_transaction_policy"] = {"decoupled": True}
|
215 |
+
|
216 |
+
def _set_backend_parameters(self, model_config: Dict) -> None:
|
217 |
+
"""Configure backend parameters for model deployment on Triton Inference Server.
|
218 |
+
|
219 |
+
Args:
|
220 |
+
model_config: Dict with model config for Triton Inference Server
|
221 |
+
"""
|
222 |
+
parameters = {}
|
223 |
+
for key, value in self._config.backend_parameters.items():
|
224 |
+
parameters[key] = {
|
225 |
+
"string_value": str(value),
|
226 |
+
}
|
227 |
+
|
228 |
+
if parameters:
|
229 |
+
model_config["parameters"] = parameters
|
230 |
+
|
231 |
+
def _set_model_signature(self, model_config: Dict) -> None:
|
232 |
+
"""Configure model signature for model deployment on Triton Inference Server.
|
233 |
+
|
234 |
+
Args:
|
235 |
+
model_config: Dict with model config for Triton Inference Server
|
236 |
+
|
237 |
+
"""
|
238 |
+
|
239 |
+
def _rewrite_io_spec(spec_: TensorSpec) -> Dict:
|
240 |
+
if spec_.dtype in [np.object_, object, bytes, np.bytes_]:
|
241 |
+
dtype = "TYPE_STRING"
|
242 |
+
else:
|
243 |
+
# pytype: disable=attribute-error
|
244 |
+
dtype = spec_.dtype().dtype
|
245 |
+
# pytype: enable=attribute-error
|
246 |
+
dtype = f"TYPE_{client_utils.np_to_triton_dtype(dtype)}"
|
247 |
+
|
248 |
+
dims = spec_.shape
|
249 |
+
|
250 |
+
item = {
|
251 |
+
"name": spec_.name,
|
252 |
+
"dims": list(dims),
|
253 |
+
"data_type": dtype,
|
254 |
+
}
|
255 |
+
|
256 |
+
if spec_.optional:
|
257 |
+
item["optional"] = True
|
258 |
+
|
259 |
+
return item
|
260 |
+
|
261 |
+
if self._config.inputs:
|
262 |
+
model_config["input"] = [_rewrite_io_spec(spec) for spec in self._config.inputs]
|
263 |
+
|
264 |
+
if self._config.outputs:
|
265 |
+
outputs = [_rewrite_io_spec(spec) for spec in self._config.outputs]
|
266 |
+
if outputs:
|
267 |
+
optional_outputs = [o for o in outputs if o.get("optional")]
|
268 |
+
if optional_outputs:
|
269 |
+
raise PyTritonBadParameterError(
|
270 |
+
"Optional flag for outputs is not supported. "
|
271 |
+
f"Outputs marked as optional: {', '.join([o['name'] for o in optional_outputs])}."
|
272 |
+
)
|
273 |
+
model_config["output"] = outputs
|
274 |
+
|
275 |
+
def _set_response_cache(self, model_config: Dict):
|
276 |
+
"""Configure response cache for model.
|
277 |
+
|
278 |
+
Args:
|
279 |
+
model_config: Dictionary where configuration is attached.
|
280 |
+
"""
|
281 |
+
if self._config.response_cache:
|
282 |
+
model_config["response_cache"] = {
|
283 |
+
"enable": self._config.response_cache.enable,
|
284 |
+
}
|
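A minimal usage sketch for the generator above, assuming the sibling TritonModelConfig, TensorSpec, and DynamicBatcher classes that appear later in this diff; the model name, shapes, and output path are illustrative only, and to_file additionally requires the tritonclient package to be installed.

    # Hedged sketch (not part of the packaged sources): build a TritonModelConfig
    # and let ModelConfigGenerator emit both the dict form and a config.pbtxt file.
    import numpy as np

    from pytriton.model_config.generator import ModelConfigGenerator
    from pytriton.model_config.triton_model_config import DynamicBatcher, TensorSpec, TritonModelConfig

    model_config = TritonModelConfig(
        model_name="simple",
        max_batch_size=16,
        batcher=DynamicBatcher(max_queue_delay_microseconds=100),
        inputs=[TensorSpec(name="INPUT_1", shape=(-1,), dtype=np.float32)],
        outputs=[TensorSpec(name="OUTPUT_1", shape=(-1,), dtype=np.float32)],
    )
    generator = ModelConfigGenerator(model_config)
    print(generator.get_config())                  # plain dict with name, backend, batching and I/O sections
    generator.to_file("/tmp/simple/config.pbtxt")  # writes the protobuf text representation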
stf/stf-api-alternative/pytriton/build/lib/pytriton/model_config/model_config.py
ADDED
@@ -0,0 +1,43 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model configurations.

Dataclasses with specialized deployment paths for models on Triton. The purpose of this module is to provide
clear options to configure models of given types.

The dataclasses are exposed in the user API.
"""

import dataclasses

from pytriton.model_config import DynamicBatcher


@dataclasses.dataclass
class ModelConfig:
    """Additional model configuration for running model through Triton Inference Server.

    Args:
        batching: Flag to enable/disable batching for model.
        max_batch_size: The maximal batch size that would be handled by model.
        batcher: Configuration of Dynamic Batching for the model.
        response_cache: Flag to enable/disable response cache for the model.
        decoupled: Flag to enable/disable decoupled execution of requests.
    """

    batching: bool = True
    max_batch_size: int = 4
    batcher: DynamicBatcher = dataclasses.field(default_factory=DynamicBatcher)
    response_cache: bool = False
    decoupled: bool = False
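A short sketch of the user-facing ModelConfig dataclass above with dynamic batching enabled; the concrete values are illustrative assumptions, and DynamicBatcher is imported from the public pytriton.model_config package as in the module itself.

    # Hedged sketch: user-level model configuration with dynamic batching.
    from pytriton.model_config import DynamicBatcher, ModelConfig

    config = ModelConfig(
        batching=True,
        max_batch_size=64,
        batcher=DynamicBatcher(max_queue_delay_microseconds=5000, preferred_batch_size=[16, 32]),
        response_cache=False,
        decoupled=False,
    )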
stf/stf-api-alternative/pytriton/build/lib/pytriton/model_config/parser.py
ADDED
@@ -0,0 +1,258 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ModelConfigParser class definition.

Provide functionality to parse the Triton model configuration stored in a file or a dictionary into an object
of class ModelConfig.

Examples of use:

    # Parse from dict
    model_config = ModelConfigParser.from_dict(model_config_dict)

    # Parse from file
    model_config = ModelConfigParser.from_file("/path/to/config.pbtxt")

"""

import json
import logging
import pathlib
from typing import Dict

import numpy as np
from google.protobuf import json_format, text_format  # pytype: disable=pyi-error

from pytriton.exceptions import PyTritonModelConfigError

from .common import QueuePolicy, TimeoutAction
from .triton_model_config import DeviceKind, DynamicBatcher, ResponseCache, TensorSpec, TritonModelConfig

try:
    import tritonclient.grpc as grpc_client
    from tritonclient import utils as client_utils  # noqa: F401
except ImportError:
    try:
        import tritonclientutils as client_utils  # noqa: F401
        import tritongrpcclient as grpc_client
    except ImportError:
        client_utils = None
        grpc_client = None

LOGGER = logging.getLogger(__name__)


class ModelConfigParser:
    """Provide functionality to parse dictionary or file to ModelConfig object."""

    @classmethod
    def from_dict(cls, model_config_dict: Dict) -> TritonModelConfig:
        """Create ModelConfig from configuration stored in dictionary.

        Args:
            model_config_dict: Dictionary with model config

        Returns:
            A ModelConfig object with data parsed from the dictionary
        """
        LOGGER.debug(f"Parsing Triton config model from dict: \n{json.dumps(model_config_dict, indent=4)}")

        if model_config_dict.get("max_batch_size", 0) > 0:
            batching = True
        else:
            batching = False

        dynamic_batcher_config = model_config_dict.get("dynamic_batching")
        if dynamic_batcher_config is not None:
            batcher = cls._parse_dynamic_batching(dynamic_batcher_config)
        else:
            batcher = None

        instance_group = {
            DeviceKind(entry["kind"]): entry.get("count") for entry in model_config_dict.get("instance_group", [])
        }

        decoupled = model_config_dict.get("model_transaction_policy", {}).get("decoupled", False)

        backend_parameters_config = model_config_dict.get("parameters", [])
        if isinstance(backend_parameters_config, list):
            # If the backend_parameters_config is a list of strings, use them as keys with empty values
            LOGGER.debug(f"backend_parameters_config is a list of strings: {backend_parameters_config}")
            backend_parameters = {name: "" for name in backend_parameters_config}
        elif isinstance(backend_parameters_config, dict):
            # If the backend_parameters_config is a dictionary, use the key and "string_value" fields as key-value pairs
            LOGGER.debug(f"backend_parameters_config is a dictionary: {backend_parameters_config}")
            backend_parameters = {
                name: backend_parameters_config[name]["string_value"] for name in backend_parameters_config
            }
        else:
            # Otherwise, raise an error
            LOGGER.error(
                f"Invalid type {type(backend_parameters_config)} for backend_parameters_config: {backend_parameters_config}"
            )
            raise TypeError(f"Invalid type for backend_parameters_config: {type(backend_parameters_config)}")

        inputs = [
            cls.rewrite_io_spec(item, "input", idx) for idx, item in enumerate(model_config_dict.get("input", []))
        ] or None
        outputs = [
            cls.rewrite_io_spec(item, "output", idx) for idx, item in enumerate(model_config_dict.get("output", []))
        ] or None

        response_cache_config = model_config_dict.get("response_cache")
        if response_cache_config:
            response_cache = cls._parse_response_cache(response_cache_config)
        else:
            response_cache = None

        return TritonModelConfig(
            model_name=model_config_dict["name"],
            batching=batching,
            max_batch_size=model_config_dict.get("max_batch_size", 0),
            batcher=batcher,
            inputs=inputs,
            outputs=outputs,
            instance_group=instance_group,
            decoupled=decoupled,
            backend_parameters=backend_parameters,
            response_cache=response_cache,
        )

    @classmethod
    def from_file(cls, *, config_path: pathlib.Path) -> TritonModelConfig:
        """Create ModelConfig from configuration stored in file.

        Args:
            config_path: location of file with model config

        Returns:
            A ModelConfig object with data parsed from the file
        """
        from tritonclient.grpc import model_config_pb2  # pytype: disable=import-error

        LOGGER.debug(f"Parsing Triton config model config_path={config_path}")

        with config_path.open("r") as config_file:
            payload = config_file.read()
            model_config_proto = text_format.Parse(payload, model_config_pb2.ModelConfig())

        model_config_dict = json_format.MessageToDict(model_config_proto, preserving_proto_field_name=True)
        return ModelConfigParser.from_dict(model_config_dict=model_config_dict)

    @classmethod
    def rewrite_io_spec(cls, item: Dict, io_type: str, idx: int) -> TensorSpec:
        """Rewrite the IO Spec provided in form of dictionary to TensorSpec.

        Args:
            item: IO data for input
            io_type: Type of the IO (input or output)
            idx: Index of IO

        Returns:
            TensorSpec with input or output data
        """
        name = item.get("name")
        if not name:
            raise PyTritonModelConfigError(f"Name for {io_type} at index {idx} not provided.")

        data_type = item.get("data_type")
        if not data_type:
            raise PyTritonModelConfigError(f"Data type for {io_type} with name `{name}` not defined.")

        data_type_val = data_type.split("_")
        if len(data_type_val) != 2:
            raise PyTritonModelConfigError(
                f"Invalid data type `{data_type}` for {io_type} with name `{name}` not defined. "
                "The expected name is TYPE_{type}."
            )

        data_type = data_type_val[1]
        if data_type == "STRING":
            dtype = np.bytes_
        else:
            dtype = client_utils.triton_to_np_dtype(data_type)
            if dtype is None:
                raise PyTritonModelConfigError(f"Unsupported data type `{data_type}` for {io_type} with name `{name}`")

            dtype = np.dtype("bool") if dtype is bool else dtype

        dims = item.get("dims", [])
        if not dims:
            raise PyTritonModelConfigError(f"Dimension for {io_type} with name `{name}` not defined.")

        shape = tuple(int(s) for s in dims)

        optional = item.get("optional", False)
        return TensorSpec(name=item["name"], shape=shape, dtype=dtype, optional=optional)

    @classmethod
    def _parse_dynamic_batching(cls, dynamic_batching_config: Dict) -> DynamicBatcher:
        """Parse config to create DynamicBatcher object.

        Args:
            dynamic_batching_config: Configuration of dynamic batcher from config

        Returns:
            DynamicBatcher object with configuration
        """
        default_queue_policy = None
        default_queue_policy_config = dynamic_batching_config.get("default_queue_policy")
        if default_queue_policy_config:
            default_queue_policy = QueuePolicy(
                timeout_action=TimeoutAction(
                    default_queue_policy_config.get("timeout_action", TimeoutAction.REJECT.value)
                ),
                default_timeout_microseconds=int(default_queue_policy_config.get("default_timeout_microseconds", 0)),
                allow_timeout_override=bool(default_queue_policy_config.get("allow_timeout_override", False)),
                max_queue_size=int(default_queue_policy_config.get("max_queue_size", 0)),
            )

        priority_queue_policy = None
        priority_queue_policy_config = dynamic_batching_config.get("priority_queue_policy")
        if priority_queue_policy_config:
            priority_queue_policy = {}
            for priority, queue_policy_config in priority_queue_policy_config.items():
                queue_policy = QueuePolicy(
                    timeout_action=TimeoutAction(queue_policy_config.get("timeout_action", TimeoutAction.REJECT.value)),
                    default_timeout_microseconds=int(queue_policy_config.get("default_timeout_microseconds", 0)),
                    allow_timeout_override=bool(queue_policy_config.get("allow_timeout_override", False)),
                    max_queue_size=int(queue_policy_config.get("max_queue_size", 0)),
                )
                priority_queue_policy[int(priority)] = queue_policy

        batcher = DynamicBatcher(
            preferred_batch_size=dynamic_batching_config.get("preferred_batch_size"),
            max_queue_delay_microseconds=int(dynamic_batching_config.get("max_queue_delay_microseconds", 0)),
            preserve_ordering=bool(dynamic_batching_config.get("preserve_ordering", False)),
            priority_levels=int(dynamic_batching_config.get("priority_levels", 0)),
            default_priority_level=int(dynamic_batching_config.get("default_priority_level", 0)),
            default_queue_policy=default_queue_policy,
            priority_queue_policy=priority_queue_policy,
        )
        return batcher

    @classmethod
    def _parse_response_cache(cls, response_cache_config: Dict) -> ResponseCache:
        """Parse config for response cache.

        Args:
            response_cache_config: response cache configuration

        Returns:
            ResponseCache object with configuration
        """
        response_cache = ResponseCache(
            enable=bool(response_cache_config["enable"]),
        )
        return response_cache
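A hedged round-trip sketch combining the two classes shown in this diff: ModelConfigGenerator writes a config.pbtxt, ModelConfigParser reads it back. The temporary path and model name are illustrative assumptions, and tritonclient must be installed for both calls.

    # Hedged sketch (not part of the packaged sources): config serialization round trip.
    import pathlib

    from pytriton.model_config.generator import ModelConfigGenerator
    from pytriton.model_config.parser import ModelConfigParser
    from pytriton.model_config.triton_model_config import TritonModelConfig

    original = TritonModelConfig(model_name="simple", max_batch_size=8)
    ModelConfigGenerator(original).to_file("/tmp/simple/config.pbtxt")

    parsed = ModelConfigParser.from_file(config_path=pathlib.Path("/tmp/simple/config.pbtxt"))
    assert parsed.model_name == "simple"
    assert parsed.max_batch_size == 8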
stf/stf-api-alternative/pytriton/build/lib/pytriton/model_config/tensor.py
ADDED
@@ -0,0 +1,57 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tensor object definition.

Describe the model input or output.

Examples of use:

    # Minimal constructors
    tensor = Tensor(dtype=np.bytes_, shape=(-1,))
    tensor = Tensor(dtype=np.float32, shape=(-1,))

    # Type definition from existing object
    a = np.array([1, 2, 3, 4])
    tensor = Tensor(dtype=a.dtype, shape=(-1,))

    # Custom name
    tensor = Tensor(name="data", dtype=np.float32, shape=(16,))
"""

import dataclasses
from typing import Optional, Type, Union

import numpy as np


@dataclasses.dataclass(frozen=True)
class Tensor:
    """Model input and output definition for Triton deployment.

    Args:
        shape: Shape of the input/output tensor.
        dtype: Data type of the input/output tensor.
        name: Name of the input/output of model.
        optional: Flag to mark if input is optional.
    """

    shape: tuple
    dtype: Union[np.dtype, Type[np.dtype], Type[object]]
    name: Optional[str] = None
    optional: Optional[bool] = False

    def __post_init__(self):
        """Override object values on post init or field override."""
        if isinstance(self.dtype, np.dtype):
            object.__setattr__(self, "dtype", self.dtype.type)  # pytype: disable=attribute-error
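A small sketch of the dtype normalization performed by __post_init__ above: np.dtype instances are collapsed to their scalar type, so the two constructions below are assumed to end up equivalent.

    # Hedged sketch: Tensor normalizes np.dtype instances to their scalar type.
    import numpy as np

    from pytriton.model_config import Tensor

    a = Tensor(dtype=np.float32, shape=(-1,))
    b = Tensor(dtype=np.dtype("float32"), shape=(-1,))
    assert a.dtype is b.dtype  # both are np.float32 after normalization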
stf/stf-api-alternative/pytriton/build/lib/pytriton/model_config/triton_model_config.py
ADDED
@@ -0,0 +1,68 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ModelConfig related objects."""

import dataclasses
from typing import Dict, Optional, Sequence, Type, Union

import numpy as np

from .common import DeviceKind, DynamicBatcher


@dataclasses.dataclass
class ResponseCache:
    """Model response cache configuration.

    More in Triton Inference Server [documentation]
    [documentation]: https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto#L1765
    """

    enable: bool


@dataclasses.dataclass
class TensorSpec:
    """Stores specification of single tensor. This includes name, shape and dtype."""

    name: str
    shape: tuple
    dtype: Union[Type[np.dtype], Type[object]]
    optional: Optional[bool] = False


@dataclasses.dataclass
class TritonModelConfig:
    """Triton Model Config dataclass for simplification and specialization of protobuf config generation.

    More in Triton Inference Server [documentation]
    [documentation]: https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto
    """

    model_name: str
    model_version: int = 1
    max_batch_size: int = 4
    batching: bool = True
    batcher: Optional[DynamicBatcher] = None
    instance_group: Dict[DeviceKind, Optional[int]] = dataclasses.field(default_factory=lambda: {})
    decoupled: bool = False
    backend_parameters: Dict[str, str] = dataclasses.field(default_factory=lambda: {})
    inputs: Optional[Sequence[TensorSpec]] = None
    outputs: Optional[Sequence[TensorSpec]] = None
    response_cache: Optional[ResponseCache] = None

    @property
    def backend(self) -> str:
        """Return backend parameter."""
        return "python"
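A brief sketch of the internal TritonModelConfig dataclass above; the instance-group mapping and workspace path are illustrative assumptions, and DeviceKind is imported from the sibling common module referenced by this file.

    # Hedged sketch: internal, full-fidelity config object used by the generator.
    from pytriton.model_config.common import DeviceKind
    from pytriton.model_config.triton_model_config import TritonModelConfig

    config = TritonModelConfig(
        model_name="simple",
        instance_group={DeviceKind.KIND_CPU: 2},          # two CPU instances
        backend_parameters={"workspace-path": "/tmp/ws"},  # passed to the Python backend
    )
    print(config.backend)  # always "python" for PyTriton-served models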
stf/stf-api-alternative/pytriton/build/lib/pytriton/models/__init__.py
ADDED
@@ -0,0 +1,14 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# noqa: D104
stf/stf-api-alternative/pytriton/build/lib/pytriton/models/manager.py
ADDED
@@ -0,0 +1,147 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ModelManager class.

The ModelManager is responsible for maintaining the models that have to be served on Triton Inference Server.

Examples of use:

    manager = ModelManager(triton_url)
    manager.add_model(model)

    manager.load_models()
"""

import contextlib
import json
import logging
import pathlib
import socket
from typing import Dict, Iterable, Optional, Tuple

from tritonclient.grpc import InferenceServerException

from pytriton.client import ModelClient
from pytriton.client.utils import create_client_from_url, wait_for_server_ready
from pytriton.constants import CREATE_TRITON_CLIENT_TIMEOUT_S, DEFAULT_TRITON_STARTUP_TIMEOUT_S
from pytriton.exceptions import PyTritonInvalidOperationError
from pytriton.models.model import Model

LOGGER = logging.getLogger(__name__)


class ModelManager:
    """ModelManager class for maintaining Triton models."""

    def __init__(
        self,
        triton_url: str,
        model_store_path: Optional[pathlib.Path] = None,
    ):
        """Create ModelManager object.

        Args:
            triton_url: Triton server URL
            model_store_path: Path to local model store
        """
        self._triton_url = triton_url
        self._models: Dict[Tuple[str, int], Model] = {}
        self._model_store_path = model_store_path

    @property
    def models(self) -> Iterable[Model]:
        """List models added to manage.

        Returns:
            List with models added to ModelManager.
        """
        return self._models.values()

    def add_model(self, model: Model, load_model: bool = False) -> None:
        """Add model to manage.

        Args:
            model: Model instance
            load_model: If True, model will be loaded to Triton server.
        """
        key = self._format_key(model)
        if key in self._models:
            raise PyTritonInvalidOperationError("Cannot add model with the same name twice.")

        LOGGER.debug(f"Adding {model.model_name} ({model.model_version}) to registry under {key}.")
        self._models[key] = model

        _is_model_store_local = self._model_store_path is not None
        if _is_model_store_local:
            model.generate_model(self._model_store_path)

        if load_model:
            self._load_model(model, _is_model_store_local)
            model.setup()

    def load_models(self) -> None:
        """Load bound models to Triton server and setup loaded models."""
        for model in self._models.values():
            if not model.is_alive():
                self._load_model(model)
                model.setup()

    def setup_models(self) -> None:
        """Setup loaded models."""
        for model in self._models.values():
            if not model.is_alive():
                model.setup()

    def clean(self) -> None:
        """Clean the model and internal registry."""
        with contextlib.closing(
            create_client_from_url(self._triton_url, network_timeout_s=CREATE_TRITON_CLIENT_TIMEOUT_S)
        ) as client:
            server_live = False
            try:
                server_live = client.is_server_live()
            # TimeoutError and ConnectionRefusedError are derived from OSError so they are redundant here
            # OSError is raised from gevent/_socketcommon.py:590 sometimes, when server is not ready
            except (socket.timeout, OSError, InferenceServerException):
                pass
            except Exception as ex:
                LOGGER.error(f"Unexpected exception during server live check: {ex}")
                raise ex

            for name, model in self._models.items():
                LOGGER.debug(f"Clean model {name}.")
                model.clean()
                if server_live:
                    client.unload_model(model.model_name)

            if server_live:
                # after unload there is a short period of time when server is not ready
                wait_for_server_ready(client, timeout_s=DEFAULT_TRITON_STARTUP_TIMEOUT_S)

        self._models.clear()

    def _format_key(self, model: Model) -> Tuple[str, int]:
        key = (model.model_name.lower(), model.model_version)
        return key

    def _load_model(self, model: Model, local_model_store=False):
        """Prepare model config and required files dict and load model to Triton server."""
        LOGGER.debug(f"Creating model {model.model_name} with version {model.model_version}.")
        config = None if local_model_store else json.dumps(model.get_model_config())
        files = None if local_model_store else model.get_proxy_model_files()
        with ModelClient(
            url=self._triton_url, model_name=model.model_name, model_version=str(model.model_version)
        ) as client:
            client.wait_for_server(timeout_s=DEFAULT_TRITON_STARTUP_TIMEOUT_S)
            client.load_model(config=config, files=files)
        LOGGER.debug("Done.")
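A hedged sketch of how the internal ModelManager above is driven; the Triton URL is an illustrative assumption, and the Model instance itself is normally constructed by PyTriton's Triton bindings rather than by hand, so it is left commented out.

    # Hedged sketch (internal API): register, load, and later clean up models.
    from pytriton.models.manager import ModelManager

    manager = ModelManager(triton_url="grpc://127.0.0.1:8001")
    # model = Model(...)                        # built by PyTriton from user inputs/outputs
    # manager.add_model(model, load_model=True)  # generates config, loads it into Triton
    # ... serve traffic ...
    manager.clean()                              # unloads models and clears the registry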
stf/stf-api-alternative/pytriton/build/lib/pytriton/models/model.py
ADDED
@@ -0,0 +1,335 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model base class."""

import base64
import copy
import enum
import json
import logging
import os
import pathlib
import shutil
import threading
import typing
from typing import Callable, List, Optional, Sequence, Union

from pytriton.decorators import TritonContext
from pytriton.exceptions import PyTritonValidationError
from pytriton.model_config.generator import ModelConfigGenerator
from pytriton.model_config.model_config import ModelConfig
from pytriton.model_config.tensor import Tensor
from pytriton.model_config.triton_model_config import DeviceKind, ResponseCache, TensorSpec, TritonModelConfig
from pytriton.proxy.communication import get_config_from_handshake_server
from pytriton.proxy.data import Base64SerializerDeserializer, TensorStoreSerializerDeserializer
from pytriton.proxy.inference import InferenceHandler, InferenceHandlerEvent, RequestsResponsesConnector
from pytriton.proxy.validators import TritonResultsValidator
from pytriton.utils.workspace import Workspace

LOGGER = logging.getLogger(__name__)


class ModelEvent(enum.Enum):
    """Represents model event."""

    RUNTIME_TERMINATING = "runtime-terminating"
    RUNTIME_TERMINATED = "runtime-terminated"


ModelEventsHandler = typing.Callable[["Model", ModelEvent, typing.Optional[typing.Any]], None]


def _inject_triton_context(triton_context: TritonContext, model_callable: Callable) -> Callable:
    """Inject triton context into callable.

    Args:
        triton_context: Triton context
        model_callable: Callable to inject triton context

    Returns:
        Callable with injected triton context
    """
    if hasattr(model_callable, "__self__"):
        model_callable.__self__.__triton_context__ = triton_context
    else:
        model_callable.__triton_context__ = triton_context
    return model_callable


class Model:
    """Model definition."""

    SCRIPT_FILES_TO_COPY = ["communication.py", "data.py", "model.py", "types.py", "telemetry.py"]

    def __init__(
        self,
        model_name: str,
        model_version: int,
        inference_fn: Union[Callable, Sequence[Callable]],
        inputs: Sequence[Tensor],
        outputs: Sequence[Tensor],
        config: ModelConfig,
        workspace: Workspace,
        triton_context: TritonContext,
        strict: bool,
        trace_config: Optional[List[str]] = None,
    ):
        """Create Python model with required data.

        Args:
            model_name: Model name
            model_version: Model version
            inference_fn: Inference handler (function or lambda)
            inputs: Model inputs definition
            outputs: Model outputs definition
            config: model configuration parameters
            workspace: workspace for storing artifacts
            triton_context: Triton context
            strict: Enable strict validation of model outputs
            trace_config: List of trace config parameters

        Raises:
            PyTritonValidationError: if one or more of the provided values are incorrect.
        """
        self.triton_context = triton_context
        self.model_name = model_name
        self.model_version = model_version
        self._inference_handlers_lock = threading.Lock()
        self._inference_handlers = []
        self._requests_respones_connectors = []
        self._observers_lock = threading.Lock()
        self._strict = strict
        self._trace_config = trace_config

        self.infer_functions = [inference_fn] if isinstance(inference_fn, Callable) else inference_fn
        if not isinstance(self.infer_functions, (Sequence, Callable)):
            raise PyTritonValidationError("inference_fn has to be either callable or sequence of callables")

        self.inputs = inputs
        self.outputs = outputs

        if any(output.optional for output in self.outputs):
            raise PyTritonValidationError("Output tensors cannot be optional.")

        self.config = config
        self._workspace = workspace
        if os.environ.get("PYTRITON_NO_TENSORSTORE"):
            self._serializer_deserializer = Base64SerializerDeserializer()
        else:
            self._serializer_deserializer = TensorStoreSerializerDeserializer()
        self._triton_model_config: Optional[TritonModelConfig] = None
        self._model_events_observers: typing.List[ModelEventsHandler] = []

    def get_model_config(self) -> dict:
        """Get model config.

        Returns:
            Dictionary with model config
        """
        triton_model_config = self._get_triton_model_config()
        generator = ModelConfigGenerator(config=triton_model_config)
        return generator.get_config()

    def get_proxy_model_files(self) -> typing.Dict[str, bytes]:
        """Get proxy model files.

        Returns:
            Dictionary with model files to be copied to Triton model store on server side:
                key: file path in following format - 'file:{model_version}/{file_name}'
                value: file content as bytes
        """
        proxy_model_files_dict = {}
        proxy_path = pathlib.Path(__file__).parent.parent / "proxy"
        for file_to_copy in self.SCRIPT_FILES_TO_COPY:
            src_file_path = proxy_path / file_to_copy
            with open(src_file_path, "rb") as f:
                src_file = f.read()
                proxy_model_files_dict[f"file:{self.model_version}/{file_to_copy}"] = src_file

        return proxy_model_files_dict

    def generate_model(self, model_repository: pathlib.Path) -> None:
        """Generate model and its config in the model repository.

        Args:
            model_repository: Path to Triton model repository

        Raises:
            OSError: when model repository does not exist
        """
        LOGGER.debug(
            f"Generating model and config for {self.model_name} and {self.model_version} to {model_repository}"
        )

        model_catalog = model_repository / self.model_name

        config_file_path = model_catalog / "config.pbtxt"
        if config_file_path.exists():
            LOGGER.warning(f"The config file {config_file_path} is going to be overridden.")

        triton_model_config = self._get_triton_model_config()
        generator = ModelConfigGenerator(config=triton_model_config)
        generator.to_file(config_file_path)

        model_version_catalog = model_catalog / str(self.model_version)
        model_version_catalog.mkdir(exist_ok=True, parents=True)

        proxy_path = pathlib.Path(__file__).parent.parent / "proxy"

        for script_file in self.SCRIPT_FILES_TO_COPY:
            src_file_path = proxy_path / script_file
            dst_file_path = model_version_catalog / script_file
            shutil.copy(src_file_path, dst_file_path)

    def setup(self) -> None:
        """Create deployments and bindings to Triton Inference Server."""
        with self._inference_handlers_lock:
            if not self._inference_handlers:
                triton_model_config = self._get_triton_model_config()
                workspace_path = pathlib.Path(triton_model_config.backend_parameters["workspace-path"])
                validator = TritonResultsValidator(triton_model_config, self._strict)

                inference_handler_config_path = workspace_path / f"{self.model_name}-config.sock"
                inference_handler_config = get_config_from_handshake_server(inference_handler_config_path)

                data_socket = pathlib.Path(inference_handler_config["data_socket"])
                authkey = base64.decodebytes(inference_handler_config["authkey"].encode("ascii"))
                self._serializer_deserializer.connect(data_socket.as_posix(), authkey)

                for i, infer_function in enumerate(self.infer_functions):
                    self.triton_context.model_configs[infer_function] = copy.deepcopy(triton_model_config)
                    _inject_triton_context(self.triton_context, infer_function)

                    request_server_socket = workspace_path / f"{self.model_name}_0_{i}-server.sock"
                    request_server_socket = f"ipc://{request_server_socket.as_posix()}"

                    requests_respones_connector = RequestsResponsesConnector(
                        url=request_server_socket,
                        serializer_deserializer=self._serializer_deserializer,
                    )
                    requests_respones_connector.start()
                    self._requests_respones_connectors.append(requests_respones_connector)
                    inference_handler = InferenceHandler(
                        model_callable=infer_function,
                        requests_responses_connector=requests_respones_connector,
                        validator=validator,
                        name=f"inference_handler-{i}",
                    )
                    inference_handler.on_inference_handler_event(self._on_inference_handler_event)
                    inference_handler.start()
                    self._inference_handlers.append(inference_handler)

    def clean(self) -> None:
        """Post unload actions to perform on model."""
        with self._observers_lock:
            LOGGER.debug("Clearing model events observers")
            self._model_events_observers.clear()
        LOGGER.debug("Socket closed. Waiting for inference handler and communication threads to shut down")
        with self._inference_handlers_lock:
            for inference_handler in self._inference_handlers:
                inference_handler.stop()
            for inference_handler in self._inference_handlers:
                inference_handler.join()
            self._inference_handlers.clear()
            for requests_responses_connector in self._requests_respones_connectors:
                requests_responses_connector.close()
            for requests_responses_connector in self._requests_respones_connectors:
                requests_responses_connector.join()
            self._requests_respones_connectors.clear()
            self._serializer_deserializer.close()

    def is_alive(self) -> bool:
        """Validate if model is working on Triton.

        If model is fully loaded by Triton, return True. Otherwise, perform a custom verification.

        Returns:
            True if model is working, False otherwise
        """
        with self._inference_handlers_lock:
            return (
                bool(self._inference_handlers)
                and bool(self._requests_respones_connectors)
                and all(inference_handler.is_alive() for inference_handler in self._inference_handlers)
                and all(
                    requests_responses_connector.is_alive()
                    for requests_responses_connector in self._requests_respones_connectors
                )
            )

    def _get_triton_model_config(self) -> TritonModelConfig:
        """Generate ModelConfig from descriptor and custom arguments for Python model.

        Returns:
            ModelConfig object with configuration for Python model deployment
        """
        if not self._triton_model_config:
            backend_parameters = {"workspace-path": self._workspace.path.as_posix()}
            if self._trace_config:
                backend_parameters["trace-config"] = base64.b64encode(json.dumps(self._trace_config).encode()).decode()
            triton_model_config = TritonModelConfig(
                model_name=self.model_name,
                model_version=self.model_version,
                batching=self.config.batching,
                batcher=self.config.batcher,
                max_batch_size=self.config.max_batch_size,
                decoupled=self.config.decoupled,
                backend_parameters=backend_parameters,
                instance_group={DeviceKind.KIND_CPU: len(self.infer_functions)},
            )
            inputs = []
            for idx, input_spec in enumerate(self.inputs, start=1):
                input_name = input_spec.name if input_spec.name else f"INPUT_{idx}"
                tensor = TensorSpec(
                    name=input_name, dtype=input_spec.dtype, shape=input_spec.shape, optional=input_spec.optional
                )
                inputs.append(tensor)

            outputs = []
            for idx, output_spec in enumerate(self.outputs, start=1):
                output_name = output_spec.name if output_spec.name else f"OUTPUT_{idx}"
                tensor = TensorSpec(name=output_name, dtype=output_spec.dtype, shape=output_spec.shape)
                outputs.append(tensor)

            triton_model_config.inputs = inputs
            triton_model_config.outputs = outputs

            if self.config.response_cache:
                triton_model_config.response_cache = ResponseCache(enable=True)

            self._triton_model_config = triton_model_config

        return self._triton_model_config

    def on_model_event(self, model_event_handle_fn: ModelEventsHandler):
        """Register ModelEventsHandler callable.

        Args:
            model_event_handle_fn: function to be called when model events arise
        """
        with self._observers_lock:
            self._model_events_observers.append(model_event_handle_fn)

    def _notify_model_events_observers(self, event: ModelEvent, context: typing.Any):
        with self._observers_lock:
            for model_event_handle_fn in self._model_events_observers:
                model_event_handle_fn(self, event, context)

    def _on_inference_handler_event(
        self, proxy_backend: InferenceHandler, event: InferenceHandlerEvent, context: typing.Optional[typing.Any] = None
    ):
        if event in [InferenceHandlerEvent.CLOSING, InferenceHandlerEvent.UNRECOVERABLE_ERROR]:
            self._notify_model_events_observers(ModelEvent.RUNTIME_TERMINATING, context)
        elif event == InferenceHandlerEvent.CLOSED:
            self._notify_model_events_observers(ModelEvent.RUNTIME_TERMINATED, context)
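A hedged sketch of observing the runtime events emitted by the Model class above; the handler body is illustrative, and the Model instance is assumed to be created by PyTriton's bindings, so its registration call is left commented out.

    # Hedged sketch: observe inference-handler lifecycle events on a Model.
    from pytriton.models.model import ModelEvent


    def on_event(model, event: ModelEvent, context=None):
        # Called when the inference handler is terminating or has terminated.
        print(f"{model.model_name}: {event.value} (context={context})")


    # model.on_model_event(on_event)  # `model` is a pytriton.models.model.Model instance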
stf/stf-api-alternative/pytriton/build/lib/pytriton/proxy/__init__.py
ADDED
@@ -0,0 +1,14 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# noqa: D104
stf/stf-api-alternative/pytriton/build/lib/pytriton/proxy/communication.py
ADDED
@@ -0,0 +1,555 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module handling communication between RequestsServer and RequestsServerClients."""

import asyncio
import enum
import functools
import json
import logging
import pathlib
import socket
import threading
import time
import traceback
import typing
import uuid
from concurrent.futures import Future as ConcurrentFuture

import zmq  # pytype: disable=import-error
import zmq.asyncio  # pytype: disable=import-error

LOGGER = logging.getLogger(__name__)
SERVER_LOGGER = LOGGER.getChild("server")
CLIENT_LOGGER = LOGGER.getChild("client")

_STARTUP_TIMEOUT_S = 1.0


class PyTritonResponseFlags(enum.IntFlag):
    """Response flags for PyTritonInferenceHandler."""

    EOS = enum.auto()  # End Of Stream
    ERROR = enum.auto()


class _RequestsServerState(enum.Enum):
    STOPPED = enum.auto()
    STARTING = enum.auto()
    STARTED = enum.auto()
    STOPPING = enum.auto()


def _set_current_task_name(name: str):
    current_task = asyncio.current_task()
    if current_task is not None:
        current_task.set_name(name)


_RequestScope = typing.Dict[str, typing.Any]
_HandleRequestsCoro = typing.Callable[[_RequestScope, bytes, zmq.asyncio.Socket], typing.Awaitable[typing.Any]]
HandleResponsesCoro = typing.Callable[[_RequestScope, asyncio.Queue, ConcurrentFuture], typing.Awaitable[typing.Any]]


class RequestsServer:
    """Class for serving available inference requests and passing inference responses."""

    def __init__(self, url: str, handle_responses_fn: HandleResponsesCoro):
        """Initialize RequestsServer.

        Args:
            url: url to bind socket
            handle_responses_fn: couroutine handling responses from InferenceHandler
        """
        self._url = url
        self._handle_responses_fn = handle_responses_fn
        self._state = _RequestsServerState.STOPPED
        self._state_condition = threading.Condition()
        self._shutdown_event = asyncio.Event()  # TODO: is it still required having condition?
        self._server_loop = None

        # requests_id -> results asyncio.Queue map
        self._responses_queues: typing.Dict[bytes, asyncio.Queue] = {}
        self._handle_responses_tasks: typing.Dict[bytes, asyncio.Task] = {}

    def run(self):
        """Run RequestsServer.

        It stops when handle_messages coroutine finishes.

        Raises:
            RuntimeError: if RequestsServer is already running
        """
        with self._state_condition:
            if self._state != _RequestsServerState.STOPPED:
                raise RuntimeError(f"Cannot run {type(self).__name__} as it is already running")

            self._state = _RequestsServerState.STARTING
            self._state_condition.notify_all()

        assert len(self._responses_queues) == 0
        assert len(self._handle_responses_tasks) == 0

        asyncio.run(self.handle_messages())

    @property
    def server_loop(self) -> typing.Optional[asyncio.AbstractEventLoop]:
        """Get asyncio loop for RequestsServer.

        Returns:
            asyncio.AbstractEventLoop: asyncio loop for RequestsServer or None if server is not started yet
        """
        return self._server_loop

    def wait_till_running(self):
        """Wait till RequestsServer is running.

        Raises:
            RuntimeError: if RequestsServer is shutting down or not launched yet
        """
        with self._state_condition:
            if self._state == _RequestsServerState.STARTING:
                self._state_condition.wait_for(
                    lambda: self._state == _RequestsServerState.STARTED, timeout=_STARTUP_TIMEOUT_S
                )
            elif self._state == _RequestsServerState.STOPPED:
                raise RuntimeError("Cannot push requests before RequestsServer is started")
            elif self._state == _RequestsServerState.STOPPING:
                raise RuntimeError(f"Cannot push requests while {type(self).__name__} is shutting down")

    async def handle_messages(self):
        """Coroutine for handling messages from InferenceHandler."""
        self._server_loop = asyncio.get_running_loop()
        try:
            SERVER_LOGGER.debug(f"Binding socket to url='{self._url}'")
            self._zmq_context = zmq.asyncio.Context()
            self._socket = self._zmq_context.socket(zmq.DEALER)
            self._socket.bind(self._url)
        except (TypeError, zmq.error.ZMQError) as e:
            raise ValueError(
                f"Error occurred during binding socket to url='{self._url}' (e: {e})." "RequestsServer will be closed."
            ) from e

        _set_current_task_name("handle_messages")

        with self._state_condition:
            if self._state != _RequestsServerState.STARTING:
                self._state = _RequestsServerState.STOPPED
                self._state_condition.notify_all()
                raise RuntimeError(f"Cannot start {type(self).__name__} as it is not in STARTING state")

            self._state = _RequestsServerState.STARTED
            self._state_condition.notify_all()

        def _all_responses_processed():
            return not any([self._handle_responses_tasks, self._responses_queues])

        try:
            flag_check_interval_s = 1.0
            # have to receive mssages untill all requestss to be processed, despite shutdown event is set
            while not self._shutdown_event.is_set() or not _all_responses_processed():
                requests_id = b"<unknown>"
                try:
                    requests_id, flags, responses_payload = await asyncio.wait_for(
                        self._socket.recv_multipart(), flag_check_interval_s
                    )
                    flags = int.from_bytes(flags, byteorder="big")
                    responses_queue = self._responses_queues[requests_id]
                    responses_queue.put_nowait((flags, responses_payload))  # queue have no max_size
                except asyncio.TimeoutError:
                    continue
                except KeyError:
                    SERVER_LOGGER.warning(f"Received response for unknown requests {requests_id.hex()}. Ignoring it.")
        except asyncio.CancelledError:
            SERVER_LOGGER.info("Received CancelledError")
            self._shutdown_event.set()
        finally:
            # Received all responses, close socket
            SERVER_LOGGER.debug("Closing socket")
            try:
                if self._socket is not None:
                    self._socket.close(linger=0)
                    self._socket = None
            except zmq.error.ZMQError as e:
                SERVER_LOGGER.error(f"Error occurred during closing socket (e: {e}).")

            try:
                if self._zmq_context is not None:
                    self._zmq_context.term()
                    self._zmq_context = None
            except zmq.error.ZMQError as e:
                SERVER_LOGGER.error(f"Error occurred during closing zmq context (e: {e}).")

            self._server_loop = None

            with self._state_condition:
                self._state = _RequestsServerState.STOPPED
                self._state_condition.notify_all()

            SERVER_LOGGER.debug("Socket for handle_messages task closed")
            self._shutdown_event.clear()
            SERVER_LOGGER.debug(f"Leaving handle_messages task from {type(self).__name__}")

    def shutdown(self):
        """Close RequestsServer.

        Don't wait for handle_messages coroutine to finish.
        """
        SERVER_LOGGER.debug("Closing RequestsServer")
        with self._state_condition:
            self._state = _RequestsServerState.STOPPING
            self._state_condition.notify_all()
        self._shutdown_event.set()

    async def send_requests(
        self, requests_id: bytes, requests_payload: bytes, responses_future: ConcurrentFuture
    ) -> asyncio.Task:
        """Send requests to InferenceHandler.

        Args:
            requests_id: id of requests
            requests_payload: payload of requests
            responses_future: future for waiting in another thread

        Returns:
            asyncio.Task: task handling responses from InferenceHandler

        Raises:
            RuntimeError: if RequestsServer is shutting down or requests_id is already pending
        """
        if self._shutdown_event.is_set():
            SERVER_LOGGER.debug(f"Cannot send requests while {type(self).__name__} is {self._state.name}")
            raise RuntimeError(f"Cannot send requests while {type(self).__name__} is {self._state.name}")

        if requests_id in self._responses_queues or requests_id in self._handle_responses_tasks:
            SERVER_LOGGER.debug(f"Cannot send requests with id {requests_id.hex()} as such id is already pending")
            raise RuntimeError(f"Cannot send requests with id {requests_id.hex()} as such id is already pending")

        _set_current_task_name(f"send_requests-{requests_id.hex()}")

        self._responses_queues[requests_id] = asyncio.Queue()
        scope = {"requests_id": requests_id}
        handle_responses_task = self._server_loop.create_task(
            self._handle_responses(scope, self._responses_queues[requests_id], responses_future),
            name=f"handle_responses-{requests_id.hex()}",
        )
        self._handle_responses_tasks[requests_id] = handle_responses_task

        # FIXME: check if can not copy buffers; in case copy=False send_multipart returns MessageTracker
        # https://pyzmq.readthedocs.io/en/latest/api/zmq.html#zmq.Socket.send_multipart
        # consider send_pyobject|send_serialized (but it is not multipart)

        # sending in same loop, thus thread as handle_messages
        # send_multipart doesn't return anything, as it copies requests_payload
        await self._socket.send_multipart([requests_id, requests_payload])

        return handle_responses_task

    async def _handle_responses(self, scope, responses_queue: asyncio.Queue, responses_future: ConcurrentFuture):
        """Handle responses from InferenceHandler.

        Args:
            scope: scope for handling responses
            responses_queue: queue with responses payload from InferenceHandler
            responses_future: future for waiting in another thread
        """
        requests_id = scope["requests_id"]
        try:
            return await self._handle_responses_fn(scope, responses_queue, responses_future)
        finally:
            self._responses_queues.pop(requests_id)
            self._handle_responses_tasks.pop(requests_id)


class RequestsServerClient:
    """RequestsServer client for handling requests from RequestsServer and sending back responses."""

    def __init__(self, url: str, handle_requests_fn: _HandleRequestsCoro, name: typing.Optional[str] = None):
        """Initialize RequestsServerClient.

        Args:
            url: url to connect socket
            handle_requests_fn: couroutine handling requests from InferenceHandler
            name: name of RequestsServerClient
        """
        self._shutdown_event = asyncio.Event()
        self._url = url
        self._handle_requests_fn = handle_requests_fn
        self._handle_requests_tasks: typing.Dict[bytes, asyncio.Task] = {}
        self._handle_requests_tasks_condition = asyncio.Condition()
        self._name = name or f"requests_server_client-{uuid.uuid4().hex[-4:]}"
        self._loop = None

    def run(self):
        """Run RequestsServerClient.

        It stops when handle_requests coroutine finishes.
        """
        asyncio.run(self.handle_requests())

    def shutdown(self) -> None:
        """Close RequestsServerClient.

        Don't wait for handle_requests coroutine to finish.
        """
        CLIENT_LOGGER.debug(f"Closing {type(self).__name__} {self._name}")
        self._shutdown_event.set()

    async def handle_requests(self):
        """Coroutine for handling requests from RequestsServer."""
        name = self._name
        _set_current_task_name(name)

        zmq_context = None
        socket = None
        self._loop = asyncio.get_running_loop()
        try:
            CLIENT_LOGGER.debug(f"Connecting {name} to server listening on {self._url}")
            zmq_context = zmq.asyncio.Context()
            socket = zmq_context.socket(zmq.DEALER)
            socket.connect(self._url)

            send = functools.partial(self._send, socket)

            flag_check_interval_s = 1.0
            while True:
                try:
                    requests_id, requests_payloads = await asyncio.wait_for(
                        socket.recv_multipart(), flag_check_interval_s
                    )
                    scope = {"requests_id": requests_id}
                    CLIENT_LOGGER.debug(f"{requests_id.hex()} received requests")
                    handle_requests_task = self._loop.create_task(self._handle_requests(scope, requests_payloads, send))
                    self._handle_requests_tasks[requests_id] = handle_requests_task
                    handle_requests_task.set_name(f"handle_requests-{requests_id.hex()}")
                except asyncio.TimeoutError:
                    if self._shutdown_event.is_set():
                        break
                    continue

            CLIENT_LOGGER.debug("Waiting for handle_requests tasks to finish")
            async with self._handle_requests_tasks_condition:
                await self._handle_requests_tasks_condition.wait_for(lambda: len(self._handle_requests_tasks) == 0)
            CLIENT_LOGGER.debug("All handle_requests tasks finished")

        except zmq.error.ZMQError:
            CLIENT_LOGGER.exception(
                "Connection error occurred during reading requests. " f"{type(self).__name__} will be closed."
            )
            self._shutdown_event.set()
        except Exception:
            CLIENT_LOGGER.exception(f"Internal {type(self).__name__}. " f"{type(self).__name__} will be closed.")
            self._shutdown_event.set()
        finally:
            try:
                socket_close_timeout_ms = 0  # immediate close (drop not sent messages)
                if socket is not None:
                    socket.close(linger=socket_close_timeout_ms)
            except zmq.error.ZMQError as e:
                CLIENT_LOGGER.error(f"Error occurred during closing socket (e: {e}).")

            try:
                if zmq_context is not None:
                    zmq_context.term()
            except zmq.error.ZMQError as e:
                CLIENT_LOGGER.error(f"Error occurred during closing zmq context (e: {e}).")

            CLIENT_LOGGER.debug(f"Socket for {name} closed")
            self._shutdown_event.clear()
            self._loop = None
            CLIENT_LOGGER.debug(f"Leaving {name}")

    @property
    def name(self) -> str:
        """Get name of RequestsServerClient.

        Returns:
            name of RequestsServerClient
        """
        return self._name

    @property
    def loop(self) -> asyncio.AbstractEventLoop:
        """Get asyncio loop for RequestsServerClient.

        Returns:
            asyncio.AbstractEventLoop: asyncio loop for RequestsServerClient
        """
        return self._loop

    async def _handle_requests(self, scope, requests_payload, send):
        try:
            await self._handle_requests_fn(scope, requests_payload, send)
        # except PyTritonUnrecoverableError:
        #     error = traceback.format_exc()
        #     responses = InferenceHandlerResponses(error=error)
        #     CLIENT_LOGGER.error(
        #         "Unrecoverable error thrown during calling model callable. "
        #         "Shutting down Triton Inference Server. "
        #         f"{error}"
        #     )
        #     self.stopped = True
        #     self._notify_proxy_backend_observers(InferenceHandlerEvent.UNRECOVERABLE_ERROR, error)
        #     CLIENT_LOGGER.debug(f"Send response to proxy model for {model_name}.")
        #     send(responses.as_bytes())
        except Exception:
            error = traceback.format_exc()
            flags = PyTritonResponseFlags.ERROR | PyTritonResponseFlags.EOS
            await send(scope, flags, error.encode())
            CLIENT_LOGGER.error(f"Error occurred during handling requests {scope['requests_id'].hex()}\n{error}")
        finally:
            async with self._handle_requests_tasks_condition:
                self._handle_requests_tasks.pop(scope["requests_id"], None)
                self._handle_requests_tasks_condition.notify()
            CLIENT_LOGGER.debug(f"Finished handling requests {scope['requests_id'].hex()}")

    async def _send(self, socket, scope, flags, requests_payload):
        """Send requests to RequestsServer.

        Args:
            socket: socket for sending requests
            scope: scope for sending requests
            flags: flags for sending requests
            requests_payload: payload of requests
        """
        flags = flags.to_bytes(1, "big")
        await socket.send_multipart([scope["requests_id"], flags, requests_payload])


class HandshakeServer(threading.Thread):
    """Handshake server for passing config."""

    def __init__(self, socket_path: pathlib.Path, inference_handler_config) -> None:
        """Initialize HandshakeServer.

        Args:
            socket_path: path to socket
            inference_handler_config: config for InferenceHandler
        """
        super().__init__(daemon=True, name="handshake-server")
        self._socket_path = socket_path
        try:
            self._config_payload = json.dumps(inference_handler_config).encode()
        except TypeError:
            raise ValueError(f"InferenceHandler config is not serializable: {inference_handler_config}") from None

        self._server = None
        self._error_from_thread = None

    def start(self):
        """Start HandshakeServer.

        Raises:
            RuntimeError: if HandshakeServer is already running or error occurred during starting
        """
        if self._server:
            raise RuntimeError("HandshakeServer is already running")

        super().start()
        while self._server is None and not self._error_from_thread:
            time.sleep(0.001)
        if self._error_from_thread is not None:
            raise self._error_from_thread

    def run(self):
        """Run HandshakeServer."""
        asyncio.run(self._run())

    async def _run(self):
        try:
            self._server = await asyncio.start_unix_server(self._handle_request, self._socket_path)
            async with self._server:
                try:
                    await self._server.serve_forever()
                except asyncio.CancelledError:
                    pass
        except Exception as e:
            SERVER_LOGGER.error(f"Error occurred during running handshake server (e: {e})")
            self._error_from_thread = e

    def close(self):
        """Close HandshakeServer."""
        loop = self._server.get_loop()
        loop_tasks = asyncio.all_tasks(loop=loop)
        for task in loop_tasks:
            loop.call_soon_threadsafe(task.cancel)

        self.join()
        SERVER_LOGGER.debug("Closed handshake server")

    async def _handle_request(self, reader, writer):
        peername = writer.get_extra_info("peername")
        try:
            request_name = await asyncio.wait_for(reader.readuntil(b"\n"), timeout=1.0)

            if request_name == b"get_config\n":
                writer.write(len(self._config_payload).to_bytes(4, "big"))
                writer.write(self._config_payload)
                await writer.drain()
            else:
                SERVER_LOGGER.warning(f"Unknown request {request_name} from {peername}")

        except asyncio.TimeoutError:
            SERVER_LOGGER.debug(f"Timeout occurred during handling request from {peername}")
        except Exception as e:
            SERVER_LOGGER.error(f"Error occurred during handling request from {peername} (e: {e})")
        finally:
            writer.close()
            await writer.wait_closed()


def get_config_from_handshake_server(socket_path: pathlib.Path, timeout_s: float = 1.0) -> dict:
    """Get config from handshake server.

    Args:
        socket_path: path to socket
        timeout_s: timeout for waiting for the response

    Returns:
        config from handshake server

    Raises:
        TimeoutError: if timeout occurred while waiting for the response
        ValueError: if invalid JSON response from the server
    """
    should_stop_before_s = time.time() + timeout_s
    sock = None
    try:
        LOGGER.debug(f"Waiting for config file {socket_path}")
        while not socket_path.exists() and time.time() < should_stop_before_s:
            time.sleep(0.001)

        if not socket_path.exists():
            raise TimeoutError(f"Timeout occurred while waiting for config file {socket_path}")

        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        sock.settimeout(max(0.0, should_stop_before_s - time.time()))
        sock.connect(socket_path.as_posix())
        sock.sendall(b"get_config\n")

        sock.settimeout(max(0.0, should_stop_before_s - time.time()))
        payload_size = sock.recv(4)
        payload_size = int.from_bytes(payload_size, "big")

        sock.settimeout(max(0.0, should_stop_before_s - time.time()))
        config_payload = sock.recv(payload_size)
        config = json.loads(config_payload)
        return config
    except socket.timeout as e:
        raise TimeoutError(f"Timeout occurred while waiting for config file {socket_path}") from e
    except json.JSONDecodeError as e:
        raise ValueError("Invalid JSON response from the server.") from e
    finally:
        if sock is not None:
            sock.close()
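The classes above are meant to run in separate threads (or processes) and exchange frames over a ZeroMQ DEALER/DEALER pair. As a rough orientation only, the sketch below wires a RequestsServer and a RequestsServerClient together over an ipc:// socket and pushes a single request through. It is not part of the uploaded files; the handler names (collect_responses, echo_handler), the socket path, and the request id are made up for illustration, and error handling is trimmed to the minimum.

import asyncio
import threading
import time
from concurrent.futures import Future as ConcurrentFuture

from pytriton.proxy.communication import PyTritonResponseFlags, RequestsServer, RequestsServerClient

URL = "ipc:///tmp/pytriton-demo.sock"  # hypothetical socket path


async def collect_responses(scope, queue: asyncio.Queue, future: ConcurrentFuture):
    # Drain the responses queue until the EOS flag arrives, then resolve the future.
    chunks = []
    while True:
        flags, payload = await queue.get()
        if flags & PyTritonResponseFlags.ERROR:
            future.set_exception(RuntimeError(payload.decode()))
            return
        chunks.append(payload)
        if flags & PyTritonResponseFlags.EOS:
            break
    future.set_result(b"".join(chunks))


async def echo_handler(scope, requests_payload: bytes, send):
    # Stand-in for real inference: return the request payload as a single, final response.
    await send(scope, PyTritonResponseFlags.EOS, requests_payload)


server = RequestsServer(URL, collect_responses)
client = RequestsServerClient(URL, echo_handler)
threading.Thread(target=server.run, daemon=True).start()
threading.Thread(target=client.run, daemon=True).start()

for _ in range(50):  # give the server thread time to reach the STARTED state
    try:
        server.wait_till_running()
        break
    except RuntimeError:
        time.sleep(0.05)

result = ConcurrentFuture()
# send_requests must run on the server's own loop; schedule it from this thread.
asyncio.run_coroutine_threadsafe(server.send_requests(b"req-1", b"hello", result), server.server_loop)
print(result.result(timeout=5.0))  # expected: b"hello"

client.shutdown()
server.shutdown()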
stf/stf-api-alternative/pytriton/build/lib/pytriton/proxy/data.py
ADDED
@@ -0,0 +1,1133 @@
1 |
+
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
"""Communication utility module.
|
15 |
+
|
16 |
+
It is used for interaction between model and proxy_backend.
|
17 |
+
"""
|
18 |
+
|
19 |
+
import abc
|
20 |
+
import atexit
|
21 |
+
import base64
|
22 |
+
import ctypes
|
23 |
+
import ctypes.util
|
24 |
+
import dataclasses
|
25 |
+
import fcntl
|
26 |
+
import gc
|
27 |
+
import json
|
28 |
+
import logging
|
29 |
+
import math
|
30 |
+
import multiprocessing.managers
|
31 |
+
import multiprocessing.popen_spawn_posix
|
32 |
+
import multiprocessing.shared_memory
|
33 |
+
import os
|
34 |
+
import pathlib
|
35 |
+
import signal
|
36 |
+
import struct
|
37 |
+
import threading
|
38 |
+
import time
|
39 |
+
import uuid
|
40 |
+
import weakref
|
41 |
+
from typing import Dict, List, Literal, Optional, Sequence, Tuple, Union
|
42 |
+
|
43 |
+
import numpy as np
|
44 |
+
|
45 |
+
from .telemetry import get_span_dict, start_span_from_remote
|
46 |
+
from .types import Request, Requests, Response, Responses
|
47 |
+
|
48 |
+
LOGGER = logging.getLogger(__name__)
|
49 |
+
|
50 |
+
PROTOCOL_VERSION = "3"
|
51 |
+
|
52 |
+
|
53 |
+
# copy from
|
54 |
+
# https://github.com/triton-inference-server/python_backend/blob/main/src/resources/triton_python_backend_utils.py
|
55 |
+
|
56 |
+
|
57 |
+
def _serialize_byte_tensor(tensor) -> bytes:
|
58 |
+
"""Serializes a bytes tensor into a flat numpy array of length prepended bytes.
|
59 |
+
|
60 |
+
The numpy array should use dtype of np.object_. For np.bytes_,
|
61 |
+
numpy will remove trailing zeros at the end of byte sequence and because
|
62 |
+
of this it should be avoided.
|
63 |
+
|
64 |
+
Args:
|
65 |
+
tensor: The bytes tensor to serialize.
|
66 |
+
|
67 |
+
Returns:
|
68 |
+
serialized array as bytes buffer.
|
69 |
+
|
70 |
+
Raises:
|
71 |
+
UnicodeEncodeErrors: raised when try to cast to string of non-bytes items fails
|
72 |
+
"""
|
73 |
+
if tensor.size == 0:
|
74 |
+
return b""
|
75 |
+
|
76 |
+
# If the input is a tensor of string/bytes objects, then must flatten those
|
77 |
+
# into a 1-dimensional array containing the 4-byte byte size followed by the
|
78 |
+
# actual element bytes. All elements are concatenated together in "C" order.
|
79 |
+
assert (tensor.dtype == np.object_) or (tensor.dtype.type == np.bytes_)
|
80 |
+
flattened_ls = []
|
81 |
+
total_len = 0
|
82 |
+
for obj in np.nditer(tensor, flags=["refs_ok"], order="C"):
|
83 |
+
# If directly passing bytes to BYTES type,
|
84 |
+
# don't convert it to str as Python will encode the
|
85 |
+
# bytes which may distort the meaning
|
86 |
+
if tensor.dtype == np.object_ and not isinstance(obj.item(), bytes):
|
87 |
+
s = str(obj.item()).encode("utf-8")
|
88 |
+
else:
|
89 |
+
s = obj.item()
|
90 |
+
item_len = len(s)
|
91 |
+
flattened_ls.append(struct.pack("<I", item_len))
|
92 |
+
flattened_ls.append(s)
|
93 |
+
total_len += struct.calcsize("<I") + item_len
|
94 |
+
flattened_ls.insert(0, struct.pack("<I", total_len))
|
95 |
+
flattened = b"".join(flattened_ls)
|
96 |
+
return flattened
|
97 |
+
|
98 |
+
|
99 |
+
# copy from
|
100 |
+
# https://github.com/triton-inference-server/python_backend/blob/main/src/resources/triton_python_backend_utils.py
|
101 |
+
def _deserialize_bytes_tensor(encoded_tensor, dtype, order: Literal["C", "F"] = "C") -> np.ndarray:
|
102 |
+
"""Deserializes an encoded bytes tensor into an numpy array of dtype of python objects.
|
103 |
+
|
104 |
+
Args:
|
105 |
+
encoded_tensor : The encoded bytes tensor where each element has its length in
|
106 |
+
first 4 bytes followed by the content
|
107 |
+
dtype: The dtype of the numpy array to deserialize to.
|
108 |
+
order: The order of the numpy array to deserialize to.
|
109 |
+
|
110 |
+
Returns:
|
111 |
+
The 1-D numpy array of type object containing the deserialized bytes in 'C' order.
|
112 |
+
"""
|
113 |
+
strs = []
|
114 |
+
offset = 0
|
115 |
+
val_buf = encoded_tensor
|
116 |
+
val_len = struct.unpack_from("<I", val_buf, offset)[0] + 4
|
117 |
+
offset += 4
|
118 |
+
while offset < val_len:
|
119 |
+
item_length = struct.unpack_from("<I", val_buf, offset)[0]
|
120 |
+
offset += 4
|
121 |
+
item = struct.unpack_from(f"<{item_length}s", val_buf, offset)[0]
|
122 |
+
offset += item_length
|
123 |
+
strs.append(item)
|
124 |
+
return np.array(strs, dtype=dtype, order=order)
|
125 |
+
|
126 |
+
|
127 |
+
_MAX_DTYPE_DESCR = 16 # up to 16 chars in dtype descr; |S2147483647 (2^31-1) with margin
|
128 |
+
_PARTIAL_HEADER_FORMAT = f"<{_MAX_DTYPE_DESCR}scH"
|
129 |
+
|
130 |
+
|
131 |
+
def _pack_header(shape: Tuple[int, ...], dtype: np.dtype, order: Literal["C", "F"] = "C") -> bytes:
|
132 |
+
header_format = _PARTIAL_HEADER_FORMAT + "Q" * len(shape)
|
133 |
+
dtype_descr = np.lib.format.dtype_to_descr(dtype)
|
134 |
+
assert (
|
135 |
+
len(dtype_descr) <= _MAX_DTYPE_DESCR
|
136 |
+
), f"dtype descr is too long; dtype_descr={dtype_descr} max={_MAX_DTYPE_DESCR}"
|
137 |
+
return struct.pack(header_format, dtype_descr.encode("utf-8"), order.encode("ascii"), len(shape), *shape)
|
138 |
+
|
139 |
+
|
140 |
+
def _unpack_header(header: bytes) -> Tuple[Tuple[int, ...], np.dtype, Literal["C", "F"]]:
|
141 |
+
shape_offset = struct.calcsize(_PARTIAL_HEADER_FORMAT)
|
142 |
+
dtype_descr, order, ndim = struct.unpack_from(_PARTIAL_HEADER_FORMAT, header, offset=0)
|
143 |
+
shape = struct.unpack_from("Q" * ndim, header, offset=shape_offset)
|
144 |
+
dtype = np.lib.format.descr_to_dtype(dtype_descr.decode("utf-8").rstrip("\x00"))
|
145 |
+
order = order.decode("ascii")
|
146 |
+
return shape, dtype, order
|
147 |
+
|
148 |
+
|
149 |
+
def serialize_numpy_with_struct_header(tensor: np.ndarray) -> List[Union[bytes, memoryview]]:
|
150 |
+
"""Serialize numpy array to list of bytes and memoryviews.
|
151 |
+
|
152 |
+
Args:
|
153 |
+
tensor: numpy array to serialize
|
154 |
+
|
155 |
+
Returns:
|
156 |
+
List of data frames in form of bytes and memoryviews
|
157 |
+
"""
|
158 |
+
if tensor.dtype.hasobject:
|
159 |
+
data = _serialize_byte_tensor(tensor.ravel())
|
160 |
+
order = "C" # as _serialize_byte_tensor returns C-ordered array
|
161 |
+
else:
|
162 |
+
if not tensor.data.contiguous:
|
163 |
+
tensor = np.ascontiguousarray(tensor)
|
164 |
+
data = tensor.data
|
165 |
+
order = "C" if tensor.flags.c_contiguous else "F"
|
166 |
+
|
167 |
+
header = _pack_header(tensor.shape, tensor.dtype, order)
|
168 |
+
frames = [header, data]
|
169 |
+
return frames
|
170 |
+
|
171 |
+
|
172 |
+
def deserialize_numpy_with_struct_header(frames: List[Union[bytes, memoryview]]) -> np.ndarray:
|
173 |
+
"""Deserialize numpy array from list of bytes and memoryviews.
|
174 |
+
|
175 |
+
Args:
|
176 |
+
frames: List of data frames in form of bytes and memoryviews
|
177 |
+
|
178 |
+
Returns:
|
179 |
+
numpy array
|
180 |
+
"""
|
181 |
+
header, data = frames
|
182 |
+
shape, dtype, order = _unpack_header(header)
|
183 |
+
if dtype.hasobject:
|
184 |
+
tensor = _deserialize_bytes_tensor(data, dtype).reshape(shape)
|
185 |
+
else:
|
186 |
+
tensor = np.ndarray(shape, dtype=dtype, buffer=data, order=order)
|
187 |
+
return tensor
|
188 |
+
|
189 |
+
|
190 |
+
def calc_serialized_size_of_numpy_with_struct_header(tensor: np.ndarray) -> List[int]:
|
191 |
+
"""Calculate size of serialized numpy array.
|
192 |
+
|
193 |
+
Args:
|
194 |
+
tensor: numpy array to serialize
|
195 |
+
|
196 |
+
Returns:
|
197 |
+
List of sizes of data frames
|
198 |
+
"""
|
199 |
+
header_size = struct.calcsize(_PARTIAL_HEADER_FORMAT) + struct.calcsize("Q") * len(tensor.shape)
|
200 |
+
if tensor.dtype.hasobject:
|
201 |
+
items_sizes = []
|
202 |
+
order = "C" if tensor.flags.c_contiguous else "F"
|
203 |
+
for obj in np.nditer(tensor, flags=["refs_ok"], order=order):
|
204 |
+
if tensor.dtype == np.object_ and not isinstance(obj.item(), bytes):
|
205 |
+
s = str(obj.item()).encode("utf-8")
|
206 |
+
else:
|
207 |
+
s = obj.item()
|
208 |
+
items_sizes.append(len(s))
|
209 |
+
|
210 |
+
# total_size + for size of each item + each item
|
211 |
+
data_size = struct.calcsize("<I") + struct.calcsize("<I") * len(items_sizes) + sum(items_sizes)
|
212 |
+
else:
|
213 |
+
data_size = tensor.nbytes
|
214 |
+
|
215 |
+
return [header_size, data_size]
|
216 |
+
|
217 |
+
|
218 |
+
@dataclasses.dataclass
|
219 |
+
class BlockDescriptor:
|
220 |
+
"""Descriptor of block in shared memory."""
|
221 |
+
|
222 |
+
shm_name: str
|
223 |
+
offset: int
|
224 |
+
size: Optional[int] = None
|
225 |
+
|
226 |
+
def __post_init__(self):
|
227 |
+
"""Initialize other attributes."""
|
228 |
+
self.id = f"{self.shm_name}:{self.offset}"
|
229 |
+
|
230 |
+
@classmethod
|
231 |
+
def from_id(cls, tensor_id: str):
|
232 |
+
"""Create BlockDescriptor from dict."""
|
233 |
+
shm_name, offset = tensor_id.split(":")
|
234 |
+
return cls(shm_name, int(offset))
|
235 |
+
|
236 |
+
|
237 |
+
class _SharedMemorySegment:
|
238 |
+
def __init__(self, size):
|
239 |
+
self.shared_memory = multiprocessing.shared_memory.SharedMemory(create=True, size=size)
|
240 |
+
multiprocessing.util.debug(f"Created {self.shared_memory.name} of size {self.shared_memory.size}")
|
241 |
+
self.used_blocks: List[BlockDescriptor] = []
|
242 |
+
self.used_blocks_lock = threading.RLock()
|
243 |
+
self.free_blocks = [BlockDescriptor(self.shared_memory.name, offset=0, size=size)]
|
244 |
+
self.max_free_block_size = size
|
245 |
+
|
246 |
+
def _update_free_blocks(self):
|
247 |
+
total_size = self.shared_memory.size
|
248 |
+
free_blocks = []
|
249 |
+
offset = 0
|
250 |
+
|
251 |
+
with self.used_blocks_lock:
|
252 |
+
# find holes between used blocks
|
253 |
+
for used_block in self.used_blocks:
|
254 |
+
if used_block.offset > offset:
|
255 |
+
free_blocks.append(
|
256 |
+
BlockDescriptor(self.shared_memory.name, offset=offset, size=used_block.offset - offset)
|
257 |
+
)
|
258 |
+
offset = used_block.offset + used_block.size
|
259 |
+
# if tail is free
|
260 |
+
if offset < total_size:
|
261 |
+
free_blocks.append(BlockDescriptor(self.shared_memory.name, offset=offset, size=total_size - offset))
|
262 |
+
|
263 |
+
self.free_blocks = free_blocks
|
264 |
+
self.max_free_block_size = max(block.size for block in self.free_blocks) if self.free_blocks else 0
|
265 |
+
|
266 |
+
def __contains__(self, block_id: str) -> bool:
|
267 |
+
with self.used_blocks_lock:
|
268 |
+
return any(block_id == block.id for block in self.used_blocks) # pytype: disable=attribute-error
|
269 |
+
|
270 |
+
def __getitem__(self, block_id: str) -> BlockDescriptor:
|
271 |
+
with self.used_blocks_lock:
|
272 |
+
for block in self.used_blocks:
|
273 |
+
if block.id == block_id: # pytype: disable=attribute-error
|
274 |
+
return block
|
275 |
+
raise KeyError(f"Block with id {block_id} not found in segment {self.shared_memory.name}")
|
276 |
+
|
277 |
+
def allocate(self, offset, byte_size):
|
278 |
+
block = BlockDescriptor(self.shared_memory.name, offset=offset, size=byte_size)
|
279 |
+
with self.used_blocks_lock:
|
280 |
+
self.used_blocks.append(block)
|
281 |
+
self.used_blocks.sort(key=lambda block: block.offset)
|
282 |
+
self._update_free_blocks()
|
283 |
+
return block
|
284 |
+
|
285 |
+
def release(self, block: BlockDescriptor):
|
286 |
+
with self.used_blocks_lock:
|
287 |
+
self.used_blocks.remove(block)
|
288 |
+
self._update_free_blocks()
|
289 |
+
|
290 |
+
|
291 |
+
class _DataBlocksServer:
|
292 |
+
_instance = None
|
293 |
+
_cnt = 0
|
294 |
+
_minimal_segment_size = 4096 # 4KB
|
295 |
+
|
296 |
+
def __new__(cls):
|
297 |
+
if cls._instance is None:
|
298 |
+
cls._instance = super().__new__(cls)
|
299 |
+
return cls._instance
|
300 |
+
|
301 |
+
def __init__(self):
|
302 |
+
# WAR: for some reason, the __init__ is called on each create of proxy object
|
303 |
+
if self._cnt == 1:
|
304 |
+
return
|
305 |
+
self._cnt += 1
|
306 |
+
self._id = uuid.uuid4() # to verify that it is singleton across processes
|
307 |
+
self._segments = []
|
308 |
+
self._segments_lock = threading.RLock()
|
309 |
+
atexit.register(self.close)
|
310 |
+
|
311 |
+
def get_free_blocks(self, bytes_sizes: Sequence[int]) -> Sequence[str]:
|
312 |
+
tensors_ids = []
|
313 |
+
with self._segments_lock:
|
314 |
+
for byte_size in bytes_sizes:
|
315 |
+
for segment in self._segments:
|
316 |
+
if segment.max_free_block_size >= byte_size:
|
317 |
+
for free_block in segment.free_blocks:
|
318 |
+
if free_block.size >= byte_size:
|
319 |
+
block = self._allocate_block(segment, free_block.offset, byte_size)
|
320 |
+
tensors_ids.append(block.id) # pytype: disable=attribute-error
|
321 |
+
break
|
322 |
+
else:
|
323 |
+
continue # If no suitable block was found, try the next segment
|
324 |
+
break # If a suitable block was found, don't try any more segments
|
325 |
+
else: # If no suitable block was found in any segment
|
326 |
+
new_segment_size = int(
|
327 |
+
max(self._minimal_segment_size, math.pow(2, math.ceil(math.log2(byte_size))))
|
328 |
+
)
|
329 |
+
block = self._allocate_block(
|
330 |
+
self._create_new_segment(new_segment_size), offset=0, byte_size=byte_size
|
331 |
+
)
|
332 |
+
tensors_ids.append(block.id) # pytype: disable=attribute-error
|
333 |
+
return tensors_ids
|
334 |
+
|
335 |
+
def release_block(self, block_id: str):
|
336 |
+
with self._segments_lock:
|
337 |
+
for segment in self._segments:
|
338 |
+
try:
|
339 |
+
block = segment[block_id]
|
340 |
+
segment.release(block)
|
341 |
+
return
|
342 |
+
except KeyError:
|
343 |
+
pass
|
344 |
+
raise KeyError(f"Block with id {block_id} not found in server")
|
345 |
+
|
346 |
+
def _allocate_block(self, segment: _SharedMemorySegment, offset: int, byte_size: int) -> BlockDescriptor:
|
347 |
+
return segment.allocate(offset, byte_size)
|
348 |
+
|
349 |
+
def _create_new_segment(self, segment_size):
|
350 |
+
segment = _SharedMemorySegment(segment_size)
|
351 |
+
self._segments.append(segment)
|
352 |
+
return segment
|
353 |
+
|
354 |
+
def get_debug_status(self):
|
355 |
+
return {
|
356 |
+
"server_id": str(self._id),
|
357 |
+
"host_pid": multiprocessing.current_process().pid,
|
358 |
+
"segments": [
|
359 |
+
{
|
360 |
+
"shared_memory": segment.shared_memory.name,
|
361 |
+
"used_blocks": [str(block) for block in segment.used_blocks],
|
362 |
+
}
|
363 |
+
for segment in self._segments
|
364 |
+
],
|
365 |
+
}
|
366 |
+
|
367 |
+
def close(self):
|
368 |
+
multiprocessing.util.debug(f"Closing server {self._id}")
|
369 |
+
with self._segments_lock:
|
370 |
+
while self._segments:
|
371 |
+
segment = self._segments.pop()
|
372 |
+
multiprocessing.util.debug(f"Closing and delete segment {segment.shared_memory.name}")
|
373 |
+
segment.shared_memory.close()
|
374 |
+
segment.shared_memory.unlink()
|
375 |
+
|
376 |
+
|
377 |
+
class BlocksStoreManager(multiprocessing.managers.BaseManager):
|
378 |
+
"""Remote block store for storing and retrieving numpy arrays in/from shared memory."""
|
379 |
+
|
380 |
+
@classmethod
|
381 |
+
def _run_server(cls, registry, address, authkey, serializer, writer, initializer=None, initargs=()):
|
382 |
+
PR_SET_PDEATHSIG = 1 # noqa
|
383 |
+
libc = ctypes.CDLL(ctypes.util.find_library("c"), use_errno=True)
|
384 |
+
libc.prctl(PR_SET_PDEATHSIG, signal.SIGTERM) # terminate process when parent **thread** dies
|
385 |
+
|
386 |
+
if bool(os.environ.get("PYTRITON_VIZTRACER")):
|
387 |
+
from viztracer import VizTracer # type: ignore # pytype: disable=import-error
|
388 |
+
|
389 |
+
cls._tracer = VizTracer(log_async=True, log_gc=True, tracer_entries=10000000, pid_suffix=True)
|
390 |
+
cls._tracer.register_exit()
|
391 |
+
cls._tracer.start()
|
392 |
+
|
393 |
+
super()._run_server(
|
394 |
+
registry, address, authkey, serializer, writer, initializer, initargs
|
395 |
+
) # pytype: disable=attribute-error
|
396 |
+
|
397 |
+
|
398 |
+
class _DataBlocksServerProxy(multiprocessing.managers.BaseProxy):
|
399 |
+
def release_block(self, /, *args, **kwargs):
|
400 |
+
return self._callmethod("release_block", args, kwargs)
|
401 |
+
|
402 |
+
def get_free_blocks(self, /, *args, **kwargs):
|
403 |
+
return self._callmethod("get_free_blocks", args, kwargs)
|
404 |
+
|
405 |
+
def _get_debug_status(self, /, *args, **kwargs):
|
406 |
+
return self._callmethod("get_debug_status", args, kwargs)
|
407 |
+
|
408 |
+
def close(self, /, *args, **kwargs):
|
409 |
+
return self._callmethod("close", args, kwargs)
|
410 |
+
|
411 |
+
|
412 |
+
BlocksStoreManager.register("blocks", _DataBlocksServer, proxytype=_DataBlocksServerProxy)
|
413 |
+
|
414 |
+
|
415 |
+
class _FileLock:
|
416 |
+
_locks = {}
|
417 |
+
|
418 |
+
def __new__(cls, file_path):
|
419 |
+
if file_path not in cls._locks:
|
420 |
+
cls._locks[file_path] = super().__new__(cls)
|
421 |
+
return cls._locks[file_path]
|
422 |
+
|
423 |
+
def __init__(self, file_path):
|
424 |
+
if hasattr(self, "_file_path"):
|
425 |
+
return
|
426 |
+
self._file_path = pathlib.Path(file_path)
|
427 |
+
self._file_lock = None
|
428 |
+
self._lock = threading.RLock()
|
429 |
+
atexit.register(self._clean)
|
430 |
+
|
431 |
+
def __enter__(self):
|
432 |
+
self._file_lock = self._file_path.open("a")
|
433 |
+
fcntl.flock(self._file_lock.fileno(), fcntl.LOCK_EX)
|
434 |
+
self._lock.acquire()
|
435 |
+
|
436 |
+
def __exit__(self, exc_type, exc_value, traceback):
|
437 |
+
fcntl.flock(self._file_lock.fileno(), fcntl.LOCK_UN)
|
438 |
+
self._lock.release()
|
439 |
+
|
440 |
+
def _clean(self):
|
441 |
+
if self._file_lock is not None:
|
442 |
+
self._file_lock.close()
|
443 |
+
try:
|
444 |
+
self._file_path.unlink(missing_ok=True)
|
445 |
+
except OSError as e:
|
446 |
+
LOGGER.warning(f"Could not remove lock file {self._file_path}; {e}")
|
447 |
+
|
448 |
+
|
449 |
+
class _Popen(multiprocessing.popen_spawn_posix.Popen):
|
450 |
+
def _launch(self, process_obj):
|
451 |
+
# Modified version of multiprocessing.popen_spawn_posix.Popen._launch
|
452 |
+
import io
|
453 |
+
import os
|
454 |
+
from multiprocessing import context, resource_tracker, spawn, util
|
455 |
+
|
456 |
+
tracker_fd = resource_tracker.getfd()
|
457 |
+
self._fds.append(tracker_fd) # pytype: disable=attribute-error
|
458 |
+
|
459 |
+
# get prep_data + remove init_main_from* as they are not required for TensorStore process
|
460 |
+
prep_data = spawn.get_preparation_data(process_obj._name)
|
461 |
+
prep_data.pop("init_main_from_module", None)
|
462 |
+
prep_data.pop("init_main_from_path", None)
|
463 |
+
|
464 |
+
fp = io.BytesIO()
|
465 |
+
context.set_spawning_popen(self)
|
466 |
+
try:
|
467 |
+
context.reduction.dump(prep_data, fp) # pytype: disable=module-attr
|
468 |
+
context.reduction.dump(process_obj, fp) # pytype: disable=module-attr
|
469 |
+
finally:
|
470 |
+
context.set_spawning_popen(None)
|
471 |
+
|
472 |
+
parent_r = child_w = child_r = parent_w = None
|
473 |
+
try:
|
474 |
+
parent_r, child_w = os.pipe()
|
475 |
+
child_r, parent_w = os.pipe()
|
476 |
+
cmd = spawn.get_command_line(tracker_fd=tracker_fd, pipe_handle=child_r)
|
477 |
+
self._fds.extend([child_r, child_w]) # pytype: disable=attribute-error
|
478 |
+
self.pid = util.spawnv_passfds(
|
479 |
+
spawn.get_executable(),
|
480 |
+
cmd,
|
481 |
+
self._fds, # pytype: disable=attribute-error,wrong-arg-types
|
482 |
+
)
|
483 |
+
self.sentinel = parent_r
|
484 |
+
with open(parent_w, "wb", closefd=False) as f:
|
485 |
+
f.write(fp.getbuffer())
|
486 |
+
finally:
|
487 |
+
fds_to_close = []
|
488 |
+
for fd in (parent_r, parent_w):
|
489 |
+
if fd is not None:
|
490 |
+
fds_to_close.append(fd)
|
491 |
+
self.finalizer = util.Finalize(self, util.close_fds, fds_to_close) # pytype: disable=module-attr
|
492 |
+
|
493 |
+
for fd in (child_r, child_w):
|
494 |
+
if fd is not None:
|
495 |
+
os.close(fd)
|
496 |
+
|
497 |
+
|
498 |
+
class _SpawnProcess(multiprocessing.process.BaseProcess):
|
499 |
+
_start_method = "spawn"
|
500 |
+
|
501 |
+
@staticmethod
|
502 |
+
def _Popen(process_obj): # noqa N802
|
503 |
+
return _Popen(process_obj)
|
504 |
+
|
505 |
+
|
506 |
+
class _SpawnContext(multiprocessing.context.BaseContext):
|
507 |
+
_name = "spawn"
|
508 |
+
Process = _SpawnProcess
|
509 |
+
|
510 |
+
|
511 |
+
class TensorStore:
|
512 |
+
"""Tensor store for storing and retrieving numpy arrays in/from shared memory."""
|
513 |
+
|
514 |
+
_SOCKET_EXISTANCE_CHECK_INTERVAL_S = 0.1
|
515 |
+
_instances = {}
|
516 |
+
|
517 |
+
def __new__(cls, *args, **kwargs):
|
518 |
+
"""Create TensorStore object. If object with given address already exists, return it."""
|
519 |
+
if args:
|
520 |
+
address = args[0]
|
521 |
+
elif "address" in kwargs:
|
522 |
+
address = kwargs["address"]
|
523 |
+
else:
|
524 |
+
raise TypeError("TensorStore() missing 1 required positional argument: 'address'")
|
525 |
+
|
526 |
+
address = address.as_posix() if isinstance(address, pathlib.Path) else address
|
527 |
+
|
528 |
+
if address not in cls._instances:
|
529 |
+
cls._instances[address] = super().__new__(cls)
|
530 |
+
|
531 |
+
return cls._instances[address]
|
532 |
+
|
533 |
+
def __init__(self, address: Union[str, pathlib.Path], auth_key: Optional[bytes] = None):
|
534 |
+
"""Initialize TensorStore object.
|
535 |
+
|
536 |
+
Args:
|
537 |
+
address: address of data store
|
538 |
+
auth_key: authentication key required to setup connection. If not provided, current process authkey will be used
|
539 |
+
"""
|
540 |
+
if not hasattr(self, "_remote_blocks_store_manager"):
|
541 |
+
address = address.as_posix() if isinstance(address, pathlib.Path) else address
|
542 |
+
self._remote_blocks_store_manager = BlocksStoreManager(address, authkey=auth_key, ctx=_SpawnContext())
|
543 |
+
self._remote_blocks_store = None
|
544 |
+
self._manager_start_stop_filelock = _FileLock(f"{address}.lock")
|
545 |
+
|
546 |
+
# container for keeping map between tensor_id and numpy array weak ref
|
547 |
+
self._handled_blocks: Dict[str, weakref.ReferenceType] = {}
|
548 |
+
self._handled_blocks_lock = threading.RLock()
|
549 |
+
|
550 |
+
self._shm_segments: Dict[str, multiprocessing.shared_memory.SharedMemory] = {}
|
551 |
+
self._shm_segments_lock = threading.RLock()
|
552 |
+
|
553 |
+
self.serialize = serialize_numpy_with_struct_header
|
554 |
+
self.deserialize = deserialize_numpy_with_struct_header
|
555 |
+
self._calc_serialized_tensor_size = calc_serialized_size_of_numpy_with_struct_header
|
556 |
+
|
557 |
+
@property
|
558 |
+
def address(self) -> str:
|
559 |
+
"""Return address of remote block store."""
|
560 |
+
return self._remote_blocks_store_manager.address
|
561 |
+
|
562 |
+
def start(self):
|
563 |
+
"""Start remote block store."""
|
564 |
+
with self._manager_start_stop_filelock:
|
565 |
+
if self._remote_blocks_store is not None:
|
566 |
+
raise RuntimeError("Remote block store is already started/connected")
|
567 |
+
|
568 |
+
self._remote_blocks_store_manager.start()
|
569 |
+
self._remote_blocks_store = self._remote_blocks_store_manager.blocks() # pytype: disable=attribute-error
|
570 |
+
|
571 |
+
address = pathlib.Path(self._remote_blocks_store_manager.address)
|
572 |
+
self._wait_for_address(address)
|
573 |
+
LOGGER.debug(
|
574 |
+
f"Started remote block store at {address} (pid={self._remote_blocks_store_manager._process.pid})" # pytype: disable=attribute-error
|
575 |
+
)
|
576 |
+
|
577 |
+
def connect(self, timeout_s: Optional[float] = None):
|
578 |
+
"""Connect to remote block store."""
|
579 |
+
if self._remote_blocks_store is None:
|
580 |
+
address = pathlib.Path(self._remote_blocks_store_manager.address)
|
581 |
+
|
582 |
+
self._wait_for_address(address, timeout_s)
|
583 |
+
self._remote_blocks_store_manager.connect()
|
584 |
+
self._remote_blocks_store = self._remote_blocks_store_manager.blocks() # pytype: disable=attribute-error
|
585 |
+
LOGGER.debug(f"Connected to remote block store at {address})")
|
586 |
+
else:
|
587 |
+
LOGGER.debug(f"Already connectd to remote block store at {self.address}")
|
588 |
+
|
589 |
+
def _wait_for_address(self, address, timeout_s: Optional[float] = None):
|
590 |
+
should_stop_at = time.time() + timeout_s if timeout_s is not None else None
|
591 |
+
if timeout_s is not None and self._SOCKET_EXISTANCE_CHECK_INTERVAL_S > timeout_s:
|
592 |
+
socket_existance_check_interval = timeout_s
|
593 |
+
else:
|
594 |
+
socket_existance_check_interval = self._SOCKET_EXISTANCE_CHECK_INTERVAL_S
|
595 |
+
|
596 |
+
while not address.exists():
|
597 |
+
if should_stop_at is not None and time.time() >= should_stop_at:
|
598 |
+
raise TimeoutError(f"Timeout while waiting for {address} to be created")
|
599 |
+
time.sleep(socket_existance_check_interval)
|
600 |
+
|
601 |
+
def _calc_serialized_size(self, tensor: np.ndarray) -> int:
|
602 |
+
# frames payload sum + total size + frames sizes
|
603 |
+
# assume 2 frames: header with tensor description + data
|
604 |
+
return sum(self._calc_serialized_tensor_size(tensor)) + struct.calcsize("<I") + 2 * struct.calcsize("<I")
|
605 |
+
|
606 |
+
def put(self, tensors: Sequence[np.ndarray]) -> Sequence[str]:
|
607 |
+
"""Append tensor to shared memory buffer.
|
608 |
+
|
609 |
+
Args:
|
610 |
+
tensors: numpy arrays to store
|
611 |
+
|
612 |
+
Returns:
|
613 |
+
List of ids of stored tensors
|
614 |
+
"""
|
615 |
+
byte_size_of_frames_containers = [self._calc_serialized_size(tensor) for tensor in tensors]
|
616 |
+
tensors_ids = self._remote_blocks_store.get_free_blocks(byte_size_of_frames_containers)
|
617 |
+
blocks = [BlockDescriptor.from_id(tensor_id) for tensor_id in tensors_ids]
|
618 |
+
|
619 |
+
for tensor, block in zip(tensors, blocks):
|
620 |
+
with self._shm_segments_lock:
|
621 |
+
shm = self._shm_segments.get(block.shm_name)
|
622 |
+
if shm is None:
|
623 |
+
shm = multiprocessing.shared_memory.SharedMemory(block.shm_name, create=False)
|
624 |
+
self._shm_segments[block.shm_name] = shm
|
625 |
+
|
626 |
+
frames = self.serialize(tensor)
|
627 |
+
self._copy_frames(frames, shm, block.offset)
|
628 |
+
|
629 |
+
return tensors_ids
|
630 |
+
|
631 |
+
def get(self, tensor_id: str) -> np.ndarray:
|
632 |
+
"""Get numpy array from tensor store.
|
633 |
+
|
634 |
+
Args:
|
635 |
+
tensor_id: id of of tenosr to get
|
636 |
+
|
637 |
+
Returns:
|
638 |
+
numpy array
|
639 |
+
"""
|
640 |
+
tensor = None
|
641 |
+
# try to handle already handled tensor from weakref
|
642 |
+
with self._handled_blocks_lock:
|
643 |
+
tensor_ref = self._handled_blocks.get(tensor_id)
|
644 |
+
if tensor_ref is not None:
|
645 |
+
tensor = tensor_ref()
|
646 |
+
|
647 |
+
if tensor is None: # if tensor was not handled yet or weakref is already empty
|
648 |
+
block = BlockDescriptor.from_id(tensor_id)
|
649 |
+
|
650 |
+
# check if shm segment is already opened
|
651 |
+
with self._shm_segments_lock:
|
652 |
+
shm = self._shm_segments.get(block.shm_name)
|
653 |
+
|
654 |
+
# if not open it and put into cache
|
655 |
+
if shm is None:
|
656 |
+
shm = multiprocessing.shared_memory.SharedMemory(block.shm_name, create=False)
|
657 |
+
with self._shm_segments_lock:
|
658 |
+
shm = self._shm_segments.setdefault(block.shm_name, shm) # in meantime other thread could create it
|
659 |
+
|
660 |
+
frames = self._handle_frames(shm, block.offset)
|
661 |
+
tensor = self.deserialize(frames)
|
662 |
+
|
663 |
+
# store tensor in weakref to be able to release shared memory when tensor will be garbage collected
|
664 |
+
with self._handled_blocks_lock:
|
665 |
+
tensor_ref = self._handled_blocks.setdefault(tensor_id, weakref.ref(tensor))
|
666 |
+
tensor = tensor_ref()
|
667 |
+
|
668 |
+
return tensor # pytype: disable=bad-return-type
|
669 |
+
|
670 |
+
def release_block(self, tensor_id: str):
|
671 |
+
"""Release shared memory block.
|
672 |
+
|
673 |
+
Args:
|
674 |
+
tensor_id: id of tensor to release
|
675 |
+
"""
|
676 |
+
tensor_ref = None
|
677 |
+
with self._handled_blocks_lock:
|
678 |
+
tensor_ref = self._handled_blocks.pop(tensor_id, None)
|
679 |
+
|
680 |
+
try:
|
681 |
+
if tensor_ref is not None:
|
682 |
+
self._remote_blocks_store.release_block(tensor_id)
|
683 |
+
except OSError: # thrown when remote process is already closed
|
684 |
+
LOGGER.warning(
|
685 |
+
f"Failed to release block {tensor_id} on remote process at {self.address}. Probably remote process is already closed"
|
686 |
+
)
|
687 |
+
|
688 |
+
def _copy_frames(
|
689 |
+
self,
|
690 |
+
frames: List[Union[bytes, memoryview]],
|
691 |
+
shm: multiprocessing.shared_memory.SharedMemory,
|
692 |
+
offset: int,
|
693 |
+
) -> int:
|
694 |
+
total_size = struct.calcsize("<I") # start after total_size; max 4GB for all frames
|
695 |
+
for frame in frames:
|
696 |
+
if isinstance(frame, bytes):
|
697 |
+
frame = memoryview(frame)
|
698 |
+
|
699 |
+
assert frame.contiguous, "Only contiguous arrays are supported"
|
700 |
+
struct.pack_into("<I", shm.buf, offset + total_size, frame.nbytes) # pytype: disable=wrong-arg-types
|
701 |
+
total_size += struct.calcsize("<I")
|
702 |
+
shm.buf[offset + total_size : offset + total_size + frame.nbytes] = frame.cast("B")
|
703 |
+
|
704 |
+
total_size += frame.nbytes
|
705 |
+
|
706 |
+
struct.pack_into("<I", shm.buf, offset, total_size) # pytype: disable=wrong-arg-types
|
707 |
+
return total_size
|
708 |
+
|
709 |
+
    def _handle_frames(self, shm: multiprocessing.shared_memory.SharedMemory, block_offset: int) -> List[memoryview]:
        frames = []
        (total_size,) = struct.unpack_from("<I", shm.buf, block_offset)  # pytype: disable=wrong-arg-types
        offset = struct.calcsize("<I")
        while offset < total_size:
            (frame_size,) = struct.unpack_from("<I", shm.buf, block_offset + offset)  # pytype: disable=wrong-arg-types
            offset += struct.calcsize("<I")
            frame = shm.buf[block_offset + offset : block_offset + offset + frame_size]
            offset += frame_size
            frames.append(frame)
        return frames

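    # Block layout written by _copy_frames and read by _handle_frames (all headers are
    # little-endian uint32):
    #
    #     [total_size][len(frame_0)][frame_0 bytes][len(frame_1)][frame_1 bytes]...
    #
    # total_size counts its own 4-byte header plus every (length, payload) pair. A minimal
    # sketch of the same framing against a plain bytearray (illustrative only):
    #
    #     buf = bytearray(64)
    #     payload = b"abc"
    #     struct.pack_into("<I", buf, 4, len(payload))          # frame length
    #     buf[8:8 + len(payload)] = payload                     # frame bytes
    #     struct.pack_into("<I", buf, 0, 4 + 4 + len(payload))  # total_size
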
    def close(self):
        """Free resources used by TensorStore object."""
        from multiprocessing.resource_tracker import register, unregister

        LOGGER.debug(f"TensorStore is being closed (is_started={self.is_started()})")

        gc.collect()
        with self._handled_blocks_lock:
            tensors_ids = list(self._handled_blocks)
        for tensor_id in tensors_ids:
            self.release_block(tensor_id)

        with self._shm_segments_lock:
            while self._shm_segments:
                _, shm = self._shm_segments.popitem()
                LOGGER.debug(f"Closing shared memory {shm.name}")
                try:
                    shm.close()
                except Exception as e:
                    LOGGER.warning(f"Failed to close shared memory {shm.name}: {e}")
                finally:
                    if not self.is_started():
                        register(shm._name, "shared_memory")  # pytype: disable=attribute-error
                        unregister(shm._name, "shared_memory")  # pytype: disable=attribute-error

        if self.is_started():
            if self._remote_blocks_store is not None:
                LOGGER.debug(f"Releasing all resources on remote process at {self.address}")
                try:
                    self._remote_blocks_store.close()
                except FileNotFoundError:  # raised when the remote process is already closed
                    pass
                self._remote_blocks_store = None
            LOGGER.debug(f"Shutting down side process of data store at {self.address}")
            self._remote_blocks_store_manager.shutdown()
        LOGGER.debug(f"TensorStore at {self.address} closed")

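    # Note on the register/unregister pair in close() above: for an instance that only
    # attached to segments owned by another process, registering and immediately
    # unregistering the name asks Python's resource_tracker to forget the segment, so
    # interpreter shutdown does not unlink shared memory this process does not own
    # (this reading of the intent is an assumption; the calls themselves are standard library).
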
    def is_started(self) -> bool:
        """Check if remote block store was started by this instance.

        Returns:
            True if remote block store was started by this instance, False otherwise
        """
        return hasattr(self._remote_blocks_store_manager, "shutdown")


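# Note: TensorStore.is_started() above relies on multiprocessing.managers.BaseManager only
# gaining a `shutdown` attribute once its start() has been called, so a connect()-ed
# instance reports False (assumption: _remote_blocks_store_manager is a BaseManager, which
# is not visible in this fragment).
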
def get_debug_status(tensor_store: TensorStore) -> dict:
    """Get debug status of remote block store.

    Args:
        tensor_store: TensorStore object

    Returns:
        Debug status of remote block store
    """
    if tensor_store._remote_blocks_store is None:
        raise RuntimeError("Remote block store is not initialized")

    return tensor_store._remote_blocks_store._get_debug_status()


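# get_debug_status() is consumed by TensorStoreSerializerDeserializer.close() below; based
# on that usage the returned dict is assumed to look roughly like:
#
#     {"segments": [{"used_blocks": [...], ...}, ...]}
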
class BaseRequestsResponsesSerializerDeserializer(abc.ABC):
    """Base class for requests/responses serializer/deserializer."""

    @abc.abstractmethod
    def serialize_requests(self, requests: Requests) -> bytes:
        """Serialize requests.

        Args:
            requests: list of requests to serialize

        Returns:
            Serialized requests
        """
        pass

    @abc.abstractmethod
    def deserialize_requests(self, requests_payload: bytes) -> Requests:
        """Deserialize requests.

        Args:
            requests_payload: serialized requests

        Returns:
            List of deserialized requests
        """
        pass

    @abc.abstractmethod
    def free_requests_resources(self, requests_payload: bytes):
        """Free resources used by requests."""
        pass

    @abc.abstractmethod
    def serialize_responses(self, responses: Responses) -> bytes:
        """Serialize responses.

        Args:
            responses: list of responses to serialize

        Returns:
            Serialized responses
        """
        pass

    @abc.abstractmethod
    def deserialize_responses(self, responses_payload: bytes) -> Responses:
        """Deserialize responses.

        Args:
            responses_payload: serialized responses

        Returns:
            List of deserialized responses
        """
        pass

    @abc.abstractmethod
    def free_responses_resources(self, responses_payload: bytes):
        """Free resources used by responses."""
        pass


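# Two implementations of this interface follow: Base64SerializerDeserializer inlines tensor
# bytes into the JSON payload as base64-encoded frames, while TensorStoreSerializerDeserializer
# keeps the bytes in the shared-memory TensorStore and passes only tensor ids in the payload.
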
class Base64SerializerDeserializer(BaseRequestsResponsesSerializerDeserializer):
    """Serializer/deserializer for requests/responses using base64 implementation."""

    def serialize_requests(self, requests: Requests) -> bytes:
        """Serialize requests.

        Args:
            requests: list of requests to serialize

        Returns:
            Serialized requests
        """
        serialized_requests = self._serialize_named_tensors_lists(requests)
        requests_list = []
        for request, serialized_request in zip(requests, serialized_requests):
            serialized_request = {"data": serialized_request, "parameters": request.parameters}
            if request.span is not None:
                serialized_request["span"] = get_span_dict(request.span)
            requests_list.append(serialized_request)

        requests = {"requests": requests_list}
        requests = json.dumps(requests).encode("utf-8")
        return requests

    def deserialize_requests(self, requests_payload: bytes) -> Requests:
        """Deserialize requests.

        Args:
            requests_payload: serialized requests

        Returns:
            List of deserialized requests
        """
        requests = json.loads(requests_payload)
        requests_data = [request["data"] for request in requests["requests"]]
        requests_data = self._deserialized_named_tensors_lists(requests_data)

        deserialized_requests = []
        for request, request_data in zip(requests["requests"], requests_data):
            kwargs = {"data": request_data, "parameters": request.get("parameters")}
            # FIXME: move span creation above just after json.loads
            if "span" in request:
                span_dict = request["span"]
                span = start_span_from_remote(span_dict, "proxy_inference_callable")
                kwargs["span"] = span
            request_wrapped = Request(**kwargs)
            deserialized_requests.append(request_wrapped)

        return deserialized_requests

    def free_requests_resources(self, requests_payload: bytes):
        """Free resources used by requests."""
        pass

    def serialize_responses(self, responses: Responses) -> bytes:
        """Serialize responses.

        Args:
            responses: list of responses to serialize

        Returns:
            Serialized responses
        """
        responses = self._serialize_named_tensors_lists(responses)
        responses = {"responses": [{"data": response} for response in responses]}
        return json.dumps(responses).encode("utf-8")

    def deserialize_responses(self, responses_payload: bytes) -> Responses:
        """Deserialize responses.

        Args:
            responses_payload: serialized responses

        Returns:
            List of deserialized responses
        """
        if responses_payload:
            responses = json.loads(responses_payload)
            responses = [response["data"] for response in responses["responses"]]
            responses = self._deserialized_named_tensors_lists(responses)
            return [Response(data=response) for response in responses]
        else:
            return []

    def free_responses_resources(self, responses_payload: bytes):
        """Free resources used by responses."""
        pass

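    # In the base64 path there is nothing to free: tensor bytes are inlined in the JSON
    # payload itself, so free_requests_resources() and free_responses_resources() are no-ops.
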
    def _serialize_named_tensors_lists(self, named_tensors_lists):
        def _encode(_tensor):
            frames = serialize_numpy_with_struct_header(_tensor)
            return [base64.b64encode(frame).decode("utf-8") for frame in frames]

        return [
            {tensor_name: _encode(tensor) for tensor_name, tensor in tensors.items()} for tensors in named_tensors_lists
        ]

    def _deserialized_named_tensors_lists(self, named_tensors_lists):
        def _decode(decoded_tensor):
            frames = [base64.b64decode(frame.encode("utf-8")) for frame in decoded_tensor]
            return deserialize_numpy_with_struct_header(frames)

        return [
            {tensor_name: _decode(encoded_tensor) for tensor_name, encoded_tensor in tensors.items()}
            for tensors in named_tensors_lists
        ]

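    # Request payload produced by the base64 path (illustrative shape; names and values are
    # examples only):
    #
    #     {"requests": [{"data": {"INPUT_1": ["<b64 frame>", "<b64 frame>", ...]},
    #                    "parameters": {...}, "span": {...}}]}
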
    def start(self, url: Union[str, pathlib.Path], authkey: Optional[bytes] = None):
        """Start dummy implementation (no-op for the base64 serializer).

        Args:
            url: address of data store
            authkey: authentication key required to set up connection. If not provided, current process authkey will be used
        """
        pass

    def connect(self, url: Union[str, pathlib.Path], authkey: Optional[bytes] = None):
        """Connect to dummy implementation (no-op for the base64 serializer).

        Args:
            url: address of data store
            authkey: authentication key required to set up connection. If not provided, current process authkey will be used
        """
        pass

    def close(self):
        """Close dummy implementation (no-op)."""
        pass


class TensorStoreSerializerDeserializer(BaseRequestsResponsesSerializerDeserializer):
    """Serializer/deserializer for requests/responses using TensorStore."""

    def __init__(self):
        """Initialize TensorStoreSerializerDeserializer object."""
        self._tensor_store = None

    def serialize_requests(self, requests: Requests) -> bytes:
        """Serialize requests.

        Args:
            requests: list of requests to serialize

        Returns:
            Serialized requests
        """
        serialized_requests = self._serialize_named_tensors_lists(requests)
        requests_list = []
        for request, serialized_request in zip(requests, serialized_requests):
            serialized_request = {"data": serialized_request, "parameters": request.parameters}
            if request.span is not None:
                serialized_request["span"] = get_span_dict(request.span)
            requests_list.append(serialized_request)

        requests = {"requests": requests_list}
        return json.dumps(requests).encode("utf-8")

    def deserialize_requests(self, requests_payload: bytes) -> Requests:
        """Deserialize requests.

        Args:
            requests_payload: serialized requests

        Returns:
            List of deserialized requests
        """
        requests = json.loads(requests_payload)
        deserialized_requests = []
        for request in requests["requests"]:
            kwargs = {}
            if "span" in request:
                span_dict = request["span"]
                span = start_span_from_remote(span_dict, "proxy_inference_callable")
                kwargs["span"] = span
            request_data = {
                input_name: self._tensor_store.get(tensor_id)
                for input_name, tensor_id in request.get("data", {}).items()
            }
            kwargs["data"] = request_data
            kwargs["parameters"] = request.get("parameters")
            request_wrapped = Request(**kwargs)
            deserialized_requests.append(request_wrapped)

        return deserialized_requests

    def free_requests_resources(self, requests_payload: bytes):
        """Free resources used by requests."""
        if requests_payload:
            requests = json.loads(requests_payload)
            for request in requests["requests"]:
                for _, tensor_id in request.get("data", {}).items():
                    self._tensor_store.release_block(tensor_id)

    def serialize_responses(self, responses: Responses) -> bytes:
        """Serialize responses.

        Args:
            responses: list of responses to serialize

        Returns:
            Serialized responses
        """
        responses = self._serialize_named_tensors_lists(responses)
        responses = {"responses": [{"data": response} for response in responses]}
        return json.dumps(responses).encode("utf-8")

    def deserialize_responses(self, responses_payload: bytes) -> Responses:
        """Deserialize responses.

        Args:
            responses_payload: serialized responses

        Returns:
            List of deserialized responses
        """
        if responses_payload:
            responses = json.loads(responses_payload)
            return [
                Response(
                    data={
                        input_name: self._tensor_store.get(tensor_id)
                        for input_name, tensor_id in response.get("data", {}).items()
                    }
                )
                for response in responses["responses"]
            ]
        else:
            return []

    def free_responses_resources(self, responses_payload: bytes):
        """Free resources used by responses."""
        if responses_payload:
            responses = json.loads(responses_payload)
            for response in responses["responses"]:
                for _, tensor_id in response.get("data", {}).items():
                    self._tensor_store.release_block(tensor_id)

    def _serialize_named_tensors_lists(self, named_tensors_lists):
        values_with_coords = [
            (idx, tensor_name, tensor)
            for idx, tensors in enumerate(named_tensors_lists)
            for tensor_name, tensor in tensors.items()
        ]
        tensor_ids = self._tensor_store.put([tensor for _, _, tensor in values_with_coords])
        named_tensors_lists = [{} for _ in range(len(named_tensors_lists))]
        for (idx, tensor_name, _), tensor_id in zip(values_with_coords, tensor_ids):
            named_tensors_lists[idx][tensor_name] = tensor_id

        return named_tensors_lists

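    # In the TensorStore path only tensor ids travel in the JSON payload; the bytes stay in
    # shared memory. Illustrative request payload (names and ids are examples only):
    #
    #     {"requests": [{"data": {"INPUT_1": "<tensor id>"}, "parameters": {...}}]}
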
    def start(self, url: Union[str, pathlib.Path], authkey: Optional[bytes] = None):
        """Start TensorStore.

        Args:
            url: address of data store
            authkey: authentication key required to set up connection. If not provided, current process authkey will be used
        """
        self._tensor_store = self._create(url, authkey)
        self._tensor_store.start()

    def connect(self, url: Union[str, pathlib.Path], authkey: Optional[bytes] = None):
        """Connect to TensorStore.

        Args:
            url: address of data store
            authkey: authentication key required to set up connection. If not provided, current process authkey will be used
        """
        self._tensor_store = self._create(url, authkey)
        self._tensor_store.connect()

    def _create(self, url: Union[str, pathlib.Path], authkey: Optional[bytes] = None):
        authkey = authkey or multiprocessing.current_process().authkey
        return TensorStore(url, authkey)

    def close(self):
        """Close TensorStore."""
        if self._tensor_store:
            # check if the store was started by this serializer/deserializer
            if self._tensor_store.is_started():
                debug_status = get_debug_status(self._tensor_store)
                used_blocks = [block for segment in debug_status["segments"] for block in segment["used_blocks"]]
                if used_blocks:
                    LOGGER.debug(f"TensorStore used blocks while closing: {used_blocks}")
                    # raise RuntimeError(
                    #     f"TensorStore at {self._tensor_store.address} is still running. Used blocks: {used_blocks}"
                    # )
                LOGGER.debug(f"Closing TensorStore process at {self._tensor_store.address}")

            self._tensor_store.close()
            self._tensor_store = None
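
# Illustrative wiring of the two ends (not part of the original module; the address value is
# an example only):
#
#     server_side = TensorStoreSerializerDeserializer()
#     server_side.start("/tmp/example_data_store")         # creates the store side process
#
#     client_side = TensorStoreSerializerDeserializer()
#     client_side.connect("/tmp/example_data_store")       # attaches to the same store
#
#     payload = client_side.serialize_requests(requests)   # ids in JSON, bytes in shared memory
#     ...
#     client_side.close()
#     server_side.close()                                  # also shuts down the side process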