Added GGUF and CTranslate2 versions of the ALMA model, and the web UI options now include VAD Process Timeout.
1. In the web UI, in addition to the GPTQ version, GGUF and CTranslate2 versions of the ALMA model have also been added (see the loading sketch below this list).
2. Because GPTQ has poor CPU support, the GPTQ model is removed from the model list when the system has no GPU available.
3. In the web UI's VAD options, "VAD Process Timeout (s)" (vad_process_timeout) has been added, letting users decide whether to keep the VAD process alive until the specified timeout. VRAM remains occupied as long as the VAD process is running. The default value for this timeout is 1800 seconds (30 minutes).
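Below is a minimal, illustrative sketch of how a GGUF ALMA model can be loaded with ctransformers and wrapped in a regular transformers pipeline, mirroring the approach taken in src/translation/translationModel.py in this commit. It is not part of the commit itself; the prompt text and parameter values are assumptions for illustration.

```python
# Sketch only: load a GGUF ALMA model via ctransformers and run one translation.
# Assumes `pip install ctransformers transformers`; repo/file names match the new config.json5 entries.
import ctransformers
import transformers

# hf=True wraps the GGUF model so it can be used with the transformers pipeline API.
model = ctransformers.AutoModelForCausalLM.from_pretrained(
    "TheBloke/ALMA-7B-GGUF",
    model_file="alma-7b.Q4_K_M.gguf",
    hf=True,
    gpu_layers=50,  # set to 0 for CPU-only; the commit uses 50 when a GPU is available
)
tokenizer = transformers.AutoTokenizer.from_pretrained("haoranxu/ALMA-7B")
translator = transformers.pipeline(
    "text-generation", model=model, tokenizer=tokenizer,
    do_sample=True, temperature=0.7, top_k=40, top_p=0.95, repetition_penalty=1.1,
)

# ALMA expects a "Translate this from X to Y" prompt, as built by ALMAPrefix in the commit.
prompt = "Translate this from German to English:\nGerman: Guten Morgen!\nEnglish: "
print(translator(prompt, max_length=256, return_full_text=False)[0]["generated_text"])
```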
- app.py +5 -1
- config.json5 +26 -0
- docs/options.md +3 -0
- docs/translateModel.md +24 -4
- requirements-fasterWhisper.txt +4 -2
- requirements-whisper.txt +4 -2
- requirements.txt +4 -2
- src/config.py +3 -1
- src/translation/translationModel.py +48 -20
app.py
CHANGED
@@ -250,6 +250,7 @@ class WhisperTranscriber:
         vadPadding: float = decodeOptions.pop("vadPadding", self.app_config.vad_padding)
         vadPromptWindow: float = decodeOptions.pop("vadPromptWindow", self.app_config.vad_prompt_window)
         vadInitialPromptMode: str = decodeOptions.pop("vadInitialPromptMode", self.app_config.vad_initial_prompt_mode)
+        self.vad_process_timeout: float = decodeOptions.pop("vadPocessTimeout", self.vad_process_timeout)

         diarization: bool = decodeOptions.pop("diarization", False)
         diarization_speakers: int = decodeOptions.pop("diarization_speakers", 2)
@@ -832,7 +833,9 @@ def create_ui(app_config: ApplicationConfig):
     m2m100_models = app_config.get_model_names("m2m100")
     mt5_models = app_config.get_model_names("mt5")
     ALMA_models = app_config.get_model_names("ALMA")
-
+    if not torch.cuda.is_available(): #Due to the poor support of GPTQ for CPUs, the execution time per iteration exceeds a thousand seconds when operating on a CPU. Therefore, when the system does not support a GPU, the GPTQ model is removed from the list.
+        ALMA_models = list(filter(lambda alma: "GPTQ" not in alma, ALMA_models))
+
     common_whisper_inputs = lambda : {
         gr.Dropdown(label="Whisper - Model (for audio)", choices=whisper_models, value=app_config.default_model_name, elem_id="whisperModelName"),
         gr.Dropdown(label="Whisper - Language", choices=sorted(get_lang_whisper_names()), value=app_config.language, elem_id="whisperLangName"),
@@ -864,6 +867,7 @@ def create_ui(app_config: ApplicationConfig):
         gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=app_config.default_vad, label="VAD", elem_id="vad"),
         gr.Number(label="VAD - Merge Window (s)", precision=0, value=app_config.vad_merge_window, elem_id="vadMergeWindow"),
         gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=app_config.vad_max_merge_size, elem_id="vadMaxMergeSize"),
+        gr.Number(label="VAD - Process Timeout (s)", precision=0, value=app_config.vad_process_timeout, elem_id="vadPocessTimeout"),
     }

     common_word_timestamps_inputs = lambda : {
config.json5
CHANGED
@@ -203,6 +203,32 @@
             "url": "TheBloke/ALMA-13B-GPTQ",
             "type": "huggingface"
         },
+        {
+            "name": "ALMA-7B-GGUF-Q4_K_M/TheBloke",
+            "url": "TheBloke/ALMA-7B-GGUF",
+            "type": "huggingface",
+            "model_file": "alma-7b.Q4_K_M.gguf",
+            "tokenizer_url": "haoranxu/ALMA-7B"
+        },
+        {
+            "name": "ALMA-13B-GGUF-Q4_K_M/TheBloke",
+            "url": "TheBloke/ALMA-13B-GGUF",
+            "type": "huggingface",
+            "model_file": "alma-13b.Q4_K_M.gguf",
+            "tokenizer_url": "haoranxu/ALMA-13B"
+        },
+        {
+            "name": "ALMA-7B-ct2:int8_float16/avan",
+            "url": "avans06/ALMA-7B-ct2-int8_float16",
+            "type": "huggingface",
+            "tokenizer_url": "haoranxu/ALMA-7B"
+        },
+        {
+            "name": "ALMA-13B-ct2:int8_float16/avan",
+            "url": "avans06/ALMA-13B-ct2-int8_float16",
+            "type": "huggingface",
+            "tokenizer_url": "haoranxu/ALMA-13B"
+        },
     ]
 },
 // Configuration options that will be used if they are not specified in the command line arguments.
docs/options.md
CHANGED
@@ -67,6 +67,9 @@ If set, any adjacent speech sections that are at most this number of seconds apa
 ## VAD - Max Merge Size (s)
 Disables merging of adjacent speech sections if they are this number of seconds long.

+## VAD - Process Timeout (s)
+This configures the number of seconds until a process is killed due to inactivity, freeing RAM and video memory. The default value is 30 minutes.
+
 ## VAD - Padding (s)
 The number of seconds (floating point) to add to the beginning and end of each speech section. Setting this to a number
 larger than zero ensures that Whisper is more likely to correctly transcribe a sentence in the beginning of
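As a rough illustration only (not the repository's actual VAD implementation), the idle-timeout behavior described in docs/options.md can be pictured as a worker that is reused while requests keep arriving and is terminated, releasing RAM and VRAM, once it has been idle longer than vad_process_timeout:

```python
# Illustrative sketch of an idle timeout; class and method names here are hypothetical, not from the repo.
import multiprocessing
import time

class IdleWorker:
    def __init__(self, timeout_seconds: float = 1800):  # 1800 s matches the new default
        self.timeout_seconds = timeout_seconds
        self.pool = None
        self.last_used = 0.0

    def get_pool(self):
        # Reuse the warm worker (and whatever memory it holds) between requests.
        if self.pool is None:
            self.pool = multiprocessing.Pool(processes=1)
        self.last_used = time.monotonic()
        return self.pool

    def reap_if_idle(self):
        # Called periodically; kills the worker once it has been idle past the timeout.
        if self.pool is not None and time.monotonic() - self.last_used > self.timeout_seconds:
            self.pool.terminate()
            self.pool = None
```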
docs/translateModel.md
CHANGED
@@ -20,7 +20,7 @@ M2M100 is a multilingual translation model introduced by Facebook AI in October

 | Name | Parameters | Size | type/quantize | Required VRAM |
 |------|------------|------|---------------|---------------|
-| [facebook/m2m100_418M](https://huggingface.co/facebook/m2m100_418M) |
+| [facebook/m2m100_418M](https://huggingface.co/facebook/m2m100_418M) | 418M | 1.94 GB | float32 | ≈2 GB |
 | [facebook/m2m100_1.2B](https://huggingface.co/facebook/m2m100_1.2B) | 1.2B | 4.96 GB | float32 | ≈5 GB |
 | [facebook/m2m100-12B-last-ckpt](https://huggingface.co/facebook/m2m100-12B-last-ckpt) | 12B | 47.2 GB | float32 | N/A |

@@ -28,7 +28,7 @@ M2M100 is a multilingual translation model introduced by Facebook AI in October

 | Name | Parameters | Size | type/quantize | Required VRAM |
 |------|------------|------|---------------|---------------|
-| [michaelfeil/ct2fast-m2m100_418M](https://huggingface.co/michaelfeil/ct2fast-m2m100_418M) |
+| [michaelfeil/ct2fast-m2m100_418M](https://huggingface.co/michaelfeil/ct2fast-m2m100_418M) | 418M | 970 MB | float16 | ≈0.6 GB |
 | [michaelfeil/ct2fast-m2m100_1.2B](https://huggingface.co/michaelfeil/ct2fast-m2m100_1.2B) | 1.2B | 2.48 GB | float16 | ≈1.3 GB |
 | [michaelfeil/ct2fast-m2m100-12B-last-ckpt](https://huggingface.co/michaelfeil/ct2fast-m2m100-12B-last-ckpt) | 12B | 23.6 GB | float16 | N/A |

@@ -73,7 +73,6 @@ The 'mt5-zh-ja-en-trimmed' model is finetuned from Google's 'mt5-base' model. Th

 ## ALMA

-ALMA is an excellent translation model, but it is strongly discouraged to operate it on CPU.
 ALMA is a many-to-many LLM-based translation model introduced by Haoran Xu and colleagues in September 2023. It is based on the fine-tuning of a large language model (LLaMA-2). The approach used for this model is referred to as Advanced Language Model-based trAnslator (ALMA). The paper is titled "`A Paradigm Shift in Machine Translation: Boosting Translation Performance of Large Language Models`" ([arXiv:2309.11674](https://arxiv.org/abs/2309.11674)).
 The official support for ALMA currently includes 10 language directions: English↔German, English↔Czech, English↔Icelandic, English↔Chinese, and English↔Russian. However, the author hints that there might be surprises in other directions, so there are currently no restrictions on the languages that ALMA can be chosen for in the web UI.

@@ -84,12 +83,33 @@ The official support for ALMA currently includes 10 language directions: English

 ## ALMA-GPTQ

+Due to the poor support of GPTQ for CPUs, the execution time per iteration exceeds a thousand seconds when operating on a CPU. Therefore, it is strongly discouraged to operate it on CPU.
 GPTQ is a technique used to quantize the parameters of large language models into integer formats such as int8 or int4. Although the quantization process may lead to a loss in model performance, it significantly reduces both file size and the required VRAM.

 | Name | Parameters | Size | type/quantize | Required VRAM |
 |------|------------|------|---------------|---------------|
 | [TheBloke/ALMA-7B-GPTQ](https://huggingface.co/TheBloke/ALMA-7B-GPTQ) | 7B | 3.9 GB | 4 Bits | ≈4.3 GB |
-| [TheBloke/ALMA-13B-GPTQ](https://huggingface.co/TheBloke/ALMA-13B-GPTQ) | 13B | 7.26 GB | 4 Bits | ≈8.
+| [TheBloke/ALMA-13B-GPTQ](https://huggingface.co/TheBloke/ALMA-13B-GPTQ) | 13B | 7.26 GB | 4 Bits | ≈8.4 GB |
+
+## ALMA-GGUF
+
+[GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) is a new format introduced by the llama.cpp team on August 21st 2023. It is a replacement for GGML, which is no longer supported by llama.cpp.
+GGUF is a file format for storing models for inference with GGML and executors based on GGML. GGUF is a binary format that is designed for fast loading and saving of models, and for ease of reading. Models are traditionally developed using PyTorch or another framework, and then converted to GGUF for use in GGML.
+[k-quants](https://github.com/ggerganov/llama.cpp/pull/1684): a series of 2-6 bit quantization methods, along with quantization mixes
+
+| Name | Parameters | Size | type/quantize | Required VRAM |
+|------|------------|------|---------------|---------------|
+| [TheBloke/ALMA-7B-GGUF-Q4_K_M](https://huggingface.co/TheBloke/ALMA-7B-GGUF) | 7B | 4.08 GB | Q4_K_M (4 Bits medium) | ≈5.3 GB |
+| [TheBloke/ALMA-13B-GGUF-Q4_K_M](https://huggingface.co/TheBloke/ALMA-13B-GGUF) | 13B | 7.87 GB | Q4_K_M (4 Bits medium) | ≈9.3 GB |
+
+## ALMA-CTranslate2
+
+[CTranslate2](https://opennmt.net/CTranslate2/) does not currently support 4-bit quantization. Currently, it can only use int8_float16 quantization, so the file size and required VRAM will be larger than the GPTQ model quantized with 4 bits. However, it runs much faster on the CPU than GPTQ. If you plan to run ALMA in an environment without a GPU, you may consider choosing the CTranslate2 version of the ALMA model.
+
+| Name | Parameters | Size | type/quantize | Required VRAM |
+|------|------------|------|---------------|---------------|
+| [avans06/ALMA-7B-ct2-int8_float16](https://huggingface.co/avans06/ALMA-7B-ct2-int8_float16) | 7B | 6.74 GB | int8_float16 | ≈6.6 GB |
+| [avans06/ALMA-13B-ct2-int8_float16](https://huggingface.co/avans06/ALMA-13B-ct2-int8_float16) | 13B | 13 GB | int8_float16 | ≈12.6 GB |


 # Options
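For the ALMA-CTranslate2 variants documented above, a minimal usage sketch outside the web UI might look as follows. It assumes the ctranslate2 and transformers packages and a locally downloaded copy of avans06/ALMA-7B-ct2-int8_float16; the web UI handles the download and builds the prompt itself.

```python
# Sketch only: translate one sentence with the CTranslate2 conversion of ALMA-7B on CPU.
import ctranslate2
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("haoranxu/ALMA-7B")
generator = ctranslate2.Generator("./ALMA-7B-ct2-int8_float16", device="cpu", compute_type="auto")

# Same prompt shape as the ALMAPrefix built in translationModel.py.
prompt = "Translate this from German to English:\nGerman: Guten Morgen!\nEnglish: "
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt))

results = generator.generate_batch(
    [tokens], max_length=128, sampling_topk=40, sampling_temperature=0.7,
    include_prompt_in_result=False,
)
print(tokenizer.decode(results[0].sequences_ids[0], skip_special_tokens=True))
```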
requirements-fasterWhisper.txt
CHANGED
@@ -17,7 +17,9 @@ srt
 torch
 https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip

-# Needed by ALMA
+# Needed by ALMA-GPTQ
 accelerate
 auto-gptq
-optimum
+optimum
+# Needed by ALMA-GGUL
+ctransformers[cuda]
requirements-whisper.txt
CHANGED
@@ -16,7 +16,9 @@ srt
 torch
 https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip

-# Needed by ALMA
+# Needed by ALMA-GPTQ
 accelerate
 auto-gptq
-optimum
+optimum
+# Needed by ALMA-GGUL
+ctransformers[cuda]
requirements.txt
CHANGED
@@ -17,7 +17,9 @@ srt
 torch
 https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip

-# Needed by ALMA
+# Needed by ALMA-GPTQ
 accelerate
 auto-gptq
-optimum
+optimum
+# Needed by ALMA-GGUL
+ctransformers[cuda]
src/config.py
CHANGED
@@ -5,7 +5,7 @@ from typing import List, Dict, Literal


 class ModelConfig:
-    def __init__(self, name: str, url: str, path: str = None, type: str = "whisper", tokenizer_url: str = None, revision: str = None):
+    def __init__(self, name: str, url: str, path: str = None, type: str = "whisper", tokenizer_url: str = None, revision: str = None, model_file: str = None,):
         """
         Initialize a model configuration.

@@ -17,6 +17,7 @@ class ModelConfig:
                   It can be a branch name, a tag name, or a commit id,
                   since we use a git-based system for storing models and other artifacts on huggingface.co,
                   so revision can be any identifier allowed by git.
+        model_file: The name of the model file in repo or directory.[from marella/ctransformers]
         """
         self.name = name
         self.url = url
@@ -24,6 +25,7 @@ class ModelConfig:
         self.type = type
         self.tokenizer_url = tokenizer_url
         self.revision = revision
+        self.model_file = model_file

 VAD_INITIAL_PROMPT_MODE_VALUES=["prepend_all_segments", "prepend_first_segment", "json_prompt_mode"]

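For context, each of the new config.json5 entries maps directly onto the extended ModelConfig constructor shown above. A small sketch (assuming src/config.py is importable as src.config) of the ALMA-7B GGUF entry expressed in code:

```python
# Sketch: the new ALMA-7B GGUF entry from config.json5 as a ModelConfig instance.
from src.config import ModelConfig

alma_7b_gguf = ModelConfig(
    name="ALMA-7B-GGUF-Q4_K_M/TheBloke",
    url="TheBloke/ALMA-7B-GGUF",
    type="huggingface",
    model_file="alma-7b.Q4_K_M.gguf",  # new field: the .gguf file inside the repo, used by ctransformers
    tokenizer_url="haoranxu/ALMA-7B",
)
print(alma_7b_gguf.model_file)
```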
src/translation/translationModel.py
CHANGED
@@ -65,7 +65,7 @@ class TranslationModel:
         if os.path.isdir(modelConfig.url):
             self.modelPath = modelConfig.url
         else:
-            self.modelPath = download_model(
+            self.modelPath = modelConfig.url if getattr(modelConfig, "model_file", None) is not None else download_model(
                 modelConfig,
                 localFilesOnly=localFilesOnly,
                 cacheDir=downloadRoot,
@@ -137,6 +137,12 @@ class TranslationModel:
         If you're doing inference on a CPU with AutoGPTQ (version > 0.4.2), then you'll need to disable the ExLlama kernel.
         This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file.
         https://github.com/huggingface/transformers/blob/main/docs/source/en/quantization.md#exllama
+
+        [ctransformers]
+        gpu_layers
+            means number of layers to run on GPU. Depending on how much GPU memory is available you can increase gpu_layers. Start with a larger value gpu_layers=100 and if it runs out of memory, try smaller values.
+            To run some of the model layers on GPU, set the `gpu_layers` parameter
+            https://github.com/marella/ctransformers/issues/68
         """
         try:
             print('\n\nLoading model: %s\n\n' % self.modelPath)
@@ -152,7 +158,7 @@ class TranslationModel:
             elif "ALMA" in self.modelPath:
                 self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url if self.modelConfig.tokenizer_url is not None and len(self.modelConfig.tokenizer_url) > 0 else self.modelPath)
                 self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
-                self.transModel = ctranslate2.Generator(self.modelPath, device=self.device)
+                self.transModel = ctranslate2.Generator(self.modelPath, compute_type="auto", device=self.device)
             elif "mt5" in self.modelPath:
                 self.mt5Prefix = self.whisperLang.whisper.code + "2" + self.translationLang.whisper.code + ": "
                 self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False) #requires spiece.model
@@ -160,16 +166,24 @@ class TranslationModel:
                 self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
             elif "ALMA" in self.modelPath:
                 self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
-
-
-
-
-
-
-
-
-
-
+                if "GPTQ" in self.modelPath:
+                    self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath, use_fast=True)
+                    if self.device == "cpu":
+                        # Due to the poor support of GPTQ for CPUs, Therefore, it is strongly discouraged to operate it on CPU.
+                        # set torch_dtype=torch.float32 to prevent the occurrence of the exception "addmm_impl_cpu_ not implemented for 'Half'."
+                        transModelConfig = transformers.AutoConfig.from_pretrained(self.modelPath)
+                        transModelConfig.quantization_config["use_exllama"] = False
+                        self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, device_map="auto", low_cpu_mem_usage=True, trust_remote_code=False, revision=self.modelConfig.revision, config=transModelConfig, torch_dtype=torch.float32)
+                    else:
+                        # transModelConfig.quantization_config["exllama_config"] = {"version":2} # After configuring to use ExLlamaV2, VRAM cannot be effectively released, which may be an issue. Temporarily not adopting the V2 version.
+                        self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, device_map="auto", low_cpu_mem_usage=True, trust_remote_code=False, revision=self.modelConfig.revision)
+                elif "GGUF" in self.modelPath:
+                    import ctransformers
+                    self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url)
+                    if self.device == "cpu":
+                        self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file)
+                    else:
+                        self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, gpu_layers=50)
                 self.transTranslator = transformers.pipeline("text-generation", model=self.transModel, tokenizer=self.transTokenizer, do_sample=True, temperature=0.7, top_k=40, top_p=0.95, repetition_penalty=1.1)
             else:
                 self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath)
@@ -180,22 +194,31 @@ class TranslationModel:
                 self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.nllb.code, tgt_lang=self.translationLang.nllb.code)

         except Exception as e:
-            print(traceback.format_exc())
             self.release_vram()
+            raise e
+

     def release_vram(self):
         try:
             if torch.cuda.is_available():
                 if "ct2" not in self.modelPath:
                     try:
-
-
+                        if getattr(self, "transModel", None) is not None:
+                            device = torch.device("cpu")
+                            self.transModel.to(device)
                     except Exception as e:
                         print(traceback.format_exc())
                         print("\tself.transModel.to cpu, error: " + str(e))
-
-
-
+                if getattr(self, "transTranslator", None) is not None:
+                    del self.transTranslator
+                if "ct2" in self.modelPath:
+                    if getattr(self, "transModel", None) is not None and getattr(self.transModel, "unload_model", None) is not None:
+                        self.transModel.unload_model()
+
+                if getattr(self, "transTokenizer", None) is not None:
+                    del self.transTokenizer
+                if getattr(self, "transModel", None) is not None:
+                    del self.transModel
                 try:
                     torch.cuda.empty_cache()
                 except Exception as e:
@@ -205,6 +228,7 @@ class TranslationModel:
             gc.collect()
             print("release vram end.")
         except Exception as e:
+            print(traceback.format_exc())
             print("Error release vram: " + str(e))


@@ -257,7 +281,10 @@ class TranslationModel:
             output = self.transTranslator(self.mt5Prefix + text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams) #, num_return_sequences=2
             result = output[0]['generated_text']
         elif "ALMA" in self.modelPath:
-
+            if "GPTQ" in self.modelPath:
+                output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
+            elif "GGUF" in self.modelPath:
+                output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
             result = output[0]['generated_text']
         else: #M2M100 & NLLB
             output = self.transTranslator(text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams)
@@ -332,7 +359,8 @@ def download_model(
         "vocab.json", #m2m100
         "model.safetensors",
         "quantize_config.json",
-        "tokenizer.model"
+        "tokenizer.model",
+        "vocabulary.json"
     ]

     kwargs = {