Added GGUF and CTranslate2 versions of the ALMA model, and the web UI options now include VAD Process Timeout.
1. In the web UI, in addition to the GPTQ version, GGUF and CTranslate2 versions of the ALMA model have also been added (see the loading sketch below this list).
2. Because GPTQ has poor CPU support, the GPTQ model is removed from the model list when the system has no GPU available.
3. In the web UI's VAD options, "VAD Process Timeout (s)" (vad_process_timeout) has been added, letting users decide whether to keep the VAD process alive until the specified timeout. VRAM remains occupied as long as the VAD process is running. The default value for this timeout is 1800 seconds (30 minutes).
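Below is a minimal, illustrative sketch of how a GGUF ALMA model can be loaded with ctransformers and wrapped in a regular transformers pipeline, mirroring the approach taken in src/translation/translationModel.py in this commit. It is not part of the commit itself; the prompt text and parameter values are assumptions for illustration.

```python
# Sketch only: load a GGUF ALMA model via ctransformers and run one translation.
# Assumes `pip install ctransformers transformers`; repo/file names match the new config.json5 entries.
import ctransformers
import transformers

# hf=True wraps the GGUF model so it can be used with the transformers pipeline API.
model = ctransformers.AutoModelForCausalLM.from_pretrained(
    "TheBloke/ALMA-7B-GGUF",
    model_file="alma-7b.Q4_K_M.gguf",
    hf=True,
    gpu_layers=50,  # set to 0 for CPU-only; the commit uses 50 when a GPU is available
)
tokenizer = transformers.AutoTokenizer.from_pretrained("haoranxu/ALMA-7B")
translator = transformers.pipeline(
    "text-generation", model=model, tokenizer=tokenizer,
    do_sample=True, temperature=0.7, top_k=40, top_p=0.95, repetition_penalty=1.1,
)

# ALMA expects a "Translate this from X to Y" prompt, as built by ALMAPrefix in the commit.
prompt = "Translate this from German to English:\nGerman: Guten Morgen!\nEnglish: "
print(translator(prompt, max_length=256, return_full_text=False)[0]["generated_text"])
```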
- app.py +5 -1
- config.json5 +26 -0
- docs/options.md +3 -0
- docs/translateModel.md +24 -4
- requirements-fasterWhisper.txt +4 -2
- requirements-whisper.txt +4 -2
- requirements.txt +4 -2
- src/config.py +3 -1
- src/translation/translationModel.py +48 -20
app.py
CHANGED
@@ -250,6 +250,7 @@ class WhisperTranscriber:
         vadPadding: float = decodeOptions.pop("vadPadding", self.app_config.vad_padding)
         vadPromptWindow: float = decodeOptions.pop("vadPromptWindow", self.app_config.vad_prompt_window)
         vadInitialPromptMode: str = decodeOptions.pop("vadInitialPromptMode", self.app_config.vad_initial_prompt_mode)
+        self.vad_process_timeout: float = decodeOptions.pop("vadPocessTimeout", self.vad_process_timeout)

         diarization: bool = decodeOptions.pop("diarization", False)
         diarization_speakers: int = decodeOptions.pop("diarization_speakers", 2)
@@ -832,7 +833,9 @@ def create_ui(app_config: ApplicationConfig):
     m2m100_models = app_config.get_model_names("m2m100")
     mt5_models = app_config.get_model_names("mt5")
     ALMA_models = app_config.get_model_names("ALMA")
-
+    if not torch.cuda.is_available(): #Due to the poor support of GPTQ for CPUs, the execution time per iteration exceeds a thousand seconds when operating on a CPU. Therefore, when the system does not support a GPU, the GPTQ model is removed from the list.
+        ALMA_models = list(filter(lambda alma: "GPTQ" not in alma, ALMA_models))
+
     common_whisper_inputs = lambda : {
         gr.Dropdown(label="Whisper - Model (for audio)", choices=whisper_models, value=app_config.default_model_name, elem_id="whisperModelName"),
         gr.Dropdown(label="Whisper - Language", choices=sorted(get_lang_whisper_names()), value=app_config.language, elem_id="whisperLangName"),
@@ -864,6 +867,7 @@ def create_ui(app_config: ApplicationConfig):
         gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=app_config.default_vad, label="VAD", elem_id="vad"),
         gr.Number(label="VAD - Merge Window (s)", precision=0, value=app_config.vad_merge_window, elem_id="vadMergeWindow"),
         gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=app_config.vad_max_merge_size, elem_id="vadMaxMergeSize"),
+        gr.Number(label="VAD - Process Timeout (s)", precision=0, value=app_config.vad_process_timeout, elem_id="vadPocessTimeout"),
     }

     common_word_timestamps_inputs = lambda : {
config.json5
CHANGED
@@ -203,6 +203,32 @@
             "url": "TheBloke/ALMA-13B-GPTQ",
             "type": "huggingface"
         },
+        {
+            "name": "ALMA-7B-GGUF-Q4_K_M/TheBloke",
+            "url": "TheBloke/ALMA-7B-GGUF",
+            "type": "huggingface",
+            "model_file": "alma-7b.Q4_K_M.gguf",
+            "tokenizer_url": "haoranxu/ALMA-7B"
+        },
+        {
+            "name": "ALMA-13B-GGUF-Q4_K_M/TheBloke",
+            "url": "TheBloke/ALMA-13B-GGUF",
+            "type": "huggingface",
+            "model_file": "alma-13b.Q4_K_M.gguf",
+            "tokenizer_url": "haoranxu/ALMA-13B"
+        },
+        {
+            "name": "ALMA-7B-ct2:int8_float16/avan",
+            "url": "avans06/ALMA-7B-ct2-int8_float16",
+            "type": "huggingface",
+            "tokenizer_url": "haoranxu/ALMA-7B"
+        },
+        {
+            "name": "ALMA-13B-ct2:int8_float16/avan",
+            "url": "avans06/ALMA-13B-ct2-int8_float16",
+            "type": "huggingface",
+            "tokenizer_url": "haoranxu/ALMA-13B"
+        },
     ]
 },
 // Configuration options that will be used if they are not specified in the command line arguments.
docs/options.md
CHANGED
@@ -67,6 +67,9 @@ If set, any adjacent speech sections that are at most this number of seconds apa
 ## VAD - Max Merge Size (s)
 Disables merging of adjacent speech sections if they are this number of seconds long.

+## VAD - Process Timeout (s)
+This configures the number of seconds until a process is killed due to inactivity, freeing RAM and video memory. The default value is 30 minutes.
+
 ## VAD - Padding (s)
 The number of seconds (floating point) to add to the beginning and end of each speech section. Setting this to a number
 larger than zero ensures that Whisper is more likely to correctly transcribe a sentence in the beginning of
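As a rough illustration only (not the repository's actual VAD implementation), the idle-timeout behavior described in docs/options.md can be pictured as a worker that is reused while requests keep arriving and is terminated, releasing RAM and VRAM, once it has been idle longer than vad_process_timeout:

```python
# Illustrative sketch of an idle timeout; class and method names here are hypothetical, not from the repo.
import multiprocessing
import time

class IdleWorker:
    def __init__(self, timeout_seconds: float = 1800):  # 1800 s matches the new default
        self.timeout_seconds = timeout_seconds
        self.pool = None
        self.last_used = 0.0

    def get_pool(self):
        # Reuse the warm worker (and whatever memory it holds) between requests.
        if self.pool is None:
            self.pool = multiprocessing.Pool(processes=1)
        self.last_used = time.monotonic()
        return self.pool

    def reap_if_idle(self):
        # Called periodically; kills the worker once it has been idle past the timeout.
        if self.pool is not None and time.monotonic() - self.last_used > self.timeout_seconds:
            self.pool.terminate()
            self.pool = None
```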
docs/translateModel.md
CHANGED
@@ -20,7 +20,7 @@ M2M100 is a multilingual translation model introduced by Facebook AI in October

 | Name | Parameters | Size | type/quantize | Required VRAM |
 |------|------------|------|---------------|---------------|
-| [facebook/m2m100_418M](https://huggingface.co/facebook/m2m100_418M) |
+| [facebook/m2m100_418M](https://huggingface.co/facebook/m2m100_418M) | 418M | 1.94 GB | float32 | ≈2 GB |
 | [facebook/m2m100_1.2B](https://huggingface.co/facebook/m2m100_1.2B) | 1.2B | 4.96 GB | float32 | ≈5 GB |
 | [facebook/m2m100-12B-last-ckpt](https://huggingface.co/facebook/m2m100-12B-last-ckpt) | 12B | 47.2 GB | float32 | N/A |

@@ -28,7 +28,7 @@ M2M100 is a multilingual translation model introduced by Facebook AI in October

 | Name | Parameters | Size | type/quantize | Required VRAM |
 |------|------------|------|---------------|---------------|
-| [michaelfeil/ct2fast-m2m100_418M](https://huggingface.co/michaelfeil/ct2fast-m2m100_418M) |
+| [michaelfeil/ct2fast-m2m100_418M](https://huggingface.co/michaelfeil/ct2fast-m2m100_418M) | 418M | 970 MB | float16 | ≈0.6 GB |
 | [michaelfeil/ct2fast-m2m100_1.2B](https://huggingface.co/michaelfeil/ct2fast-m2m100_1.2B) | 1.2B | 2.48 GB | float16 | ≈1.3 GB |
 | [michaelfeil/ct2fast-m2m100-12B-last-ckpt](https://huggingface.co/michaelfeil/ct2fast-m2m100-12B-last-ckpt) | 12B | 23.6 GB | float16 | N/A |

@@ -73,7 +73,6 @@ The 'mt5-zh-ja-en-trimmed' model is finetuned from Google's 'mt5-base' model. Th

 ## ALMA

-ALMA is an excellent translation model, but it is strongly discouraged to operate it on CPU.
 ALMA is a many-to-many LLM-based translation model introduced by Haoran Xu and colleagues in September 2023. It is based on the fine-tuning of a large language model (LLaMA-2). The approach used for this model is referred to as Advanced Language Model-based trAnslator (ALMA). The paper is titled "`A Paradigm Shift in Machine Translation: Boosting Translation Performance of Large Language Models`" ([arXiv:2309.11674](https://arxiv.org/abs/2309.11674)).
 The official support for ALMA currently includes 10 language directions: English↔German, English↔Czech, English↔Icelandic, English↔Chinese, and English↔Russian. However, the author hints that there might be surprises in other directions, so there are currently no restrictions on the languages that ALMA can be chosen for in the web UI.

@@ -84,12 +83,33 @@ The official support for ALMA currently includes 10 language directions: English

 ## ALMA-GPTQ

+Due to the poor support of GPTQ for CPUs, the execution time per iteration exceeds a thousand seconds when operating on a CPU. Therefore, it is strongly discouraged to operate it on CPU.
 GPTQ is a technique used to quantize the parameters of large language models into integer formats such as int8 or int4. Although the quantization process may lead to a loss in model performance, it significantly reduces both file size and the required VRAM.

 | Name | Parameters | Size | type/quantize | Required VRAM |
 |------|------------|------|---------------|---------------|
 | [TheBloke/ALMA-7B-GPTQ](https://huggingface.co/TheBloke/ALMA-7B-GPTQ) | 7B | 3.9 GB | 4 Bits | ≈4.3 GB |
-| [TheBloke/ALMA-13B-GPTQ](https://huggingface.co/TheBloke/ALMA-13B-GPTQ) | 13B | 7.26 GB | 4 Bits | ≈8.
+| [TheBloke/ALMA-13B-GPTQ](https://huggingface.co/TheBloke/ALMA-13B-GPTQ) | 13B | 7.26 GB | 4 Bits | ≈8.4 GB |
+
+## ALMA-GGUF
+
+[GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) is a new format introduced by the llama.cpp team on August 21st 2023. It is a replacement for GGML, which is no longer supported by llama.cpp.
+GGUF is a file format for storing models for inference with GGML and executors based on GGML. GGUF is a binary format that is designed for fast loading and saving of models, and for ease of reading. Models are traditionally developed using PyTorch or another framework, and then converted to GGUF for use in GGML.
+[k-quants](https://github.com/ggerganov/llama.cpp/pull/1684): a series of 2-6 bit quantization methods, along with quantization mixes
+
+| Name | Parameters | Size | type/quantize | Required VRAM |
+|------|------------|------|---------------|---------------|
+| [TheBloke/ALMA-7B-GGUF-Q4_K_M](https://huggingface.co/TheBloke/ALMA-7B-GGUF) | 7B | 4.08 GB | Q4_K_M (4 Bits medium) | ≈5.3 GB |
+| [TheBloke/ALMA-13B-GGUF-Q4_K_M](https://huggingface.co/TheBloke/ALMA-13B-GGUF) | 13B | 7.87 GB | Q4_K_M (4 Bits medium) | ≈9.3 GB |
+
+## ALMA-CTranslate2
+
+[CTranslate2](https://opennmt.net/CTranslate2/) does not currently support 4-bit quantization. Currently, it can only use int8_float16 quantization, so the file size and required VRAM will be larger than the GPTQ model quantized with 4 bits. However, it runs much faster on the CPU than GPTQ. If you plan to run ALMA in an environment without a GPU, you may consider choosing the CTranslate2 version of the ALMA model.
+
+| Name | Parameters | Size | type/quantize | Required VRAM |
+|------|------------|------|---------------|---------------|
+| [avans06/ALMA-7B-ct2-int8_float16](https://huggingface.co/avans06/ALMA-7B-ct2-int8_float16) | 7B | 6.74 GB | int8_float16 | ≈6.6 GB |
+| [avans06/ALMA-13B-ct2-int8_float16](https://huggingface.co/avans06/ALMA-13B-ct2-int8_float16) | 13B | 13 GB | int8_float16 | ≈12.6 GB |


 # Options
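For the ALMA-CTranslate2 variants documented above, a minimal usage sketch outside the web UI might look as follows. It assumes the ctranslate2 and transformers packages and a locally downloaded copy of avans06/ALMA-7B-ct2-int8_float16; the web UI handles the download and builds the prompt itself.

```python
# Sketch only: translate one sentence with the CTranslate2 conversion of ALMA-7B on CPU.
import ctranslate2
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("haoranxu/ALMA-7B")
generator = ctranslate2.Generator("./ALMA-7B-ct2-int8_float16", device="cpu", compute_type="auto")

# Same prompt shape as the ALMAPrefix built in translationModel.py.
prompt = "Translate this from German to English:\nGerman: Guten Morgen!\nEnglish: "
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt))

results = generator.generate_batch(
    [tokens], max_length=128, sampling_topk=40, sampling_temperature=0.7,
    include_prompt_in_result=False,
)
print(tokenizer.decode(results[0].sequences_ids[0], skip_special_tokens=True))
```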
requirements-fasterWhisper.txt
CHANGED
@@ -17,7 +17,9 @@ srt
 torch
 https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip

-# Needed by ALMA
+# Needed by ALMA-GPTQ
 accelerate
 auto-gptq
-optimum
+optimum
+# Needed by ALMA-GGUL
+ctransformers[cuda]
requirements-whisper.txt
CHANGED
@@ -16,7 +16,9 @@ srt
 torch
 https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip

-# Needed by ALMA
+# Needed by ALMA-GPTQ
 accelerate
 auto-gptq
-optimum
+optimum
+# Needed by ALMA-GGUL
+ctransformers[cuda]
requirements.txt
CHANGED
@@ -17,7 +17,9 @@ srt
 torch
 https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip

-# Needed by ALMA
+# Needed by ALMA-GPTQ
 accelerate
 auto-gptq
-optimum
+optimum
+# Needed by ALMA-GGUL
+ctransformers[cuda]
src/config.py
CHANGED
@@ -5,7 +5,7 @@ from typing import List, Dict, Literal


 class ModelConfig:
-    def __init__(self, name: str, url: str, path: str = None, type: str = "whisper", tokenizer_url: str = None, revision: str = None):
+    def __init__(self, name: str, url: str, path: str = None, type: str = "whisper", tokenizer_url: str = None, revision: str = None, model_file: str = None,):
         """
         Initialize a model configuration.

@@ -17,6 +17,7 @@ class ModelConfig:
                   It can be a branch name, a tag name, or a commit id,
                   since we use a git-based system for storing models and other artifacts on huggingface.co,
                   so revision can be any identifier allowed by git.
+        model_file: The name of the model file in repo or directory.[from marella/ctransformers]
         """
         self.name = name
         self.url = url
@@ -24,6 +25,7 @@ class ModelConfig:
         self.type = type
         self.tokenizer_url = tokenizer_url
         self.revision = revision
+        self.model_file = model_file

 VAD_INITIAL_PROMPT_MODE_VALUES=["prepend_all_segments", "prepend_first_segment", "json_prompt_mode"]

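For context, each of the new config.json5 entries maps directly onto the extended ModelConfig constructor shown above. A small sketch (assuming src/config.py is importable as src.config) of the ALMA-7B GGUF entry expressed in code:

```python
# Sketch: the new ALMA-7B GGUF entry from config.json5 as a ModelConfig instance.
from src.config import ModelConfig

alma_7b_gguf = ModelConfig(
    name="ALMA-7B-GGUF-Q4_K_M/TheBloke",
    url="TheBloke/ALMA-7B-GGUF",
    type="huggingface",
    model_file="alma-7b.Q4_K_M.gguf",  # new field: the .gguf file inside the repo, used by ctransformers
    tokenizer_url="haoranxu/ALMA-7B",
)
print(alma_7b_gguf.model_file)
```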
src/translation/translationModel.py
CHANGED
@@ -65,7 +65,7 @@ class TranslationModel:
         if os.path.isdir(modelConfig.url):
             self.modelPath = modelConfig.url
         else:
-            self.modelPath = download_model(
+            self.modelPath = modelConfig.url if getattr(modelConfig, "model_file", None) is not None else download_model(
                 modelConfig,
                 localFilesOnly=localFilesOnly,
                 cacheDir=downloadRoot,
@@ -137,6 +137,12 @@ class TranslationModel:
         If you're doing inference on a CPU with AutoGPTQ (version > 0.4.2), then you'll need to disable the ExLlama kernel.
         This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file.
         https://github.com/huggingface/transformers/blob/main/docs/source/en/quantization.md#exllama
+
+        [ctransformers]
+        gpu_layers
+            means number of layers to run on GPU. Depending on how much GPU memory is available you can increase gpu_layers. Start with a larger value gpu_layers=100 and if it runs out of memory, try smaller values.
+            To run some of the model layers on GPU, set the `gpu_layers` parameter
+            https://github.com/marella/ctransformers/issues/68
         """
         try:
             print('\n\nLoading model: %s\n\n' % self.modelPath)
@@ -152,7 +158,7 @@ class TranslationModel:
             elif "ALMA" in self.modelPath:
                 self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url if self.modelConfig.tokenizer_url is not None and len(self.modelConfig.tokenizer_url) > 0 else self.modelPath)
                 self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
-                self.transModel = ctranslate2.Generator(self.modelPath, device=self.device)
+                self.transModel = ctranslate2.Generator(self.modelPath, compute_type="auto", device=self.device)
             elif "mt5" in self.modelPath:
                 self.mt5Prefix = self.whisperLang.whisper.code + "2" + self.translationLang.whisper.code + ": "
                 self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False) #requires spiece.model
@@ -160,16 +166,24 @@ class TranslationModel:
                 self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
             elif "ALMA" in self.modelPath:
                 self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
-
-
-
-
-
-
-
-
-
-
+                if "GPTQ" in self.modelPath:
+                    self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath, use_fast=True)
+                    if self.device == "cpu":
+                        # Due to the poor support of GPTQ for CPUs, Therefore, it is strongly discouraged to operate it on CPU.
+                        # set torch_dtype=torch.float32 to prevent the occurrence of the exception "addmm_impl_cpu_ not implemented for 'Half'."
+                        transModelConfig = transformers.AutoConfig.from_pretrained(self.modelPath)
+                        transModelConfig.quantization_config["use_exllama"] = False
+                        self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, device_map="auto", low_cpu_mem_usage=True, trust_remote_code=False, revision=self.modelConfig.revision, config=transModelConfig, torch_dtype=torch.float32)
+                    else:
+                        # transModelConfig.quantization_config["exllama_config"] = {"version":2} # After configuring to use ExLlamaV2, VRAM cannot be effectively released, which may be an issue. Temporarily not adopting the V2 version.
+                        self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, device_map="auto", low_cpu_mem_usage=True, trust_remote_code=False, revision=self.modelConfig.revision)
+                elif "GGUF" in self.modelPath:
+                    import ctransformers
+                    self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url)
+                    if self.device == "cpu":
+                        self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file)
+                    else:
+                        self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, gpu_layers=50)
                 self.transTranslator = transformers.pipeline("text-generation", model=self.transModel, tokenizer=self.transTokenizer, do_sample=True, temperature=0.7, top_k=40, top_p=0.95, repetition_penalty=1.1)
             else:
                 self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath)
@@ -180,22 +194,31 @@ class TranslationModel:
                 self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.nllb.code, tgt_lang=self.translationLang.nllb.code)

         except Exception as e:
-            print(traceback.format_exc())
             self.release_vram()
+            raise e
+

     def release_vram(self):
         try:
             if torch.cuda.is_available():
                 if "ct2" not in self.modelPath:
                     try:
-
-
+                        if getattr(self, "transModel", None) is not None:
+                            device = torch.device("cpu")
+                            self.transModel.to(device)
                     except Exception as e:
                         print(traceback.format_exc())
                         print("\tself.transModel.to cpu, error: " + str(e))
-
-
-
+                if getattr(self, "transTranslator", None) is not None:
+                    del self.transTranslator
+                if "ct2" in self.modelPath:
+                    if getattr(self, "transModel", None) is not None and getattr(self.transModel, "unload_model", None) is not None:
+                        self.transModel.unload_model()
+
+                if getattr(self, "transTokenizer", None) is not None:
+                    del self.transTokenizer
+                if getattr(self, "transModel", None) is not None:
+                    del self.transModel
                 try:
                     torch.cuda.empty_cache()
                 except Exception as e:
@@ -205,6 +228,7 @@ class TranslationModel:
             gc.collect()
             print("release vram end.")
         except Exception as e:
+            print(traceback.format_exc())
             print("Error release vram: " + str(e))


@@ -257,7 +281,10 @@ class TranslationModel:
             output = self.transTranslator(self.mt5Prefix + text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams) #, num_return_sequences=2
             result = output[0]['generated_text']
         elif "ALMA" in self.modelPath:
-
+            if "GPTQ" in self.modelPath:
+                output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
+            elif "GGUF" in self.modelPath:
+                output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
             result = output[0]['generated_text']
         else: #M2M100 & NLLB
             output = self.transTranslator(text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams)
@@ -332,7 +359,8 @@ def download_model(
         "vocab.json", #m2m100
         "model.safetensors",
         "quantize_config.json",
-        "tokenizer.model"
+        "tokenizer.model",
+        "vocabulary.json"
     ]

     kwargs = {