Illumotion committed on
Commit
1ec6819
·
1 Parent(s): 3d3c4d2

Upload folder using huggingface_hub

Browse files
.flake8 ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [flake8]
2
+ max-line-length = 125
.gitignore CHANGED
@@ -22,6 +22,7 @@ build-metal/
22
  build-no-accel/
23
  build-sanitize-addr/
24
  build-sanitize-thread/
 
25
 
26
  /main
27
  /quantize
@@ -29,13 +30,16 @@ build-sanitize-thread/
29
  /result
30
  /perplexity
31
  /embedding
 
32
  /benchmark-matmult
33
  /vdot
 
34
  /Pipfile
35
  /libllama.so
36
 
37
  arm_neon.h
38
  compile_commands.json
 
39
 
40
  __pycache__
41
 
 
22
  build-no-accel/
23
  build-sanitize-addr/
24
  build-sanitize-thread/
25
+ out/
26
 
27
  /main
28
  /quantize
 
30
  /result
31
  /perplexity
32
  /embedding
33
+ /train-text-from-scratch
34
  /benchmark-matmult
35
  /vdot
36
+ /server
37
  /Pipfile
38
  /libllama.so
39
 
40
  arm_neon.h
41
  compile_commands.json
42
+ CMakeSettings.json
43
 
44
  __pycache__
45
 
.pre-commit-config.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # See https://pre-commit.com for more information
2
+ # See https://pre-commit.com/hooks.html for more hooks
3
+ exclude: prompts/.*.txt
4
+ repos:
5
+ - repo: https://github.com/pre-commit/pre-commit-hooks
6
+ rev: v3.2.0
7
+ hooks:
8
+ - id: trailing-whitespace
9
+ - id: end-of-file-fixer
10
+ - id: check-yaml
11
+ - id: check-added-large-files
12
+ - repo: https://github.com/PyCQA/flake8
13
+ rev: 6.0.0
14
+ hooks:
15
+ - id: flake8
Dockerfile CHANGED
@@ -2,10 +2,9 @@ FROM python
2
  WORKDIR /app
3
  COPY . .
4
  RUN apt update \
5
- && apt install build-essential wget libopenblas-dev libclblast-dev make -y \
6
- && make LLAMA_OPENBLAS=1 LLAMA_CLBLAST=1 \
7
  && wget https://huggingface.co/xzuyn/GPT-J-Shinen-6B-GGML/resolve/main/ggjtv1-model-q5_1.bin \
8
- && apt remove build-essential wget make -y \
9
- && apt autoremove -y
10
 
11
- ENTRYPOINT ["python", "koboldcpp.py", "ggjtv1-model-q5_1.bin", "--port", "7860", "--smartcontext", "--useclblast"]
 
2
  WORKDIR /app
3
  COPY . .
4
  RUN apt update \
5
+ && apt install build-essential wget libopenblas-dev make -y \
6
+ && make \
7
  && wget https://huggingface.co/xzuyn/GPT-J-Shinen-6B-GGML/resolve/main/ggjtv1-model-q5_1.bin \
8
+ && apt remove build-essential wget make -y
 
9
 
10
+ ENTRYPOINT ["python", "koboldcpp.py", "ggjtv1-model-q5_1.bin", "--port", "7860", "--smartcontext", "--useclblast", "{0, 1, 2, 3}"]
Package.swift CHANGED
@@ -11,6 +11,7 @@ let package = Package(
11
  .target(
12
  name: "llama",
13
  path: ".",
 
14
  sources: ["ggml.c", "llama.cpp"],
15
  publicHeadersPath: "spm-headers",
16
  cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
 
11
  .target(
12
  name: "llama",
13
  path: ".",
14
+ exclude: ["ggml-metal.metal"],
15
  sources: ["ggml.c", "llama.cpp"],
16
  publicHeadersPath: "spm-headers",
17
  cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
convert.py CHANGED
@@ -512,7 +512,11 @@ class LazyTensor:
512
  if not isinstance(self.data_type, QuantizedDataType):
513
  raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})")
514
  if self.data_type.have_g_idx:
515
- sys.stderr.write("Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), which is not yet natively supported by GGML. For now you can still convert this model by passing `--outtype f16` to dequantize, but that will result in a much larger output file for no quality benefit.\n")
 
 
 
 
516
  sys.exit(1)
517
  assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends
518
 
@@ -694,8 +698,9 @@ class LazyUnpickler(pickle.Unpickler):
694
  description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
695
  return LazyStorage(load=load, kind=pid[1], description=description)
696
 
697
- # @staticmethod
698
- def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, # pyright: ignore[reportSelfClsParameterName]
 
699
  requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
700
  assert isinstance(storage, LazyStorage)
701
 
@@ -812,7 +817,7 @@ def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus:
812
  # Use mmap for the actual data to avoid race conditions with the file offset.
813
  off = fp.raw.tell()
814
  mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
815
- fp.raw.seek(off) # needed on Windows
816
 
817
  def read_tensor() -> None: # this is a function so that variables captured in `load` don't change
818
  shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))
@@ -1054,7 +1059,7 @@ def load_some_model(path: Path) -> ModelPlus:
1054
  files = list(path.glob("model-00001-of-*.safetensors"))
1055
  if not files:
1056
  # Try the PyTorch patterns too, with lower priority
1057
- globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin" ]
1058
  files = [file for glob in globs for file in path.glob(glob)]
1059
  if not files:
1060
  # Try GGML too, but with lower priority, since if both a non-GGML
@@ -1094,7 +1099,9 @@ def load_vocab(path: Path) -> SentencePieceVocab:
1094
  elif path3.exists():
1095
  path = path3
1096
  else:
1097
- raise FileNotFoundError(f"Could not find tokenizer.model in {path} or its parent; if it's in another directory, pass the directory as --vocab-dir")
 
 
1098
  added_tokens_path = path.parent / "added_tokens.json"
1099
  print(f"Loading vocab file {path}")
1100
  return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
@@ -1110,7 +1117,9 @@ def default_outfile(model_paths: List[Path], params: Params) -> Path:
1110
  }[params.file_type]
1111
  ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
1112
  if ret in model_paths:
1113
- sys.stderr.write(f"Error: Default output path ({ret}) would overwrite the input. Please explicitly specify a path using --outfile.\n")
 
 
1114
  sys.exit(1)
1115
  return ret
1116
 
@@ -1131,7 +1140,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
1131
  parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
1132
  parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
1133
  parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
1134
- parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
 
1135
  args = parser.parse_args(args_in)
1136
 
1137
  vocab: Vocab
 
512
  if not isinstance(self.data_type, QuantizedDataType):
513
  raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})")
514
  if self.data_type.have_g_idx:
515
+ sys.stderr.write(
516
+ "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), "
517
+ "which is not yet natively supported by GGML. "
518
+ "For now you can still convert this model by passing `--outtype f16` to dequantize, "
519
+ "but that will result in a much larger output file for no quality benefit.\n")
520
  sys.exit(1)
521
  assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends
522
 
 
698
  description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
699
  return LazyStorage(load=load, kind=pid[1], description=description)
700
 
701
+ # @staticmethod
702
+ def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
703
+ # pyright: ignore[reportSelfClsParameterName]
704
  requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
705
  assert isinstance(storage, LazyStorage)
706
 
 
817
  # Use mmap for the actual data to avoid race conditions with the file offset.
818
  off = fp.raw.tell()
819
  mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
820
+ fp.raw.seek(off) # needed on Windows
821
 
822
  def read_tensor() -> None: # this is a function so that variables captured in `load` don't change
823
  shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))
 
1059
  files = list(path.glob("model-00001-of-*.safetensors"))
1060
  if not files:
1061
  # Try the PyTorch patterns too, with lower priority
1062
+ globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
1063
  files = [file for glob in globs for file in path.glob(glob)]
1064
  if not files:
1065
  # Try GGML too, but with lower priority, since if both a non-GGML
 
1099
  elif path3.exists():
1100
  path = path3
1101
  else:
1102
+ raise FileNotFoundError(
1103
+ f"Could not find tokenizer.model in {path} or its parent; "
1104
+ "if it's in another directory, pass the directory as --vocab-dir")
1105
  added_tokens_path = path.parent / "added_tokens.json"
1106
  print(f"Loading vocab file {path}")
1107
  return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
 
1117
  }[params.file_type]
1118
  ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
1119
  if ret in model_paths:
1120
+ sys.stderr.write(
1121
+ f"Error: Default output path ({ret}) would overwrite the input. "
1122
+ "Please explicitly specify a path using --outfile.\n")
1123
  sys.exit(1)
1124
  return ret
1125
 
 
1140
  parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
1141
  parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
1142
  parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
1143
+ parser.add_argument("model", type=Path,
1144
+ help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
1145
  args = parser.parse_args(args_in)
1146
 
1147
  vocab: Vocab
examples/baby-llama/baby-llama.cpp CHANGED
@@ -4,6 +4,10 @@
4
  #include <random>
5
  #include <cstring>
6
 
 
 
 
 
7
  float frand() {
8
  return (float)rand()/(float)RAND_MAX;
9
  }
@@ -1470,7 +1474,7 @@ struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_te
1470
  }
1471
 
1472
  struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
1473
- const float eps = 1e-3;
1474
  return
1475
  ggml_sum(ctx,
1476
  ggml_neg(ctx,
 
4
  #include <random>
5
  #include <cstring>
6
 
7
+ #if defined(_MSC_VER)
8
+ #pragma warning(disable: 4244 4267) // possible loss of data
9
+ #endif
10
+
11
  float frand() {
12
  return (float)rand()/(float)RAND_MAX;
13
  }
 
1474
  }
1475
 
1476
  struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
1477
+ const float eps = 1e-3f;
1478
  return
1479
  ggml_sum(ctx,
1480
  ggml_neg(ctx,
examples/benchmark/benchmark-matmult.cpp CHANGED
@@ -16,6 +16,10 @@
16
  #include <iterator>
17
  #include <algorithm>
18
 
 
 
 
 
19
  float tensor_sum_elements(const ggml_tensor * tensor) {
20
  float sum = 0;
21
  if (tensor->type==GGML_TYPE_F32) {
@@ -29,9 +33,9 @@ float tensor_sum_elements(const ggml_tensor * tensor) {
29
  }
30
 
31
  void tensor_dump(const ggml_tensor * tensor, const char * name) {
32
- printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", name,
33
  tensor->type, ggml_type_name(tensor->type),
34
- (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
35
  float sum = tensor_sum_elements(tensor);
36
  printf("Sum of tensor %s is %6.2f\n", name, sum);
37
  }
@@ -120,7 +124,7 @@ int main(int argc, char ** argv) {
120
  ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
121
  ctx_size += 1024*1024*16;
122
 
123
- printf("Allocating Memory of size %li bytes, %li MB\n",ctx_size, (ctx_size/1024/1024));
124
 
125
  struct ggml_init_params params = {
126
  /*.mem_size =*/ ctx_size,
 
16
  #include <iterator>
17
  #include <algorithm>
18
 
19
+ #if defined(_MSC_VER)
20
+ #pragma warning(disable: 4244 4267) // possible loss of data
21
+ #endif
22
+
23
  float tensor_sum_elements(const ggml_tensor * tensor) {
24
  float sum = 0;
25
  if (tensor->type==GGML_TYPE_F32) {
 
33
  }
34
 
35
  void tensor_dump(const ggml_tensor * tensor, const char * name) {
36
+ printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
37
  tensor->type, ggml_type_name(tensor->type),
38
+ tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
39
  float sum = tensor_sum_elements(tensor);
40
  printf("Sum of tensor %s is %6.2f\n", name, sum);
41
  }
 
124
  ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
125
  ctx_size += 1024*1024*16;
126
 
127
+ printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
128
 
129
  struct ggml_init_params params = {
130
  /*.mem_size =*/ ctx_size,
examples/chat-vicuna.sh ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ set -e
4
+
5
+ cd "$(dirname "$0")/.." || exit
6
+
7
+ MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
8
+ PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
9
+ USER_NAME="### Human"
10
+ AI_NAME="### Assistant"
11
+
12
+ # Adjust to the number of CPU cores you want to use.
13
+ N_THREAD="${N_THREAD:-8}"
14
+ # Number of tokens to predict (made it larger than default because we want a long interaction)
15
+ N_PREDICTS="${N_PREDICTS:-2048}"
16
+
17
+ # Note: you can also override the generation options by specifying them on the command line:
18
+ # For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
19
+ GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
20
+
21
+ DATE_TIME=$(date +%H:%M)
22
+ DATE_YEAR=$(date +%Y)
23
+
24
+ PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
25
+
26
+ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
27
+ -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
28
+ -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
29
+ -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
30
+ $PROMPT_TEMPLATE > $PROMPT_FILE
31
+
32
+ # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
33
+ ./bin/main $GEN_OPTIONS \
34
+ --model "$MODEL" \
35
+ --threads "$N_THREAD" \
36
+ --n_predict "$N_PREDICTS" \
37
+ --color --interactive \
38
+ --file ${PROMPT_FILE} \
39
+ --reverse-prompt "### Human:" \
40
+ --in-prefix ' ' \
41
+ "$@"
examples/common.cpp CHANGED
@@ -28,6 +28,10 @@
28
  #include <wchar.h>
29
  #endif
30
 
 
 
 
 
31
  int32_t get_num_physical_cores() {
32
  #ifdef __linux__
33
  // enumerate the set of thread siblings, num entries is num cores
@@ -331,6 +335,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
331
  }
332
  #else
333
  fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
 
 
 
 
 
 
334
  #endif // GGML_USE_CUBLAS
335
  } else if (arg == "--no-mmap") {
336
  params.use_mmap = false;
@@ -367,7 +377,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
367
  } else {
368
  throw std::exception();
369
  }
370
- } catch (const std::exception &e) {
371
  invalid_param = true;
372
  break;
373
  }
@@ -406,6 +416,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
406
  gpt_print_usage(argc, argv, default_params);
407
  exit(1);
408
  }
 
 
 
 
 
 
 
 
409
  if (escape_prompt) {
410
  process_escapes(params.prompt);
411
  }
@@ -479,6 +497,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
479
  fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
480
  fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
481
  fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
 
482
  #endif
483
  fprintf(stderr, " --mtest compute maximum memory usage\n");
484
  fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
@@ -528,6 +547,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
528
  lparams.n_gpu_layers = params.n_gpu_layers;
529
  lparams.main_gpu = params.main_gpu;
530
  memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
 
531
  lparams.seed = params.seed;
532
  lparams.f16_kv = params.memory_f16;
533
  lparams.use_mmap = params.use_mmap;
 
28
  #include <wchar.h>
29
  #endif
30
 
31
+ #if defined(_MSC_VER)
32
+ #pragma warning(disable: 4244 4267) // possible loss of data
33
+ #endif
34
+
35
  int32_t get_num_physical_cores() {
36
  #ifdef __linux__
37
  // enumerate the set of thread siblings, num entries is num cores
 
335
  }
336
  #else
337
  fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
338
+ #endif // GGML_USE_CUBLAS
339
+ } else if (arg == "--low-vram" || arg == "-lv") {
340
+ #ifdef GGML_USE_CUBLAS
341
+ params.low_vram = true;
342
+ #else
343
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
344
  #endif // GGML_USE_CUBLAS
345
  } else if (arg == "--no-mmap") {
346
  params.use_mmap = false;
 
377
  } else {
378
  throw std::exception();
379
  }
380
+ } catch (const std::exception&) {
381
  invalid_param = true;
382
  break;
383
  }
 
416
  gpt_print_usage(argc, argv, default_params);
417
  exit(1);
418
  }
419
+
420
+ #ifdef GGML_USE_CUBLAS
421
+ if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
422
+ fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
423
+ exit(1);
424
+ }
425
+ #endif // GGML_USE_CUBLAS
426
+
427
  if (escape_prompt) {
428
  process_escapes(params.prompt);
429
  }
 
497
  fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
498
  fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
499
  fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
500
+ fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
501
  #endif
502
  fprintf(stderr, " --mtest compute maximum memory usage\n");
503
  fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
 
547
  lparams.n_gpu_layers = params.n_gpu_layers;
548
  lparams.main_gpu = params.main_gpu;
549
  memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
550
+ lparams.low_vram = params.low_vram;
551
  lparams.seed = params.seed;
552
  lparams.f16_kv = params.memory_f16;
553
  lparams.use_mmap = params.use_mmap;
examples/common.h CHANGED
@@ -21,15 +21,16 @@
21
  int32_t get_num_physical_cores();
22
 
23
  struct gpt_params {
24
- int32_t seed = -1; // RNG seed
25
- int32_t n_threads = get_num_physical_cores();
26
- int32_t n_predict = -1; // new tokens to predict
27
- int32_t n_ctx = 512; // context size
28
- int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
29
- int32_t n_keep = 0; // number of tokens to keep from initial prompt
30
- int32_t n_gpu_layers = 0; // number of layers to store in VRAM
31
- int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
32
  float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
 
33
 
34
  // sampling parameters
35
  std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
 
21
  int32_t get_num_physical_cores();
22
 
23
  struct gpt_params {
24
+ int32_t seed = -1; // RNG seed
25
+ int32_t n_threads = get_num_physical_cores();
26
+ int32_t n_predict = -1; // new tokens to predict
27
+ int32_t n_ctx = 512; // context size
28
+ int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
29
+ int32_t n_keep = 0; // number of tokens to keep from initial prompt
30
+ int32_t n_gpu_layers = 0; // number of layers to store in VRAM
31
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
32
  float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
33
+ bool low_vram = 0; // if true, reduce VRAM usage at the cost of performance
34
 
35
  // sampling parameters
36
  std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
examples/embedding/embedding.cpp CHANGED
@@ -4,6 +4,10 @@
4
 
5
  #include <ctime>
6
 
 
 
 
 
7
  int main(int argc, char ** argv) {
8
  gpt_params params;
9
 
 
4
 
5
  #include <ctime>
6
 
7
+ #if defined(_MSC_VER)
8
+ #pragma warning(disable: 4244 4267) // possible loss of data
9
+ #endif
10
+
11
  int main(int argc, char ** argv) {
12
  gpt_params params;
13
 
examples/jeopardy/graph.py CHANGED
@@ -1,5 +1,5 @@
1
  import matplotlib.pyplot as plt
2
- import sys, os
3
  import csv
4
 
5
  labels = []
@@ -8,6 +8,7 @@ numEntries = 1
8
 
9
  rows = []
10
 
 
11
  def bar_chart(numbers, labels, pos):
12
  plt.bar(pos, numbers, color='blue')
13
  plt.xticks(ticks=pos, labels=labels)
@@ -16,6 +17,7 @@ def bar_chart(numbers, labels, pos):
16
  plt.ylabel("Questions Correct")
17
  plt.show()
18
 
 
19
  def calculatecorrect():
20
  directory = os.fsencode("./examples/jeopardy/results/")
21
  csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')
@@ -38,14 +40,13 @@ def calculatecorrect():
38
  print(line)
39
  else:
40
  print("Correct answer: " + rows[i][2] + "\n")
41
- i+=1
42
  print("Did the AI get the question right? (y/n)")
43
  if input() == "y":
44
  totalcorrect += 1
45
  numbers.append(totalcorrect)
46
 
47
 
48
-
49
  if __name__ == '__main__':
50
  calculatecorrect()
51
  pos = list(range(numEntries))
 
1
  import matplotlib.pyplot as plt
2
+ import os
3
  import csv
4
 
5
  labels = []
 
8
 
9
  rows = []
10
 
11
+
12
  def bar_chart(numbers, labels, pos):
13
  plt.bar(pos, numbers, color='blue')
14
  plt.xticks(ticks=pos, labels=labels)
 
17
  plt.ylabel("Questions Correct")
18
  plt.show()
19
 
20
+
21
  def calculatecorrect():
22
  directory = os.fsencode("./examples/jeopardy/results/")
23
  csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')
 
40
  print(line)
41
  else:
42
  print("Correct answer: " + rows[i][2] + "\n")
43
+ i += 1
44
  print("Did the AI get the question right? (y/n)")
45
  if input() == "y":
46
  totalcorrect += 1
47
  numbers.append(totalcorrect)
48
 
49
 
 
50
  if __name__ == '__main__':
51
  calculatecorrect()
52
  pos = list(range(numEntries))
examples/main/README.md CHANGED
@@ -288,5 +288,6 @@ These options provide extra functionality and customization when running the LLa
288
  - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
289
  - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
290
  - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 
291
  - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
292
  - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
 
288
  - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
289
  - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
290
  - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
291
+ - `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
292
  - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
293
  - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
examples/main/main.cpp CHANGED
@@ -23,11 +23,17 @@
23
  #include <unistd.h>
24
  #elif defined (_WIN32)
25
  #define WIN32_LEAN_AND_MEAN
 
26
  #define NOMINMAX
 
27
  #include <windows.h>
28
  #include <signal.h>
29
  #endif
30
 
 
 
 
 
31
  static console_state con_st;
32
  static llama_context ** g_ctx;
33
 
@@ -348,7 +354,7 @@ int main(int argc, char ** argv) {
348
  if ((int)embd.size() > max_embd_size) {
349
  auto skipped_tokens = embd.size() - max_embd_size;
350
  console_set_color(con_st, CONSOLE_COLOR_ERROR);
351
- printf("<<input too long: skipped %ld token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
352
  console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
353
  fflush(stdout);
354
  embd.resize(max_embd_size);
 
23
  #include <unistd.h>
24
  #elif defined (_WIN32)
25
  #define WIN32_LEAN_AND_MEAN
26
+ #ifndef NOMINMAX
27
  #define NOMINMAX
28
+ #endif
29
  #include <windows.h>
30
  #include <signal.h>
31
  #endif
32
 
33
+ #if defined(_MSC_VER)
34
+ #pragma warning(disable: 4244 4267) // possible loss of data
35
+ #endif
36
+
37
  static console_state con_st;
38
  static llama_context ** g_ctx;
39
 
 
354
  if ((int)embd.size() > max_embd_size) {
355
  auto skipped_tokens = embd.size() - max_embd_size;
356
  console_set_color(con_st, CONSOLE_COLOR_ERROR);
357
+ printf("<<input too long: skipped %" PRIu64 " token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
358
  console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
359
  fflush(stdout);
360
  embd.resize(max_embd_size);
examples/perplexity/perplexity.cpp CHANGED
@@ -5,6 +5,10 @@
5
  #include <cmath>
6
  #include <ctime>
7
 
 
 
 
 
8
  std::vector<float> softmax(const std::vector<float>& logits) {
9
  std::vector<float> probs(logits.size());
10
  float max_logit = logits[0];
 
5
  #include <cmath>
6
  #include <ctime>
7
 
8
+ #if defined(_MSC_VER)
9
+ #pragma warning(disable: 4244 4267) // possible loss of data
10
+ #endif
11
+
12
  std::vector<float> softmax(const std::vector<float>& logits) {
13
  std::vector<float> probs(logits.size());
14
  float max_logit = logits[0];
examples/quantize-stats/quantize-stats.cpp CHANGED
@@ -19,6 +19,10 @@
19
  #include <thread>
20
  #include <mutex>
21
 
 
 
 
 
22
  struct quantize_stats_params {
23
  std::string model = "models/7B/ggml-model-f16.bin";
24
  bool verbose = false;
 
19
  #include <thread>
20
  #include <mutex>
21
 
22
+ #if defined(_MSC_VER)
23
+ #pragma warning(disable: 4244 4267) // possible loss of data
24
+ #endif
25
+
26
  struct quantize_stats_params {
27
  std::string model = "models/7B/ggml-model-f16.bin";
28
  bool verbose = false;
examples/save-load-state/save-load-state.cpp CHANGED
@@ -37,7 +37,7 @@ int main(int argc, char ** argv) {
37
  // init
38
  auto ctx = llama_init_from_file(params.model.c_str(), lparams);
39
  auto tokens = std::vector<llama_token>(params.n_ctx);
40
- auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
41
 
42
  if (n_prompt_tokens < 1) {
43
  fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
 
37
  // init
38
  auto ctx = llama_init_from_file(params.model.c_str(), lparams);
39
  auto tokens = std::vector<llama_token>(params.n_ctx);
40
+ auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
41
 
42
  if (n_prompt_tokens < 1) {
43
  fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
examples/server/CMakeLists.txt CHANGED
@@ -1,6 +1,10 @@
1
  set(TARGET server)
 
2
  include_directories(${CMAKE_CURRENT_SOURCE_DIR})
3
  add_executable(${TARGET} server.cpp json.hpp httplib.h)
 
 
 
4
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
5
  target_compile_features(${TARGET} PRIVATE cxx_std_11)
6
  if(TARGET BUILD_INFO)
 
1
  set(TARGET server)
2
+ option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
3
  include_directories(${CMAKE_CURRENT_SOURCE_DIR})
4
  add_executable(${TARGET} server.cpp json.hpp httplib.h)
5
+ target_compile_definitions(${TARGET} PRIVATE
6
+ SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
7
+ )
8
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
9
  target_compile_features(${TARGET} PRIVATE cxx_std_11)
10
  if(TARGET BUILD_INFO)
examples/server/README.md CHANGED
@@ -1,33 +1,74 @@
1
  # llama.cpp/example/server
2
 
3
- This example allow you to have a llama.cpp http server to interact from a web page or consume the API.
4
 
5
- ## Table of Contents
6
 
7
- 1. [Quick Start](#quick-start)
8
- 2. [Node JS Test](#node-js-test)
9
- 3. [API Endpoints](#api-endpoints)
10
- 4. [More examples](#more-examples)
11
- 5. [Common Options](#common-options)
12
- 6. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  ## Quick Start
15
 
16
  To get started right away, run the following command, making sure to use the correct path for the model you have:
17
 
18
- #### Unix-based systems (Linux, macOS, etc.):
19
 
20
  ```bash
21
- ./server -m models/7B/ggml-model.bin --ctx_size 2048
22
  ```
23
 
24
- #### Windows:
25
 
26
  ```powershell
27
- server.exe -m models\7B\ggml-model.bin --ctx_size 2048
28
  ```
29
 
30
- That will start a server that by default listens on `127.0.0.1:8080`. You can consume the endpoints with Postman or NodeJS with axios library.
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  ## Node JS Test
33
 
@@ -50,7 +91,6 @@ const prompt = `Building a website can be done in 10 simple steps:`;
50
  async function Test() {
51
  let result = await axios.post("http://127.0.0.1:8080/completion", {
52
  prompt,
53
- batch_size: 128,
54
  n_predict: 512,
55
  });
56
 
@@ -69,246 +109,75 @@ node .
69
 
70
  ## API Endpoints
71
 
72
- You can interact with this API Endpoints. This implementations just support chat style interaction.
73
 
74
- - **POST** `hostname:port/completion`: Setting up the Llama Context to begin the completions tasks.
75
 
76
- *Options:*
77
 
78
- `batch_size`: Set the batch size for prompt processing (default: 512).
79
 
80
- `temperature`: Adjust the randomness of the generated text (default: 0.8).
81
 
82
- `top_k`: Limit the next token selection to the K most probable tokens (default: 40).
83
 
84
- `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
 
85
 
86
- `n_predict`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
87
 
88
- `threads`: Set the number of threads to use during computation.
89
 
90
- `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
 
91
 
92
- `as_loop`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
93
 
94
- `interactive`: It allows interacting with the completion, and the completion stops as soon as it encounters a `stop word`. To enable this, set to `true`.
95
 
96
- `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate.
97
 
98
- `stop`: Specify the words or characters that indicate a stop. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration.
99
 
100
- `exclude`: Specify the words or characters you do not want to appear in the completion. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration.
101
 
102
- - **POST** `hostname:port/embedding`: Generate embedding of a given text
103
 
104
- *Options:*
105
 
106
- `content`: Set the text to get generate the embedding.
107
 
108
- `threads`: Set the number of threads to use during computation.
109
 
110
- To use this endpoint, you need to start the server with the `--embedding` option added.
111
 
112
- - **POST** `hostname:port/tokenize`: Tokenize a given text
113
 
114
- *Options:*
115
 
116
- `content`: Set the text to tokenize.
117
 
118
- - **GET** `hostname:port/next-token`: Receive the next token predicted, execute this request in a loop. Make sure set `as_loop` as `true` in the completion request.
119
 
120
- *Options:*
121
 
122
- `stop`: Set `hostname:port/next-token?stop=true` to stop the token generation.
123
 
124
  ## More examples
125
 
126
  ### Interactive mode
127
 
128
- This mode allows interacting in a chat-like manner. It is recommended for models designed as assistants such as `Vicuna`, `WizardLM`, `Koala`, among others. Make sure to add the correct stop word for the corresponding model.
129
-
130
- The prompt should be generated by you, according to the model's guidelines. You should keep adding the model's completions to the context as well.
131
-
132
- This example works well for `Vicuna - version 1`.
133
-
134
- ```javascript
135
- const axios = require("axios");
136
-
137
- let prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
138
- ### Human: Hello, Assistant.
139
- ### Assistant: Hello. How may I help you today?
140
- ### Human: Please tell me the largest city in Europe.
141
- ### Assistant: Sure. The largest city in Europe is Moscow, the capital of Russia.`;
142
-
143
- async function ChatCompletion(answer) {
144
- // the user's next question to the prompt
145
- prompt += `\n### Human: ${answer}\n`
146
-
147
- result = await axios.post("http://127.0.0.1:8080/completion", {
148
- prompt,
149
- batch_size: 128,
150
- temperature: 0.2,
151
- top_k: 40,
152
- top_p: 0.9,
153
- n_keep: -1,
154
- n_predict: 2048,
155
- stop: ["\n### Human:"], // when detect this, stop completion
156
- exclude: ["### Assistant:"], // no show in the completion
157
- threads: 8,
158
- as_loop: true, // use this to request the completion token by token
159
- interactive: true, // enable the detection of a stop word
160
- });
161
-
162
- // create a loop to receive every token predicted
163
- // note: this operation is blocking, avoid use this in a ui thread
164
-
165
- let message = "";
166
- while (true) {
167
- // you can stop the inference adding '?stop=true' like this http://127.0.0.1:8080/next-token?stop=true
168
- result = await axios.get("http://127.0.0.1:8080/next-token");
169
- process.stdout.write(result.data.content);
170
- message += result.data.content;
171
-
172
- // to avoid an infinite loop
173
- if (result.data.stop) {
174
- console.log("Completed");
175
- // make sure to add the completion to the prompt.
176
- prompt += `### Assistant: ${message}`;
177
- break;
178
- }
179
- }
180
- }
181
-
182
- // This function should be called every time a question to the model is needed.
183
- async function Test() {
184
- // the server can't inference in paralell
185
- await ChatCompletion("Write a long story about a time magician in a fantasy world");
186
- await ChatCompletion("Summary the story");
187
- }
188
-
189
- Test();
190
- ```
191
-
192
- ### Alpaca example
193
-
194
- **Temporaly note:** no tested, if you have the model, please test it and report me some issue
195
-
196
- ```javascript
197
- const axios = require("axios");
198
-
199
- let prompt = `Below is an instruction that describes a task. Write a response that appropriately completes the request.
200
- `;
201
-
202
- async function DoInstruction(instruction) {
203
- prompt += `\n\n### Instruction:\n\n${instruction}\n\n### Response:\n\n`;
204
- result = await axios.post("http://127.0.0.1:8080/completion", {
205
- prompt,
206
- batch_size: 128,
207
- temperature: 0.2,
208
- top_k: 40,
209
- top_p: 0.9,
210
- n_keep: -1,
211
- n_predict: 2048,
212
- stop: ["### Instruction:\n\n"], // when detect this, stop completion
213
- exclude: [], // no show in the completion
214
- threads: 8,
215
- as_loop: true, // use this to request the completion token by token
216
- interactive: true, // enable the detection of a stop word
217
- });
218
-
219
- // create a loop to receive every token predicted
220
- // note: this operation is blocking, avoid use this in a ui thread
221
-
222
- let message = "";
223
- while (true) {
224
- result = await axios.get("http://127.0.0.1:8080/next-token");
225
- process.stdout.write(result.data.content);
226
- message += result.data.content;
227
-
228
- // to avoid an infinite loop
229
- if (result.data.stop) {
230
- console.log("Completed");
231
- // make sure to add the completion and the user's next question to the prompt.
232
- prompt += message;
233
- break;
234
- }
235
- }
236
- }
237
-
238
- // This function should be called every time a instruction to the model is needed.
239
- DoInstruction("Destroy the world"); // as joke
240
- ```
241
-
242
- ### Embeddings
243
-
244
- First, run the server with `--embedding` option:
245
-
246
- ```bash
247
- server -m models/7B/ggml-model.bin --ctx_size 2048 --embedding
248
- ```
249
-
250
- Run this code in NodeJS:
251
 
252
- ```javascript
253
- const axios = require('axios');
254
-
255
- async function Test() {
256
- let result = await axios.post("http://127.0.0.1:8080/embedding", {
257
- content: `Hello`,
258
- threads: 5
259
- });
260
- // print the embedding array
261
- console.log(result.data.embedding);
262
- }
263
-
264
- Test();
265
  ```
266
 
267
- ### Tokenize
268
-
269
- Run this code in NodeJS:
270
-
271
- ```javascript
272
- const axios = require('axios');
273
-
274
- async function Test() {
275
- let result = await axios.post("http://127.0.0.1:8080/tokenize", {
276
- content: `Hello`
277
- });
278
- // print the embedding array
279
- console.log(result.data.tokens);
280
- }
281
 
282
- Test();
 
283
  ```
284
-
285
- ## Common Options
286
-
287
- - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
288
- - `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
289
- - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
290
- - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
291
- - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
292
- - `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**.
293
- - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`;
294
- - `--port`: Set the port to listen. Default: `8080`.
295
-
296
- ### RNG Seed
297
-
298
- - `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
299
-
300
- The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
301
-
302
- ## Performance Tuning and Memory Options
303
-
304
- ### No Memory Mapping
305
-
306
- - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance.
307
-
308
- ### Memory Float 32
309
-
310
- - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement but does not appear to increase generation quality in a measurable way. Not recommended.
311
-
312
- ## Limitations:
313
-
314
- - The actual implementation of llama.cpp need a `llama-state` for handle multiple contexts and clients, but this could require more powerful hardware.
 
1
  # llama.cpp/example/server
2
 
3
+ This example demonstrates a simple HTTP API server to interact with llama.cpp.
4
 
5
+ Command line options:
6
 
7
+ - `--threads N`, `-t N`: Set the number of threads to use during computation.
8
+ - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
9
+ - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
10
+ - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
11
+ - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
12
+ - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
13
+ - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
14
+ - `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
15
+ - `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
16
+ - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
17
+ - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
18
+ - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
19
+ - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
20
+ - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
21
+ - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
22
+ - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
23
+ - `--port`: Set the port to listen. Default: `8080`.
24
+
25
+ ## Build
26
+
27
+ Build llama.cpp with server from repository root with either make or CMake.
28
+
29
+ - Using `make`:
30
+
31
+ ```bash
32
+ LLAMA_BUILD_SERVER=1 make
33
+ ```
34
+
35
+ - Using `CMake`:
36
+
37
+ ```bash
38
+ mkdir build-server
39
+ cd build-server
40
+ cmake -DLLAMA_BUILD_SERVER=ON ..
41
+ cmake --build . --config Release
42
+ ```
43
 
44
  ## Quick Start
45
 
46
  To get started right away, run the following command, making sure to use the correct path for the model you have:
47
 
48
+ ### Unix-based systems (Linux, macOS, etc.):
49
 
50
  ```bash
51
+ ./server -m models/7B/ggml-model.bin -c 2048
52
  ```
53
 
54
+ ### Windows:
55
 
56
  ```powershell
57
+ server.exe -m models\7B\ggml-model.bin -c 2048
58
  ```
59
 
60
+ The above command will start a server that by default listens on `127.0.0.1:8080`.
61
+ You can consume the endpoints with Postman or NodeJS with axios library.
62
+
63
+ ## Testing with CURL
64
+
65
+ Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS.
66
+
67
+ ```sh
68
+ curl --request POST \
69
+ --url http://localhost:8080/completion \
70
+ --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
71
+ ```
72
 
73
  ## Node JS Test
74
 
 
91
  async function Test() {
92
  let result = await axios.post("http://127.0.0.1:8080/completion", {
93
  prompt,
 
94
  n_predict: 512,
95
  });
96
 
 
109
 
110
  ## API Endpoints
111
 
112
+ - **POST** `/completion`: Given a prompt, it returns the predicted completion.
113
 
114
+ *Options:*
115
 
116
+ `temperature`: Adjust the randomness of the generated text (default: 0.8).
117
 
118
+ `top_k`: Limit the next token selection to the K most probable tokens (default: 40).
119
 
120
+ `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
121
 
122
+ `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. (default: 128, -1 = infinity).
123
 
124
+ `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
125
+ By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
126
 
127
+ `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
128
 
129
+ `prompt`: Provide a prompt. Internally, the prompt is compared with the previously evaluated one; any part that has already been evaluated is reused, and only the remaining part is evaluated.
130
 
131
+ `stop`: Specify a JSON array of stopping strings.
132
+ These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
133
 
134
+ `tfs_z`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
135
 
136
+ `typical_p`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
137
 
138
+ `repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1).
139
 
140
+ `repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
141
 
142
+ `penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true).
143
 
144
+ `presence_penalty`: Repeat alpha presence penalty (default: 0.0, 0.0 = disabled).
145
 
146
+ `frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled).
147
 
148
+ `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
149
 
150
+ `mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0).
151
 
152
+ `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).
153
 
154
+ `seed`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
155
 
156
+ `ignore_eos`: Ignore end of stream token and continue generating (default: false).
157
 
158
+ `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).
159
 
160
+ - **POST** `/tokenize`: Tokenize a given text.
161
 
162
+ *Options:*
163
 
164
+ `content`: Set the text to tokenize.
165
 
166
  ## More examples
167
 
168
  ### Interactive mode
169
 
170
+ Check the sample in [chat.mjs](chat.mjs).
171
+ Run with NodeJS version 16 or later:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
+ ```sh
174
+ node chat.mjs
 
 
 
 
 
 
 
 
 
 
 
175
  ```
176
 
177
+ Another sample in [chat.sh](chat.sh).
178
+ Requires [bash](https://www.gnu.org/software/bash/), [curl](https://curl.se) and [jq](https://jqlang.github.io/jq/).
179
+ Run with bash:
 
 
 
 
 
 
 
 
 
 
 
180
 
181
+ ```sh
182
+ bash chat.sh
183
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
examples/server/chat.mjs ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import * as readline from 'node:readline'
2
+ import { stdin, stdout } from 'node:process'
3
+
4
+ const API_URL = 'http://127.0.0.1:8080'
5
+
6
+ const chat = [
7
+ {
8
+ human: "Hello, Assistant.",
9
+ assistant: "Hello. How may I help you today?"
10
+ },
11
+ {
12
+ human: "Please tell me the largest city in Europe.",
13
+ assistant: "Sure. The largest city in Europe is Moscow, the capital of Russia."
14
+ },
15
+ ]
16
+
17
+ const instruction = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.`
18
+
19
/**
 * Build the full prompt sent to the server: the instruction, every stored
 * chat turn, then the new question followed by an open "### Assistant:" tag
 * for the model to complete.
 *
 * @param {string} question - The user's new message.
 * @returns {string} The assembled prompt text.
 */
function format_prompt(question) {
    const history = chat
        .map(turn => `### Human: ${turn.human}\n### Assistant: ${turn.assistant}`)
        .join("\n")

    return `${instruction}\n${history}\n### Human: ${question}\n### Assistant:`
}
24
+
25
/**
 * Ask the server's /tokenize endpoint to tokenize `content`.
 *
 * @param {string} content - Text to tokenize.
 * @returns {Promise<Array>} The `tokens` array from the response, or an empty
 *     array when the request fails or the response carries no `tokens` field.
 */
async function tokenize(content) {
    const result = await fetch(`${API_URL}/tokenize`, {
        method: 'POST',
        body: JSON.stringify({ content })
    })

    if (!result.ok) {
        return []
    }

    // result.json() returns a Promise: it must be awaited *before* reading
    // `.tokens`, otherwise the property is looked up on the Promise itself.
    const payload = await result.json()
    return payload.tokens ?? []
}

// Number of instruction tokens to keep when the server resets its context.
// The parentheses matter: `await tokenize(...).length` would read `.length`
// from the pending Promise and yield undefined.
const n_keep = (await tokenize(instruction)).length
39
+
40
/**
 * Send one user turn to the /completion endpoint with streaming enabled,
 * echo the generated text to stdout as it arrives, and append the finished
 * exchange to `chat`.
 *
 * If the request fails (non-OK status) the function returns without
 * touching `chat`. When the final message reports `truncated`, the oldest
 * stored turn is dropped from `chat`.
 *
 * @param {string} question - The user's new message.
 * @returns {Promise<void>}
 */
async function chat_completion(question) {
    const result = await fetch(`${API_URL}/completion`, {
        method: 'POST',
        body: JSON.stringify({
            prompt: format_prompt(question),
            temperature: 0.2,
            top_k: 40,
            top_p: 0.9,
            n_keep: n_keep,
            n_predict: 256,
            stop: ["\n### Human:"], // stop completion after generating this
            stream: true,
        })
    })

    if (!result.ok) {
        return
    }

    let answer = ''

    // Stream chunks are not guaranteed to line up with event boundaries: one
    // chunk may hold several "data: ..." lines, or only part of one. Buffer
    // the decoded text and only parse complete lines.
    let pending = ''
    // A streaming decoder keeps multi-byte UTF-8 sequences intact when they
    // are split across two chunks.
    const decoder = new TextDecoder('utf-8')

    read_stream:
    for await (const chunk of result.body) {
        pending += decoder.decode(chunk, { stream: true })

        const lines = pending.split('\n')
        pending = lines.pop() // last piece may be an incomplete line

        for (const line of lines) {
            if (!line.startsWith('data: ')) {
                continue
            }

            const message = JSON.parse(line.substring(6))
            answer += message.content
            process.stdout.write(message.content)

            if (message.stop) {
                if (message.truncated) {
                    chat.shift()
                }
                break read_stream
            }
        }
    }

    process.stdout.write('\n')
    chat.push({ human: question, assistant: answer.trimStart() })
}
79
+
80
// Console interface used to read the user's questions.
const rl = readline.createInterface({ input: stdin, output: stdout });

/**
 * Promise wrapper around `rl.question`.
 *
 * @param {readline.Interface} rl - Interface to prompt on.
 * @param {string} query - Prompt text shown to the user.
 * @param {object} [options] - Forwarded unchanged to `rl.question`.
 * @returns {Promise<string>} Resolves with the line the user entered.
 */
function readlineQuestion(rl, query, options) {
    return new Promise(resolve => {
        rl.question(query, options, resolve)
    })
}

// Prompt / answer loop: read a question, stream the reply, repeat.
for (;;) {
    const question = await readlineQuestion(rl, '> ')
    await chat_completion(question)
}
examples/server/chat.sh ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ API_URL="${API_URL:-http://127.0.0.1:8080}"
4
+
5
+ CHAT=(
6
+ "Hello, Assistant."
7
+ "Hello. How may I help you today?"
8
+ "Please tell me the largest city in Europe."
9
+ "Sure. The largest city in Europe is Moscow, the capital of Russia."
10
+ )
11
+
12
+ INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
13
+
14
# Print $1 with leading and trailing whitespace removed (no trailing newline).
# Enables the extglob shell option, which the +(...) patterns require.
trim() {
    shopt -s extglob
    local text="${1##+([[:space:]])}"
    printf "%s" "${text%%+([[:space:]])}"
}
19
+
20
# Print $1 with trailing whitespace removed; leading whitespace is kept.
# Enables the extglob shell option, which the +(...) pattern requires.
trim_trailing() {
    shopt -s extglob
    local text="$1"
    printf "%s" "${text%%+([[:space:]])}"
}
24
+
25
# Print the full prompt: INSTRUCTION, every stored CHAT turn, then the new
# question ($1) followed by an open "### Assistant: " tag.
#   $1  the user's new question
format_prompt() {
    # printf '%s' instead of `echo -n`: echo would treat an instruction such
    # as "-n" or "-e" as an option and print nothing.
    printf '%s' "${INSTRUCTION}"
    # printf re-applies the format until all arguments are consumed. CHAT
    # holds (human, assistant) pairs, so the final question is left without
    # a partner and its "### Assistant: " slot is printed empty.
    printf "\n### Human: %s\n### Assistant: %s" "${CHAT[@]}" "$1"
}
29
+
30
# POST $1 to the server's /tokenize endpoint and print each element of the
# response's `tokens` array on its own line.
#   $1  text to tokenize
tokenize() {
    local payload
    payload="$(jq -ns --arg content "$1" '{content:$content}')"

    curl --silent --request POST --url "${API_URL}/tokenize" --data-raw "${payload}" \
        | jq '.tokens[]'
}
38
+
39
+ N_KEEP=$(tokenize "${INSTRUCTION}" | wc -l)
40
+
41
# Send one user turn to the /completion endpoint with streaming enabled,
# print the generated text as it arrives, then append the question and the
# trimmed answer to CHAT.
#   $1  the user's new question
chat_completion() {
    PROMPT="$(trim_trailing "$(format_prompt "$1")")"
    # $N_KEEP is intentionally unquoted: word splitting drops any whitespace
    # padding that `wc -l` may have put around the number.
    DATA="$(printf '%s' "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
        prompt: .,
        temperature: 0.2,
        top_k: 40,
        top_p: 0.9,
        n_keep: $n_keep,
        n_predict: 256,
        stop: ["\n### Human:"],
        stream: true
    }')"

    ANSWER=''

    while IFS= read -r LINE; do
        if [[ $LINE = data:* ]]; then
            # Command substitution strips trailing newlines, which would
            # silently drop newline tokens from the answer. Emit the content
            # without jq's own line terminator (-j) and protect it with a
            # sentinel character that is removed afterwards.
            CONTENT="$(printf '%s' "${LINE:5}" | jq -j '.content'; printf 'x')"
            CONTENT="${CONTENT%x}"
            printf "%s" "${CONTENT}"
            ANSWER+="${CONTENT}"
        fi
    done < <(curl \
        --silent \
        --no-buffer \
        --request POST \
        --url "${API_URL}/completion" \
        --data-raw "${DATA}")

    printf "\n"

    CHAT+=("$1" "$(trim "$ANSWER")")
}
73
+
74
# Prompt / answer loop. Using `read` as the loop condition ends the script on
# end-of-input (Ctrl-D or a closed stdin) instead of looping forever and
# sending empty questions to the server.
while read -r -e -p "> " QUESTION; do
    chat_completion "${QUESTION}"
done
examples/server/server.cpp CHANGED
@@ -1,790 +1,928 @@
1
- #include <httplib.h>
2
- #include <json.hpp>
3
  #include "common.h"
4
  #include "llama.h"
 
5
 
6
- struct server_params
7
- {
8
- std::string hostname = "127.0.0.1";
9
- int32_t port = 8080;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  };
11
 
12
- struct llama_server_context
13
- {
14
- bool as_loop = false;
15
- bool has_next_token = false;
16
- std::string generated_text = "";
17
-
18
- int32_t num_tokens_predicted = 0;
19
- int32_t n_past = 0;
20
- int32_t n_consumed = 0;
21
- int32_t n_session_consumed = 0;
22
- int32_t n_remain = 0;
23
-
24
- std::vector<llama_token> embd;
25
- std::vector<llama_token> last_n_tokens;
26
- std::vector<llama_token> processed_tokens;
27
- std::vector<llama_token> llama_token_newline;
28
- std::vector<llama_token> embd_inp;
29
- std::vector<std::vector<llama_token>> no_show_words;
30
- std::vector<llama_token> tokens_predicted;
31
-
32
- llama_context *ctx;
33
- gpt_params params;
34
-
35
- void rewind() {
36
- as_loop = false;
37
- params.antiprompt.clear();
38
- no_show_words.clear();
39
- num_tokens_predicted = 0;
40
- generated_text = "";
41
- }
42
-
43
- bool loadModel(gpt_params params_)
44
- {
45
- params = params_;
46
- ctx = llama_init_from_gpt_params(params);
47
- if (ctx == NULL)
48
- {
49
- fprintf(stderr, "%s: error: unable to load model\n", __func__);
50
- return false;
51
- }
52
- // determine newline token
53
- llama_token_newline = ::llama_tokenize(ctx, "\n", false);
54
- last_n_tokens.resize(params.n_ctx);
55
- std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
56
- return true;
57
- }
58
-
59
- bool loadPrompt() {
60
- params.prompt.insert(0, 1, ' '); // always add a first space
61
- std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
62
- // compare the evaluated prompt with the new prompt
63
- int new_prompt_len = 0;
64
- for (size_t i = 0; i < prompt_tokens.size(); i++) {
65
- if (i < processed_tokens.size() &&
66
- processed_tokens[i] == prompt_tokens[i])
67
- {
68
- continue;
69
- }
70
- else
71
- {
72
- embd_inp.push_back(prompt_tokens[i]);
73
- if(new_prompt_len == 0) {
74
- if(int32_t(i) - 1 < n_past) {
75
- processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
76
- }
77
- // Evaluate the new fragment prompt from the last token processed.
78
- n_past = processed_tokens.size();
79
  }
80
- new_prompt_len ++;
81
- }
82
- }
83
- if(n_past > 0 && params.interactive) {
84
- n_remain -= new_prompt_len;
85
  }
86
- if ((int)embd_inp.size() > params.n_ctx - 4)
87
- {
88
- return false;
 
 
 
 
 
89
  }
90
- has_next_token = true;
91
- return true;
92
- }
93
-
94
- void beginCompletion()
95
- {
96
- if(n_remain == 0) {
97
- // number of tokens to keep when resetting context
98
- if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size())
99
- {
100
- params.n_keep = (int)embd_inp.size();
101
- }
 
 
 
102
  }
103
- n_remain = params.n_predict;
104
- }
105
-
106
- llama_token nextToken() {
107
- llama_token result = -1;
108
- if (embd.size() > 0)
109
- {
110
- if (n_past + (int)embd.size() > params.n_ctx)
111
- {
112
- // Reset context
113
- const int n_left = n_past - params.n_keep;
114
- n_past = std::max(1, params.n_keep);
115
- processed_tokens.erase(processed_tokens.begin() + n_past, processed_tokens.end());
116
- embd.insert(embd.begin(), last_n_tokens.begin() + params.n_ctx - n_left / 2 - embd.size(), last_n_tokens.end() - embd.size());
117
- }
118
- for (int i = 0; i < (int)embd.size(); i += params.n_batch)
119
- {
120
- int n_eval = (int)embd.size() - i;
121
- if (n_eval > params.n_batch)
122
- {
123
- n_eval = params.n_batch;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  }
125
- if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads))
126
- {
127
- fprintf(stderr, "%s : failed to eval\n", __func__);
128
- has_next_token = false;
129
- return result;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  }
131
- n_past += n_eval;
132
- }
 
 
133
  }
134
- embd.clear();
135
- if ((int)embd_inp.size() <= n_consumed && has_next_token)
136
- {
137
- // out of user input, sample next token
138
- const float temp = params.temp;
139
- // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
140
- const float top_p = params.top_p;
141
- const float tfs_z = params.tfs_z;
142
- const float typical_p = params.typical_p;
143
- const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n;
144
- const float repeat_penalty = params.repeat_penalty;
145
- const float alpha_presence = params.presence_penalty;
146
- const float alpha_frequency = params.frequency_penalty;
147
- const int mirostat = params.mirostat;
148
- const float mirostat_tau = params.mirostat_tau;
149
- const float mirostat_eta = params.mirostat_eta;
150
- const bool penalize_nl = params.penalize_nl;
151
- llama_token id = 0;
152
- {
153
- auto logits = llama_get_logits(ctx);
154
- auto n_vocab = llama_n_vocab(ctx);
155
-
156
- // Apply params.logit_bias map
157
- for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++)
158
- {
159
- logits[it->first] += it->second;
160
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
- std::vector<llama_token_data> candidates;
163
- candidates.reserve(n_vocab);
164
- for (llama_token token_id = 0; token_id < n_vocab; token_id++)
165
- {
166
- candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
 
167
  }
168
 
169
- llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
170
-
171
- // Apply penalties
172
- float nl_logit = logits[llama_token_nl()];
173
- auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
174
- llama_sample_repetition_penalty(ctx, &candidates_p,
175
- last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
176
- last_n_repeat, repeat_penalty);
177
- llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
178
- last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
179
- last_n_repeat, alpha_frequency, alpha_presence);
180
- if (!penalize_nl)
181
- {
182
- logits[llama_token_nl()] = nl_logit;
183
  }
184
 
185
- if (temp <= 0)
186
- {
187
- // Greedy sampling
188
- id = llama_sample_token_greedy(ctx, &candidates_p);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  }
190
- else
191
- {
192
- if (mirostat == 1)
193
- {
194
- static float mirostat_mu = 2.0f * mirostat_tau;
195
- const int mirostat_m = 100;
196
- llama_sample_temperature(ctx, &candidates_p, temp);
197
- id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
198
- }
199
- else if (mirostat == 2)
200
- {
201
- static float mirostat_mu = 2.0f * mirostat_tau;
202
- llama_sample_temperature(ctx, &candidates_p, temp);
203
- id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
204
- }
205
- else
206
- {
207
- // Temperature sampling
208
- llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
209
- llama_sample_typical(ctx, &candidates_p, typical_p, 1);
210
- llama_sample_top_p(ctx, &candidates_p, top_p, 1);
211
- llama_sample_temperature(ctx, &candidates_p, temp);
212
- id = llama_sample_token(ctx, &candidates_p);
213
- }
214
  }
215
- last_n_tokens.erase(last_n_tokens.begin());
216
- last_n_tokens.push_back(id);
217
- processed_tokens.push_back(id);
218
- num_tokens_predicted++;
219
- }
220
-
221
- // replace end of text token with newline token when in interactive mode
222
- if (id == llama_token_eos() && params.interactive)
223
- {
224
- id = llama_token_newline.front();
225
- if (params.antiprompt.size() != 0)
 
 
 
 
 
 
226
  {
227
- // tokenize and inject first reverse prompt
228
- const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
229
- embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  }
231
- }
232
 
233
- // add it to the context
234
- embd.push_back(id);
235
- for (auto id : embd)
236
- {
237
  result = id;
238
- }
239
- // decrement remaining sampling budget
240
- --n_remain;
241
- }
242
- else
243
- {
244
- // some user input remains from prompt or interaction, forward it to processing
245
- while ((int)embd_inp.size() > n_consumed)
246
- {
247
- embd.push_back(embd_inp[n_consumed]);
248
- last_n_tokens.erase(last_n_tokens.begin());
249
- last_n_tokens.push_back(embd_inp[n_consumed]);
250
- processed_tokens.push_back(embd_inp[n_consumed]);
251
- ++n_consumed;
252
- if ((int)embd.size() >= params.n_batch)
253
- {
254
- break;
255
- }
256
- }
257
- }
258
- if (params.interactive && (int)embd_inp.size() <= n_consumed)
259
- {
260
- // check for reverse prompt
261
- if (params.antiprompt.size())
262
- {
263
- std::string last_output;
264
- for (auto id : last_n_tokens)
265
- {
266
- last_output += llama_token_to_str(ctx, id);
267
- }
268
- has_next_token = true;
269
- // Check if each of the reverse prompts appears at the end of the output.
270
- for (std::string &antiprompt : params.antiprompt)
271
- {
272
- if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos)
273
- {
274
  has_next_token = false;
 
 
275
  return result;
276
- }
277
  }
278
- }
279
- if (n_past > 0)
280
- {
281
- has_next_token = true;
282
- }
283
- }
284
 
285
- if (!embd.empty() && embd.back() == llama_token_eos()) {
286
- has_next_token = false;
287
  }
288
 
289
- if (params.interactive && n_remain <= 0 && params.n_predict != -1)
290
- {
291
- n_remain = params.n_predict;
292
- }
293
- has_next_token = n_remain != 0;
294
- return result;
295
- }
296
-
297
- std::string doCompletion()
298
- {
299
- llama_token token = nextToken();
300
- if (token == -1) {
301
- return "";
302
- }
303
- tokens_predicted.clear();
304
- tokens_predicted.push_back(token);
305
-
306
- // Avoid add the no show words to the response
307
- for (std::vector<llama_token> word_tokens : no_show_words)
308
- {
309
- size_t match_token = 1;
310
- if (tokens_predicted.front() == word_tokens.front())
311
- {
312
- bool execute_matching = true;
313
- if (tokens_predicted.size() > 1) { // if previus tokens had been tested
314
- for (size_t i = 1; i < word_tokens.size(); i++)
315
- {
316
- if (i >= tokens_predicted.size()) {
317
- match_token = i;
318
- break;
319
  }
320
- if (tokens_predicted[i] == word_tokens[i])
321
- {
322
- continue;
323
  }
324
- else
325
- {
326
- execute_matching = false;
327
- break;
 
 
 
 
328
  }
329
- }
330
- }
331
- while (execute_matching) {
332
- if (match_token == word_tokens.size()) {
333
- return "";
334
- }
335
- token = nextToken();
336
- tokens_predicted.push_back(token);
337
- if (token == word_tokens[match_token])
338
- { // the token follow the sequence
339
- match_token++;
340
- }
341
- else if (match_token < word_tokens.size())
342
- { // no complete all word sequence
343
- break;
344
- }
345
  }
346
- }
347
- }
348
- if(as_loop) {
349
- generated_text = "";
350
- }
351
- for (llama_token tkn : tokens_predicted)
352
- {
353
- generated_text += llama_token_to_str(ctx, tkn);
354
- }
355
- return generated_text;
356
- }
357
-
358
- std::vector<float> embedding(std::string content, int threads) {
359
- content.insert(0, 1, ' ');
360
- std::vector<llama_token> tokens = ::llama_tokenize(ctx, content, true);
361
- if (tokens.size() > 0)
362
- {
363
- if (llama_eval(ctx, tokens.data(), tokens.size(), 0, threads))
364
- {
365
- fprintf(stderr, "%s : failed to eval\n", __func__);
366
- std::vector<float> embeddings_;
367
- return embeddings_;
368
- }
369
  }
370
- const int n_embd = llama_n_embd(ctx);
371
- const auto embeddings = llama_get_embeddings(ctx);
372
- std::vector<float> embeddings_(embeddings, embeddings + n_embd);
373
- return embeddings_;
374
- }
375
- };
376
 
377
- using namespace httplib;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
 
379
- using json = nlohmann::json;
 
 
 
380
 
381
- void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
382
- {
383
- fprintf(stderr, "usage: %s [options]\n", argv[0]);
384
- fprintf(stderr, "\n");
385
- fprintf(stderr, "options:\n");
386
- fprintf(stderr, " -h, --help show this help message and exit\n");
387
- fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
388
- fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
389
- fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
390
- fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n");
391
- fprintf(stderr, " --embedding enable embedding mode\n");
392
- fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
393
- if (llama_mlock_supported())
394
- {
395
- fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
396
- }
397
- if (llama_mmap_supported())
398
- {
399
- fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
400
- }
401
- #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
402
- fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
403
- fprintf(stderr, " number of layers to store in VRAM\n");
404
- fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
405
- fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
406
- fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
407
- fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
408
- #endif
409
- fprintf(stderr, " -m FNAME, --model FNAME\n");
410
- fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
411
- fprintf(stderr, " -a ALIAS, --alias ALIAS\n");
412
- fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n");
413
- fprintf(stderr, " --host ip address to listen (default 127.0.0.1)\n");
414
- fprintf(stderr, " --port PORT port to listen (default 8080)\n");
415
- fprintf(stderr, "\n");
416
- }
417
 
418
- bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_params &params)
419
- {
420
- gpt_params default_params;
421
- std::string arg;
422
- bool invalid_param = false;
423
-
424
- for (int i = 1; i < argc; i++)
425
- {
426
- arg = argv[i];
427
- if (arg == "--port")
428
- {
429
- if (++i >= argc)
430
- {
431
- invalid_param = true;
432
- break;
433
- }
434
- sparams.port = std::stoi(argv[i]);
435
- }
436
- else if (arg == "--host")
437
- {
438
- if (++i >= argc)
439
- {
440
- invalid_param = true;
441
- break;
442
- }
443
- sparams.hostname = argv[i];
444
- }
445
- else if (arg == "-s" || arg == "--seed")
446
- {
447
- #if defined(GGML_USE_CUBLAS)
448
- fprintf(stderr, "WARNING: when using cuBLAS generation results are NOT guaranteed to be reproducible.\n");
449
- #endif
450
- if (++i >= argc)
451
- {
452
- invalid_param = true;
453
- break;
454
- }
455
- params.seed = std::stoi(argv[i]);
456
- }
457
- else if (arg == "-m" || arg == "--model")
458
- {
459
- if (++i >= argc)
460
- {
461
- invalid_param = true;
462
- break;
463
- }
464
- params.model = argv[i];
465
- }
466
- else if (arg == "-a" || arg == "--alias")
467
- {
468
- if (++i >= argc)
469
- {
470
- invalid_param = true;
471
- break;
472
- }
473
- params.model_alias = argv[i];
474
- }
475
- else if (arg == "--embedding")
476
- {
477
- params.embedding = true;
478
  }
479
- else if (arg == "-h" || arg == "--help")
480
- {
481
- server_print_usage(argc, argv, default_params);
482
- exit(0);
483
- }
484
- else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
485
- {
486
- if (++i >= argc)
487
- {
488
- invalid_param = true;
489
- break;
490
- }
491
- params.n_ctx = std::stoi(argv[i]);
 
 
 
492
  }
493
- else if (arg == "--memory-f32" || arg == "--memory_f32")
494
- {
495
- params.memory_f16 = false;
496
  }
497
- else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
498
- {
499
- if (++i >= argc)
500
- {
501
- invalid_param = true;
502
- break;
503
- }
504
  #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
505
- params.n_gpu_layers = std::stoi(argv[i]);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
506
  #else
507
- fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
508
- fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
509
  #endif
510
- }
511
- else if (arg == "--tensor-split" || arg == "-ts")
512
- {
513
- if (++i >= argc)
514
- {
515
- invalid_param = true;
516
- break;
517
- }
518
  #ifdef GGML_USE_CUBLAS
519
- std::string arg_next = argv[i];
520
 
521
- // split string by , and /
522
- const std::regex regex{R"([,/]+)"};
523
- std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
524
- std::vector<std::string> split_arg{it, {}};
525
- GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
526
 
527
- for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i)
528
- {
529
- if (i < split_arg.size())
530
- {
531
- params.tensor_split[i] = std::stof(split_arg[i]);
 
 
 
 
 
 
532
  }
533
- else
534
  {
535
- params.tensor_split[i] = 0.0f;
536
- }
537
- }
538
  #else
539
- fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
540
  #endif // GGML_USE_CUBLAS
541
- }
542
- else if (arg == "--main-gpu" || arg == "-mg")
543
- {
544
- if (++i >= argc)
545
- {
546
- invalid_param = true;
547
- break;
548
- }
549
  #ifdef GGML_USE_CUBLAS
550
- params.main_gpu = std::stoi(argv[i]);
551
  #else
552
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
553
  #endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
554
  }
555
- else
556
- {
557
- fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
558
- server_print_usage(argc, argv, default_params);
559
- exit(1);
560
  }
561
- }
562
-
563
- if (invalid_param)
564
- {
565
- fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
566
- server_print_usage(argc, argv, default_params);
567
- exit(1);
568
- }
569
- return true;
570
  }
571
 
572
- bool parse_options_completion(json body, llama_server_context& llama, Response &res) {
573
- if (!body["threads"].is_null())
574
- {
575
- llama.params.n_threads = body["threads"].get<int>();
576
- }
577
- if (!body["n_predict"].is_null())
578
- {
579
- llama.params.n_predict = body["n_predict"].get<int>();
580
- }
581
- if (!body["top_k"].is_null())
582
- {
583
- llama.params.top_k = body["top_k"].get<int>();
584
- }
585
- if (!body["top_p"].is_null())
586
- {
587
- llama.params.top_p = body["top_p"].get<float>();
588
- }
589
- if (!body["temperature"].is_null())
590
- {
591
- llama.params.temp = body["temperature"].get<float>();
592
- }
593
- if (!body["batch_size"].is_null())
594
- {
595
- llama.params.n_batch = body["batch_size"].get<int>();
596
- }
597
- if (!body["n_keep"].is_null())
598
- {
599
- llama.params.n_keep = body["n_keep"].get<int>();
600
- }
601
- if (!body["as_loop"].is_null())
602
- {
603
- llama.as_loop = body["as_loop"].get<bool>();
604
- }
605
- if (!body["interactive"].is_null())
606
- {
607
- llama.params.interactive = body["interactive"].get<bool>();
608
- }
609
- if (!body["prompt"].is_null())
610
- {
611
- llama.params.prompt = body["prompt"].get<std::string>();
612
- }
613
- else
614
- {
615
- json data = {
616
- {"status", "error"},
617
- {"reason", "You need to pass the prompt"}};
618
- res.set_content(data.dump(), "application/json");
619
- res.status = 400;
620
- return false;
621
- }
622
- if (!body["stop"].is_null())
623
- {
624
- std::vector<std::string> stop_words = body["stop"].get<std::vector<std::string>>();
625
- for (std::string stop_word : stop_words)
626
- {
627
- llama.params.antiprompt.push_back(stop_word);
628
- llama.no_show_words.push_back(::llama_tokenize(llama.ctx, stop_word, false));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
629
  }
630
- }
631
- if (!body["exclude"].is_null())
632
- {
633
- std::vector<std::string> no_show_words = body["exclude"].get<std::vector<std::string>>();
634
- for (std::string no_show : no_show_words)
635
- {
636
- llama.no_show_words.push_back(::llama_tokenize(llama.ctx, no_show, false));
 
 
637
  }
638
- }
639
- return true;
640
  }
641
 
642
- int main(int argc, char **argv)
643
- {
644
- // own arguments required by this example
645
- gpt_params params;
646
- server_params sparams;
647
-
648
- // struct that contains llama context and inference
649
- llama_server_context llama;
650
- params.model = "ggml-model.bin";
651
-
652
- if (server_params_parse(argc, argv, sparams, params) == false)
653
- {
654
- return 1;
655
- }
656
-
657
- if (params.seed <= 0)
658
- {
659
- params.seed = time(NULL);
660
- }
661
-
662
- fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
663
-
664
- // load the model
665
- if (!llama.loadModel(params))
666
- {
667
- return 1;
668
- }
669
-
670
- Server svr;
671
-
672
- svr.Get("/", [](const Request &, Response &res)
673
- { res.set_content("<h1>llama.cpp server works</h1>", "text/html"); });
674
-
675
- svr.Post("/completion", [&llama](const Request &req, Response &res)
676
- {
677
- if(llama.params.embedding) {
678
- json data = {
679
- {"status", "error"},
680
- {"reason", "To use completion function disable embedding mode"}};
681
- res.set_content(data.dump(), "application/json");
682
- res.status = 400;
683
- return;
684
- }
685
-
686
- llama.rewind();
687
-
688
- if(parse_options_completion(json::parse(req.body), llama, res) == false){
689
- return;
690
- }
691
-
692
- if (!llama.loadPrompt())
693
- {
694
- json data = {
695
- {"status", "error"},
696
- {"reason", "Context too long, please be more specific"}};
697
- res.set_content(data.dump(), "application/json");
698
- res.status = 400;
699
- return;
700
- }
701
-
702
- llama.beginCompletion();
703
- if(llama.as_loop) {
704
- json data = {
705
- {"status", "done" } };
706
- return res.set_content(data.dump(), "application/json");
707
- } else {
708
- // loop inference until finish completion
709
- while (llama.has_next_token)
710
- {
711
- llama.doCompletion();
712
- }
713
- try
714
- {
715
- json data = {
716
- {"model", llama.params.model_alias },
717
- {"content", llama.generated_text },
718
- {"tokens_predicted", llama.num_tokens_predicted}};
719
- return res.set_content(data.dump(), "application/json");
720
- }
721
- catch (const json::exception &e)
722
- {
723
- // Some tokens have bad UTF-8 strings, the json parser is very sensitive
724
- json data = {
725
- {"content", "Bad encoding token"},
726
- {"tokens_predicted", 0}};
727
- return res.set_content(data.dump(), "application/json");
728
- }
729
- } });
730
-
731
- svr.Post("/tokenize", [&llama](const Request &req, Response &res)
732
- {
733
- json body = json::parse(req.body);
734
- json data = {
735
- {"tokens", ::llama_tokenize(llama.ctx, body["content"].get<std::string>(), false) } };
736
- return res.set_content(data.dump(), "application/json");
737
- });
738
 
739
- svr.Post("/embedding", [&llama](const Request &req, Response &res)
740
- {
741
- if(!llama.params.embedding) {
742
- std::vector<float> empty;
743
- json data = {
744
- {"embedding", empty}};
745
- fprintf(stderr, "[llama-server] : You need enable embedding mode adding: --embedding option\n");
746
- return res.set_content(data.dump(), "application/json");
747
- }
748
- json body = json::parse(req.body);
749
- std::string content = body["content"].get<std::string>();
750
- int threads = body["threads"].get<int>();
751
- json data = {
752
- {"embedding", llama.embedding(content, threads) } };
753
- return res.set_content(data.dump(), "application/json");
754
- });
755
 
756
- svr.Get("/next-token", [&llama](const Request &req, Response &res)
757
- {
758
- if(llama.params.embedding) {
759
- res.set_content("{}", "application/json");
760
- return;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
761
  }
762
- std::string result = "";
763
- if (req.has_param("stop")) {
764
- llama.has_next_token = false;
765
- } else {
766
- result = llama.doCompletion(); // inference next token
767
  }
768
- try {
769
- json data = {
770
- {"content", result },
771
- {"stop", !llama.has_next_token }};
772
- return res.set_content(data.dump(), "application/json");
773
- } catch (const json::exception &e) {
774
- // Some tokens have bad UTF-8 strings, the json parser is very sensitive
775
- json data = {
776
- {"content", "" },
777
- {"stop", !llama.has_next_token }};
778
- return res.set_content(data.dump(), "application/json");
779
  }
780
- });
781
 
782
- fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, sparams.hostname.c_str(), sparams.port);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
783
 
784
- if(params.embedding) {
785
- fprintf(stderr, "NOTE: Mode embedding enabled. Completion function doesn't work in this mode.\n");
786
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
787
 
788
- // change hostname and port
789
- svr.listen(sparams.hostname, sparams.port);
790
  }
 
 
 
1
  #include "common.h"
2
  #include "llama.h"
3
+ #include "build-info.h"
4
 
5
+ // single thread
6
+ #define CPPHTTPLIB_THREAD_POOL_COUNT 1
7
+ #ifndef NDEBUG
8
+ // crash the server in debug mode, otherwise send an http 500 error
9
+ #define CPPHTTPLIB_NO_EXCEPTIONS 1
10
+ #endif
11
+
12
+ #include "httplib.h"
13
+ #include "json.hpp"
14
+
15
+ #ifndef SERVER_VERBOSE
16
+ #define SERVER_VERBOSE 1
17
+ #endif
18
+
19
+ using namespace httplib;
20
+ using json = nlohmann::json;
21
+
22
+ struct server_params {
23
+ std::string hostname = "127.0.0.1";
24
+ int32_t port = 8080;
25
+ int32_t read_timeout = 600;
26
+ int32_t write_timeout = 600;
27
  };
28
 
29
+ static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
30
+ size_t i;
31
+ for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
32
+ return i;
33
+ }
34
+
35
+ enum stop_type {
36
+ STOP_FULL,
37
+ STOP_PARTIAL,
38
+ };
39
+
40
+ static bool ends_with(const std::string & str, const std::string & suffix) {
41
+ return str.size() >= suffix.size() &&
42
+ 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
43
+ }
44
+
45
+ static size_t find_partial_stop_string(const std::string & stop,
46
+ const std::string & text) {
47
+ if (!text.empty() && !stop.empty()) {
48
+ const char text_last_char = text.back();
49
+ for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
50
+ if (stop[char_index] == text_last_char) {
51
+ const std::string current_partial = stop.substr(0, char_index + 1);
52
+ if (ends_with(text, current_partial)) {
53
+ return text.size() - char_index - 1;
54
+ }
55
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  }
 
 
 
 
 
57
  }
58
+ return std::string::npos;
59
+ }
60
+
61
+ template<class Iter>
62
+ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
63
+ std::string ret;
64
+ for (; begin != end; ++begin) {
65
+ ret += llama_token_to_str(ctx, *begin);
66
  }
67
+ return ret;
68
+ }
69
+
70
+ static void server_log(const char * level, const char * function, int line,
71
+ const char * message, const nlohmann::ordered_json & extra) {
72
+ nlohmann::ordered_json log {
73
+ { "timestamp", time(nullptr) },
74
+ { "level", level },
75
+ { "function", function },
76
+ { "line", line },
77
+ { "message", message },
78
+ };
79
+
80
+ if (!extra.empty()) {
81
+ log.merge_patch(extra);
82
  }
83
+
84
+ const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
85
+ fprintf(stdout, "%.*s\n", (int)str.size(), str.data());
86
+ fflush(stdout);
87
+ }
88
+
89
+ static bool server_verbose = false;
90
+
91
+ #if SERVER_VERBOSE != 1
92
+ # define LOG_VERBOSE(MSG, ...)
93
+ #else
94
+ # define LOG_VERBOSE(MSG, ...) \
95
+ do { \
96
+ if (server_verbose) { \
97
+ server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
98
+ } \
99
+ } while(0)
100
+ #endif
101
+
102
+ #define LOG_ERROR(MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
103
+ #define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
104
+ #define LOG_INFO(MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
105
+
106
+ struct llama_server_context {
107
+ bool stream = false;
108
+ bool has_next_token = false;
109
+ std::string generated_text;
110
+
111
+ size_t num_tokens_predicted = 0;
112
+ size_t n_past = 0;
113
+ size_t n_remain = 0;
114
+
115
+ std::vector<llama_token> embd;
116
+ std::vector<llama_token> last_n_tokens;
117
+
118
+ llama_context * ctx = nullptr;
119
+ gpt_params params;
120
+
121
+ bool truncated = false;
122
+ bool stopped_eos = false;
123
+ bool stopped_word = false;
124
+ bool stopped_limit = false;
125
+ std::string stopping_word;
126
+ int32_t multibyte_pending = 0;
127
+
128
+ ~llama_server_context() {
129
+ if (ctx) {
130
+ llama_free(ctx);
131
+ ctx = nullptr;
132
  }
133
+ }
134
+
135
+ void rewind() {
136
+ params.antiprompt.clear();
137
+ num_tokens_predicted = 0;
138
+ generated_text = "";
139
+ generated_text.reserve(params.n_ctx);
140
+ truncated = false;
141
+ stopped_eos = false;
142
+ stopped_word = false;
143
+ stopped_limit = false;
144
+ stopping_word = "";
145
+ multibyte_pending = 0;
146
+
147
+ n_remain = 0;
148
+ n_past = 0;
149
+ }
150
+
151
+ bool loadModel(const gpt_params & params_) {
152
+ params = params_;
153
+ ctx = llama_init_from_gpt_params(params);
154
+ if (ctx == nullptr) {
155
+ LOG_ERROR("unable to load model", { { "model", params_.model } });
156
+ return false;
157
  }
158
+
159
+ last_n_tokens.resize(params.n_ctx);
160
+ std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
161
+ return true;
162
  }
163
+
164
+ void loadPrompt() {
165
+ params.prompt.insert(0, 1, ' '); // always add a first space
166
+ std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
167
+
168
+ if (params.n_keep < 0) {
169
+ params.n_keep = (int)prompt_tokens.size();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  }
171
+ params.n_keep = std::min(params.n_ctx - 4, params.n_keep);
172
+
173
+ // if input prompt is too big, truncate like normal
174
+ if (prompt_tokens.size() >= (size_t)params.n_ctx) {
175
+ const int n_left = (params.n_ctx - params.n_keep) / 2;
176
+ std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
177
+ const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_left - 1) / n_left;
178
+ new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
179
+ std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin());
180
+
181
+ LOG_VERBOSE("input truncated", {
182
+ { "n_ctx", params.n_ctx },
183
+ { "n_keep", params.n_keep },
184
+ { "n_left", n_left },
185
+ { "new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()) },
186
+ });
187
 
188
+ truncated = true;
189
+ prompt_tokens = new_tokens;
190
+ } else {
191
+ const size_t ps = prompt_tokens.size();
192
+ std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
193
+ std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
194
  }
195
 
196
+ // compare the evaluated prompt with the new prompt
197
+ n_past = common_part(embd, prompt_tokens);
198
+ embd = prompt_tokens;
199
+ if (n_past == prompt_tokens.size()) {
200
+ // we have to evaluate at least 1 token to generate logits.
201
+ n_past--;
 
 
 
 
 
 
 
 
202
  }
203
 
204
+ LOG_VERBOSE("prompt ingested", {
205
+ { "n_past", n_past },
206
+ { "cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past) },
207
+ { "to_eval", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()) },
208
+ });
209
+
210
+ has_next_token = true;
211
+ }
212
+
213
+ void beginCompletion() {
214
+ // number of tokens to keep when resetting context
215
+ n_remain = params.n_predict;
216
+ llama_set_rng_seed(ctx, params.seed);
217
+ }
218
+
219
+ llama_token nextToken() {
220
+ llama_token result = -1;
221
+
222
+ if (embd.size() >= (size_t)params.n_ctx) {
223
+ // Reset context
224
+ const int n_left = (params.n_ctx - params.n_keep) / 2;
225
+
226
+ std::vector<llama_token> new_tokens(embd.begin(), embd.begin() + params.n_keep);
227
+ new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end());
228
+ embd = new_tokens;
229
+ n_past = params.n_keep;
230
+ truncated = true;
231
+ LOG_VERBOSE("input truncated", {
232
+ { "n_ctx", params.n_ctx },
233
+ { "n_keep", params.n_keep },
234
+ { "n_left", n_left },
235
+ { "new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()) },
236
+ });
237
  }
238
+
239
+ while (n_past < embd.size()) {
240
+ int n_eval = (int)embd.size() - n_past;
241
+ if (n_eval > params.n_batch) {
242
+ n_eval = params.n_batch;
243
+ }
244
+ if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) {
245
+ LOG_ERROR("failed to eval", {
246
+ { "n_eval", n_eval },
247
+ { "n_past", n_past },
248
+ { "n_threads", params.n_threads },
249
+ { "embd", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()) },
250
+ });
251
+ has_next_token = false;
252
+ return result;
253
+ }
254
+ n_past += n_eval;
 
 
 
 
 
 
 
255
  }
256
+
257
+ // out of user input, sample next token
258
+ const float temp = params.temp;
259
+ const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
260
+ const float top_p = params.top_p;
261
+ const float tfs_z = params.tfs_z;
262
+ const float typical_p = params.typical_p;
263
+ const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n;
264
+ const float repeat_penalty = params.repeat_penalty;
265
+ const float alpha_presence = params.presence_penalty;
266
+ const float alpha_frequency = params.frequency_penalty;
267
+ const int mirostat = params.mirostat;
268
+ const float mirostat_tau = params.mirostat_tau;
269
+ const float mirostat_eta = params.mirostat_eta;
270
+ const bool penalize_nl = params.penalize_nl;
271
+ llama_token id = 0;
272
+
273
  {
274
+ auto * logits = llama_get_logits(ctx);
275
+ auto n_vocab = llama_n_vocab(ctx);
276
+
277
+ // Apply params.logit_bias map
278
+ for (const auto & it : params.logit_bias) {
279
+ logits[it.first] += it.second;
280
+ }
281
+
282
+ std::vector<llama_token_data> candidates;
283
+ candidates.reserve(n_vocab);
284
+ for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
285
+ candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
286
+ }
287
+
288
+ llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
289
+
290
+ // Apply penalties
291
+ float nl_logit = logits[llama_token_nl()];
292
+ auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
293
+ llama_sample_repetition_penalty(ctx, &candidates_p,
294
+ last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
295
+ last_n_repeat, repeat_penalty);
296
+ llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
297
+ last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
298
+ last_n_repeat, alpha_frequency, alpha_presence);
299
+ if (!penalize_nl) {
300
+ logits[llama_token_nl()] = nl_logit;
301
+ }
302
+
303
+ if (temp <= 0) {
304
+ // Greedy sampling
305
+ id = llama_sample_token_greedy(ctx, &candidates_p);
306
+ } else {
307
+ if (mirostat == 1) {
308
+ static float mirostat_mu = 2.0f * mirostat_tau;
309
+ const int mirostat_m = 100;
310
+ llama_sample_temperature(ctx, &candidates_p, temp);
311
+ id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
312
+ } else if (mirostat == 2) {
313
+ static float mirostat_mu = 2.0f * mirostat_tau;
314
+ llama_sample_temperature(ctx, &candidates_p, temp);
315
+ id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
316
+ } else {
317
+ // Temperature sampling
318
+ llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
319
+ llama_sample_typical(ctx, &candidates_p, typical_p, 1);
320
+ llama_sample_top_p(ctx, &candidates_p, top_p, 1);
321
+ llama_sample_top_k(ctx, &candidates_p, top_k, 1);
322
+ llama_sample_temperature(ctx, &candidates_p, temp);
323
+ id = llama_sample_token(ctx, &candidates_p);
324
+ }
325
+ }
326
+ last_n_tokens.erase(last_n_tokens.begin());
327
+ last_n_tokens.push_back(id);
328
+ num_tokens_predicted++;
329
  }
 
330
 
331
+ // add it to the context
332
+ embd.push_back(id);
 
 
333
  result = id;
334
+ // decrement remaining sampling budget
335
+ --n_remain;
336
+
337
+ if (!embd.empty() && embd.back() == llama_token_eos()) {
338
+ //stopping_word = llama_token_to_str(ctx, embd.back());
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
  has_next_token = false;
340
+ stopped_eos = true;
341
+ LOG_VERBOSE("eos token found", {});
342
  return result;
 
343
  }
 
 
 
 
 
 
344
 
345
+ has_next_token = params.n_predict == -1 || n_remain != 0;
346
+ return result;
347
  }
348
 
349
+ size_t findStoppingStrings(const std::string & text, const size_t last_token_size,
350
+ const stop_type type) {
351
+ size_t stop_pos = std::string::npos;
352
+ for (const std::string & word : params.antiprompt) {
353
+ size_t pos;
354
+ if (type == STOP_FULL) {
355
+ const size_t tmp = word.size() + last_token_size;
356
+ const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
357
+ pos = text.find(word, from_pos);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
  }
359
+ else {
360
+ pos = find_partial_stop_string(word, text);
 
361
  }
362
+ if (pos != std::string::npos &&
363
+ (stop_pos == std::string::npos || pos < stop_pos)) {
364
+ if (type == STOP_FULL) {
365
+ stopping_word = word;
366
+ stopped_word = true;
367
+ has_next_token = false;
368
+ }
369
+ stop_pos = pos;
370
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
  }
372
+ return stop_pos;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  }
 
 
 
 
 
 
374
 
375
+ std::string doCompletion() {
376
+ const llama_token token = nextToken();
377
+
378
+ const std::string token_text = token == -1 ? "" : llama_token_to_str(ctx, token);
379
+ generated_text += token_text;
380
+
381
+ if (multibyte_pending > 0) {
382
+ multibyte_pending -= token_text.size();
383
+ } else if (token_text.size() == 1) {
384
+ const char c = token_text[0];
385
+ // 2-byte characters: 110xxxxx 10xxxxxx
386
+ if ((c & 0xE0) == 0xC0) {
387
+ multibyte_pending = 1;
388
+ // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
389
+ } else if ((c & 0xF0) == 0xE0) {
390
+ multibyte_pending = 2;
391
+ // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
392
+ } else if ((c & 0xF8) == 0xF0) {
393
+ multibyte_pending = 3;
394
+ } else {
395
+ multibyte_pending = 0;
396
+ }
397
+ }
398
 
399
+ if (multibyte_pending > 0 && !has_next_token) {
400
+ has_next_token = true;
401
+ n_remain++;
402
+ }
403
 
404
+ if (!has_next_token && n_remain == 0) {
405
+ stopped_limit = true;
406
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
 
408
+ LOG_VERBOSE("next token", {
409
+ { "token", token },
410
+ { "token_text", llama_token_to_str(ctx, token) },
411
+ { "has_next_token", has_next_token },
412
+ { "n_remain", n_remain },
413
+ { "num_tokens_predicted", num_tokens_predicted },
414
+ { "stopped_eos", stopped_eos },
415
+ { "stopped_word", stopped_word },
416
+ { "stopped_limit", stopped_limit },
417
+ { "stopping_word", stopping_word },
418
+ });
419
+
420
+ return token_text;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
  }
422
+ };
423
+
424
+ static void server_print_usage(const char * argv0, const gpt_params & params,
425
+ const server_params & sparams) {
426
+ fprintf(stderr, "usage: %s [options]\n", argv0);
427
+ fprintf(stderr, "\n");
428
+ fprintf(stderr, "options:\n");
429
+ fprintf(stderr, " -h, --help show this help message and exit\n");
430
+ fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
431
+ fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
432
+ fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
433
+ fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
434
+ fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
435
+ fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n");
436
+ if (llama_mlock_supported()) {
437
+ fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
438
  }
439
+ if (llama_mmap_supported()) {
440
+ fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
 
441
  }
 
 
 
 
 
 
 
442
  #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
443
+ fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
444
+ fprintf(stderr, " number of layers to store in VRAM\n");
445
+ fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
446
+ fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
447
+ fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
448
+ fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
449
+ fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
450
+ #endif
451
+ fprintf(stderr, " -m FNAME, --model FNAME\n");
452
+ fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
453
+ fprintf(stderr, " -a ALIAS, --alias ALIAS\n");
454
+ fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n");
455
+ fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
456
+ fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
457
+ fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
458
+ fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);
459
+ fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
460
+ fprintf(stderr, "\n");
461
+ }
462
+
463
+ static void server_params_parse(int argc, char ** argv, server_params & sparams,
464
+ gpt_params & params) {
465
+ gpt_params default_params;
466
+ server_params default_sparams;
467
+ std::string arg;
468
+ bool invalid_param = false;
469
+
470
+ for (int i = 1; i < argc; i++) {
471
+ arg = argv[i];
472
+ if (arg == "--port") {
473
+ if (++i >= argc) {
474
+ invalid_param = true;
475
+ break;
476
+ }
477
+ sparams.port = std::stoi(argv[i]);
478
+ } else if (arg == "--host") {
479
+ if (++i >= argc) {
480
+ invalid_param = true;
481
+ break;
482
+ }
483
+ sparams.hostname = argv[i];
484
+ } else if (arg == "--timeout" || arg == "-to") {
485
+ if (++i >= argc) {
486
+ invalid_param = true;
487
+ break;
488
+ }
489
+ sparams.read_timeout = std::stoi(argv[i]);
490
+ sparams.write_timeout = std::stoi(argv[i]);
491
+ } else if (arg == "-m" || arg == "--model") {
492
+ if (++i >= argc) {
493
+ invalid_param = true;
494
+ break;
495
+ }
496
+ params.model = argv[i];
497
+ } else if (arg == "-a" || arg == "--alias") {
498
+ if (++i >= argc) {
499
+ invalid_param = true;
500
+ break;
501
+ }
502
+ params.model_alias = argv[i];
503
+ } else if (arg == "-h" || arg == "--help") {
504
+ server_print_usage(argv[0], default_params, default_sparams);
505
+ exit(0);
506
+ } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
507
+ if (++i >= argc) {
508
+ invalid_param = true;
509
+ break;
510
+ }
511
+ params.n_ctx = std::stoi(argv[i]);
512
+ } else if (arg == "--memory-f32" || arg == "--memory_f32") {
513
+ params.memory_f16 = false;
514
+ } else if (arg == "--threads" || arg == "-t") {
515
+ if (++i >= argc) {
516
+ invalid_param = true;
517
+ break;
518
+ }
519
+ params.n_threads = std::stoi(argv[i]);
520
+ } else if (arg == "-b" || arg == "--batch-size") {
521
+ if (++i >= argc) {
522
+ invalid_param = true;
523
+ break;
524
+ }
525
+ params.n_batch = std::stoi(argv[i]);
526
+ params.n_batch = std::min(512, params.n_batch);
527
+ } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
528
+ if (++i >= argc) {
529
+ invalid_param = true;
530
+ break;
531
+ }
532
+ #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
533
+ params.n_gpu_layers = std::stoi(argv[i]);
534
  #else
535
+ LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
536
+ "See main README.md for information on enabling GPU BLAS support", { { "n_gpu_layers", params.n_gpu_layers } });
537
  #endif
538
+ }
539
+ else if (arg == "--tensor-split" || arg == "-ts") {
540
+ if (++i >= argc) {
541
+ invalid_param = true;
542
+ break;
543
+ }
 
 
544
  #ifdef GGML_USE_CUBLAS
545
+ std::string arg_next = argv[i];
546
 
547
+ // split string by , and /
548
+ const std::regex regex{ R"([,/]+)" };
549
+ std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
550
+ std::vector<std::string> split_arg{ it, {} };
551
+ GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
552
 
553
+ for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) {
554
+ if (i_device < split_arg.size()) {
555
+ params.tensor_split[i_device] = std::stof(split_arg[i_device]);
556
+ }
557
+ else {
558
+ params.tensor_split[i_device] = 0.0f;
559
+ }
560
+ }
561
+ #else
562
+ LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {});
563
+ #endif // GGML_USE_CUBLAS
564
  }
565
+ else if (arg == "--low-vram" || arg == "-lv")
566
  {
567
+ #ifdef GGML_USE_CUBLAS
568
+ params.low_vram = true;
 
569
  #else
570
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
571
  #endif // GGML_USE_CUBLAS
572
+ }
573
+ else if (arg == "--main-gpu" || arg == "-mg") {
574
+ if (++i >= argc) {
575
+ invalid_param = true;
576
+ break;
577
+ }
 
 
578
  #ifdef GGML_USE_CUBLAS
579
+ params.main_gpu = std::stoi(argv[i]);
580
  #else
581
+ LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
582
  #endif
583
+ } else if (arg == "--lora") {
584
+ if (++i >= argc) {
585
+ invalid_param = true;
586
+ break;
587
+ }
588
+ params.lora_adapter = argv[i];
589
+ params.use_mmap = false;
590
+ } else if (arg == "--lora-base") {
591
+ if (++i >= argc) {
592
+ invalid_param = true;
593
+ break;
594
+ }
595
+ params.lora_base = argv[i];
596
+ } else if (arg == "-v" || arg == "--verbose") {
597
+ #if SERVER_VERBOSE != 1
598
+ LOG_WARNING("server.cpp is not built with verbose logging.", {});
599
+ #else
600
+ server_verbose = true;
601
+ #endif
602
+ } else if (arg == "--mlock") {
603
+ params.use_mlock = true;
604
+ } else if (arg == "--no-mmap") {
605
+ params.use_mmap = false;
606
+ } else {
607
+ fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
608
+ server_print_usage(argv[0], default_params, default_sparams);
609
+ exit(1);
610
+ }
611
  }
612
+
613
+ if (invalid_param) {
614
+ fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
615
+ server_print_usage(argv[0], default_params, default_sparams);
616
+ exit(1);
617
  }
 
 
 
 
 
 
 
 
 
618
  }
619
 
620
+ static json format_generation_settings(llama_server_context & llama) {
621
+ const auto eos_bias = llama.params.logit_bias.find(llama_token_eos());
622
+ const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
623
+ eos_bias->second < 0.0f && std::isinf(eos_bias->second);
624
+
625
+ return json {
626
+ { "seed", llama.params.seed },
627
+ { "temp", llama.params.temp },
628
+ { "top_k", llama.params.top_k },
629
+ { "top_p", llama.params.top_p },
630
+ { "tfs_z", llama.params.tfs_z },
631
+ { "typical_p", llama.params.typical_p },
632
+ { "repeat_last_n", llama.params.repeat_last_n },
633
+ { "repeat_penalty", llama.params.repeat_penalty },
634
+ { "presence_penalty", llama.params.presence_penalty },
635
+ { "frequency_penalty", llama.params.frequency_penalty },
636
+ { "mirostat", llama.params.mirostat },
637
+ { "mirostat_tau", llama.params.mirostat_tau },
638
+ { "mirostat_eta", llama.params.mirostat_eta },
639
+ { "penalize_nl", llama.params.penalize_nl },
640
+ { "stop", llama.params.antiprompt },
641
+ { "n_predict", llama.params.n_predict },
642
+ { "n_keep", llama.params.n_keep },
643
+ { "ignore_eos", ignore_eos },
644
+ { "stream", llama.stream },
645
+ { "logit_bias", llama.params.logit_bias },
646
+ };
647
+ }
648
+
649
+ static json format_final_response(llama_server_context & llama, const std::string & content) {
650
+ return json {
651
+ { "content", content },
652
+ { "stop", true },
653
+ { "model", llama.params.model_alias },
654
+ { "tokens_predicted", llama.num_tokens_predicted },
655
+ { "generation_settings", format_generation_settings(llama) },
656
+ { "prompt", llama.params.prompt },
657
+ { "truncated", llama.truncated },
658
+ { "stopped_eos", llama.stopped_eos },
659
+ { "stopped_word", llama.stopped_word },
660
+ { "stopped_limit", llama.stopped_limit },
661
+ { "stopping_word", llama.stopping_word },
662
+ };
663
+ }
664
+
665
+ static json format_partial_response(const std::string & content) {
666
+ return json {
667
+ { "content", content },
668
+ { "stop", false },
669
+ };
670
+ }
671
+
672
+ static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
673
+ return json {
674
+ { "tokens", tokens }
675
+ };
676
+ }
677
+
678
+ static void parse_options_completion(const json & body, llama_server_context & llama) {
679
+ gpt_params default_params;
680
+
681
+ llama.stream = body.value("stream", false);
682
+ llama.params.n_predict = body.value("n_predict", default_params.n_predict);
683
+ llama.params.top_k = body.value("top_k", default_params.top_k);
684
+ llama.params.top_p = body.value("top_p", default_params.top_p);
685
+ llama.params.tfs_z = body.value("tfs_z", default_params.tfs_z);
686
+ llama.params.typical_p = body.value("typical_p", default_params.typical_p);
687
+ llama.params.repeat_last_n = body.value("repeat_last_n", default_params.repeat_last_n);
688
+ llama.params.temp = body.value("temperature", default_params.temp);
689
+ llama.params.repeat_penalty = body.value("repeat_penalty", default_params.repeat_penalty);
690
+ llama.params.presence_penalty = body.value("presence_penalty", default_params.presence_penalty);
691
+ llama.params.frequency_penalty = body.value("frequency_penalty", default_params.frequency_penalty);
692
+ llama.params.mirostat = body.value("mirostat", default_params.mirostat);
693
+ llama.params.mirostat_tau = body.value("mirostat_tau", default_params.mirostat_tau);
694
+ llama.params.mirostat_eta = body.value("mirostat_eta", default_params.mirostat_eta);
695
+ llama.params.penalize_nl = body.value("penalize_nl", default_params.penalize_nl);
696
+ llama.params.n_keep = body.value("n_keep", default_params.n_keep);
697
+ llama.params.seed = body.value("seed", default_params.seed);
698
+ llama.params.prompt = body.value("prompt", default_params.prompt);
699
+
700
+ llama.params.logit_bias.clear();
701
+ if (body.value("ignore_eos", false)) {
702
+ llama.params.logit_bias[llama_token_eos()] = -INFINITY;
703
+ }
704
+
705
+ const auto & logit_bias = body.find("logit_bias");
706
+ if (logit_bias != body.end() && logit_bias->is_array()) {
707
+ const int n_vocab = llama_n_vocab(llama.ctx);
708
+ for (const auto & el : *logit_bias) {
709
+ if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) {
710
+ llama_token tok = el[0].get<llama_token>();
711
+ if (tok >= 0 && tok < n_vocab) {
712
+ if (el[1].is_number()) {
713
+ llama.params.logit_bias[tok] = el[1].get<float>();
714
+ } else if (el[1].is_boolean() && !el[1].get<bool>()) {
715
+ llama.params.logit_bias[tok] = -INFINITY;
716
+ }
717
+ }
718
+ }
719
+ }
720
  }
721
+
722
+ llama.params.antiprompt.clear();
723
+ const auto & stop = body.find("stop");
724
+ if (stop != body.end() && stop->is_array()) {
725
+ for (const auto & word : *stop) {
726
+ if (!word.empty()) {
727
+ llama.params.antiprompt.push_back(word);
728
+ }
729
+ }
730
  }
731
+
732
+ LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
733
  }
734
 
735
+ static void log_server_request(const Request & req, const Response & res) {
736
+ LOG_INFO("request", {
737
+ { "remote_addr", req.remote_addr },
738
+ { "remote_port", req.remote_port },
739
+ { "status", res.status },
740
+ { "path", req.path },
741
+ { "request", req.body },
742
+ { "response", res.body },
743
+ });
744
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
745
 
746
+ int main(int argc, char ** argv) {
747
+ // own arguments required by this example
748
+ gpt_params params;
749
+ server_params sparams;
 
 
 
 
 
 
 
 
 
 
 
 
750
 
751
+ // struct that contains llama context and inference
752
+ llama_server_context llama;
753
+
754
+ server_params_parse(argc, argv, sparams, params);
755
+
756
+ if (params.model_alias == "unknown") {
757
+ params.model_alias = params.model;
758
+ }
759
+
760
+ llama_init_backend();
761
+
762
+ LOG_INFO("build info", {
763
+ { "build", BUILD_NUMBER },
764
+ { "commit", BUILD_COMMIT }
765
+ });
766
+ LOG_INFO("system info", {
767
+ { "n_threads", params.n_threads },
768
+ { "total_threads", std::thread::hardware_concurrency() },
769
+ { "system_info", llama_print_system_info() },
770
+ });
771
+
772
+ // load the model
773
+ if (!llama.loadModel(params)) {
774
+ return 1;
775
+ }
776
+
777
+ Server svr;
778
+
779
+ svr.set_default_headers({
780
+ { "Access-Control-Allow-Origin", "*" },
781
+ { "Access-Control-Allow-Headers", "content-type" }
782
+ });
783
+
784
+ svr.Get("/", [](const Request &, Response & res) {
785
+ res.set_content("<h1>llama.cpp server works</h1>", "text/html");
786
+ });
787
+
788
+ svr.Post("/completion", [&llama](const Request & req, Response & res) {
789
+ llama.rewind();
790
+ llama_reset_timings(llama.ctx);
791
+
792
+ parse_options_completion(json::parse(req.body), llama);
793
+
794
+ llama.loadPrompt();
795
+ llama.beginCompletion();
796
+
797
+ if (!llama.stream) {
798
+ size_t stop_pos = std::string::npos;
799
+
800
+ while (llama.has_next_token) {
801
+ const std::string token_text = llama.doCompletion();
802
+
803
+ stop_pos = llama.findStoppingStrings(llama.generated_text,
804
+ token_text.size(), STOP_FULL);
805
  }
806
+
807
+ if (stop_pos == std::string::npos) {
808
+ stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
 
 
809
  }
810
+ if (stop_pos != std::string::npos) {
811
+ llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
812
+ llama.generated_text.end());
 
 
 
 
 
 
 
 
813
  }
 
814
 
815
+ const json data = format_final_response(llama, llama.generated_text);
816
+
817
+ llama_print_timings(llama.ctx);
818
+
819
+ res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
820
+ "application/json");
821
+ } else {
822
+ const auto chunked_content_provider = [&](size_t, DataSink & sink) {
823
+ size_t sent_count = 0;
824
+
825
+ while (llama.has_next_token) {
826
+ const std::string token_text = llama.doCompletion();
827
+ if (llama.multibyte_pending > 0) {
828
+ continue;
829
+ }
830
+
831
+ size_t pos = std::min(sent_count, llama.generated_text.size());
832
+
833
+ const std::string str_test = llama.generated_text.substr(pos);
834
+ size_t stop_pos =
835
+ llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
836
+ if (stop_pos != std::string::npos) {
837
+ llama.generated_text.erase(
838
+ llama.generated_text.begin() + pos + stop_pos,
839
+ llama.generated_text.end());
840
+ pos = std::min(sent_count, llama.generated_text.size());
841
+ } else {
842
+ stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
843
+ STOP_PARTIAL);
844
+ }
845
+
846
+ const std::string to_send = llama.generated_text.substr(pos, stop_pos);
847
+ sent_count += to_send.size();
848
+
849
+ const json data = llama.has_next_token
850
+ ? format_partial_response(to_send)
851
+ // Generation is done, send extra information.
852
+ : format_final_response(llama, to_send);
853
+
854
+ const std::string str =
855
+ "data: " +
856
+ data.dump(-1, ' ', false, json::error_handler_t::replace) +
857
+ "\n\n";
858
+
859
+ LOG_VERBOSE("data stream", {
860
+ { "to_send", str }
861
+ });
862
+
863
+ if (!sink.write(str.data(), str.size())) {
864
+ LOG_VERBOSE("stream closed", {});
865
+ llama_print_timings(llama.ctx);
866
+ return false;
867
+ }
868
+ }
869
 
870
+ llama_print_timings(llama.ctx);
871
+ sink.done();
872
+ return true;
873
+ };
874
+ res.set_chunked_content_provider("text/event-stream", chunked_content_provider);
875
+ }
876
+ });
877
+
878
+ svr.Options(R"(/.*)", [](const Request &, Response & res) {
879
+ return res.set_content("", "application/json");
880
+ });
881
+
882
+ svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
883
+ const json body = json::parse(req.body);
884
+ const std::string content = body["content"].get<std::string>();
885
+ const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
886
+ const json data = format_tokenizer_response(tokens);
887
+ return res.set_content(data.dump(), "application/json");
888
+ });
889
+
890
+ svr.set_logger(log_server_request);
891
+
892
+ svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) {
893
+ const auto * fmt = "500 Internal Server Error\n%s";
894
+ char buf[BUFSIZ];
895
+ try {
896
+ std::rethrow_exception(std::move(ep));
897
+ } catch (std::exception & e) {
898
+ snprintf(buf, sizeof(buf), fmt, e.what());
899
+ } catch (...) {
900
+ snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
901
+ }
902
+ res.set_content(buf, "text/plain");
903
+ res.status = 500;
904
+ });
905
+
906
+ // set timeouts and change hostname and port
907
+ svr.set_read_timeout(sparams.read_timeout);
908
+ svr.set_write_timeout(sparams.write_timeout);
909
+
910
+ if (!svr.bind_to_port(sparams.hostname, sparams.port)) {
911
+ LOG_ERROR("couldn't bind to server socket", {
912
+ { "hostname", sparams.hostname },
913
+ { "port", sparams.port },
914
+ });
915
+ return 1;
916
+ }
917
+
918
+ LOG_INFO("HTTP server listening", {
919
+ { "hostname", sparams.hostname },
920
+ { "port", sparams.port },
921
+ });
922
+
923
+ if (!svr.listen_after_bind()) {
924
+ return 1;
925
+ }
926
 
927
+ return 0;
 
928
  }
examples/simple/CMakeLists.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# Minimal "simple" example: a single-source executable linked against the
# common helpers and the llama library.
set(TARGET simple)

add_executable(${TARGET} simple.cpp)
target_link_libraries(${TARGET}
    PRIVATE
        common
        llama
        ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

# Only depend on BUILD_INFO when the enclosing project defines that target.
if(TARGET BUILD_INFO)
    add_dependencies(${TARGET} BUILD_INFO)
endif()
examples/simple/simple.cpp ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef _GNU_SOURCE
2
+ #define _GNU_SOURCE
3
+ #endif
4
+
5
+ #include "common.h"
6
+ #include "llama.h"
7
+ #include "build-info.h"
8
+
9
+ #include <cassert>
10
+ #include <cinttypes>
11
+ #include <cmath>
12
+ #include <cstdio>
13
+ #include <cstring>
14
+ #include <ctime>
15
+ #include <fstream>
16
+ #include <iostream>
17
+ #include <string>
18
+ #include <vector>
19
+
20
+ #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
21
+ #include <signal.h>
22
+ #include <unistd.h>
23
+ #elif defined (_WIN32)
24
+ #define WIN32_LEAN_AND_MEAN
25
+ #define NOMINMAX
26
+ #include <windows.h>
27
+ #include <signal.h>
28
+ #endif
29
+
30
+
31
+
32
+ int main(int argc, char ** argv)
33
+ {
34
+ gpt_params params;
35
+
36
+ //---------------------------------
37
+ // Print help :
38
+ //---------------------------------
39
+
40
+ if ( argc == 1 || argv[1][0] == '-' )
41
+ {
42
+ printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
43
+ return 1 ;
44
+ }
45
+
46
+ //---------------------------------
47
+ // Load parameters :
48
+ //---------------------------------
49
+
50
+ if ( argc >= 2 )
51
+ {
52
+ params.model = argv[1];
53
+ }
54
+
55
+ if ( argc >= 3 )
56
+ {
57
+ params.prompt = argv[2];
58
+ }
59
+
60
+ if ( params.prompt.empty() )
61
+ {
62
+ params.prompt = "Hello my name is";
63
+ }
64
+
65
+ //---------------------------------
66
+ // Init LLM :
67
+ //---------------------------------
68
+
69
+ llama_init_backend();
70
+
71
+ llama_context * ctx ;
72
+
73
+ ctx = llama_init_from_gpt_params( params );
74
+
75
+ if ( ctx == NULL )
76
+ {
77
+ fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
78
+ return 1;
79
+ }
80
+
81
+ //---------------------------------
82
+ // Tokenize the prompt :
83
+ //---------------------------------
84
+
85
+ std::vector<llama_token> tokens_list;
86
+ tokens_list = ::llama_tokenize( ctx , params.prompt , true );
87
+
88
+ const int max_context_size = llama_n_ctx( ctx );
89
+ const int max_tokens_list_size = max_context_size - 4 ;
90
+
91
+ if ( (int)tokens_list.size() > max_tokens_list_size )
92
+ {
93
+ fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
94
+ __func__ , (int)tokens_list.size() , max_tokens_list_size );
95
+ return 1;
96
+ }
97
+
98
+ fprintf( stderr, "\n\n" );
99
+
100
+ // Print the tokens from the prompt :
101
+
102
+ for( auto id : tokens_list )
103
+ {
104
+ printf( "%s" , llama_token_to_str( ctx , id ) );
105
+ }
106
+
107
+ fflush(stdout);
108
+
109
+
110
+ //---------------------------------
111
+ // Main prediction loop :
112
+ //---------------------------------
113
+
114
+ // The LLM keeps a contextual cache memory of previous token evaluation.
115
+ // Usually, once this cache is full, it is required to recompute a compressed context based on previous
116
+ // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
117
+ // example, we will just stop the loop once this cache is full or once an end of stream is detected.
118
+
119
+ while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
120
+ {
121
+ //---------------------------------
122
+ // Evaluate the tokens :
123
+ //---------------------------------
124
+
125
+ if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
126
+ {
127
+ fprintf( stderr, "%s : failed to eval\n" , __func__ );
128
+ return 1;
129
+ }
130
+
131
+ tokens_list.clear();
132
+
133
+ //---------------------------------
134
+ // Select the best prediction :
135
+ //---------------------------------
136
+
137
+ llama_token new_token_id = 0;
138
+
139
+ auto logits = llama_get_logits( ctx );
140
+ auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
141
+
142
+ std::vector<llama_token_data> candidates;
143
+ candidates.reserve( n_vocab );
144
+
145
+ for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
146
+ {
147
+ candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
148
+ }
149
+
150
+ llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
151
+
152
+ // Select it using the "Greedy sampling" method :
153
+ new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
154
+
155
+
156
+ // is it an end of stream ?
157
+ if ( new_token_id == llama_token_eos() )
158
+ {
159
+ fprintf(stderr, " [end of text]\n");
160
+ break;
161
+ }
162
+
163
+ // Print the new token :
164
+ printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
165
+ fflush( stdout );
166
+
167
+ // Push this new token for next evaluation :
168
+ tokens_list.push_back( new_token_id );
169
+
170
+ } // wend of main loop
171
+
172
+ llama_free( ctx );
173
+
174
+ return 0;
175
+ }
176
+
177
+ // EOF
examples/train-text-from-scratch/README.md CHANGED
@@ -4,7 +4,7 @@ Basic usage instructions:
4
 
5
  ```bash
6
  # get training data
7
- wget https://github.com/brunoklein99/deep-learning-notes/blob/master/shakespeare.txt
8
 
9
  # train
10
  ./bin/train-text-from-scratch \
 
4
 
5
  ```bash
6
  # get training data
7
+ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
8
 
9
  # train
10
  ./bin/train-text-from-scratch \
examples/train-text-from-scratch/train-text-from-scratch.cpp CHANGED
@@ -12,6 +12,9 @@
12
  #include <algorithm>
13
  #include <string>
14
 
 
 
 
15
 
16
  struct random_normal_distribution {
17
  std::mt19937 gen;
@@ -20,7 +23,6 @@ struct random_normal_distribution {
20
  float max;
21
  };
22
 
23
-
24
  struct random_uniform_distribution {
25
  std::mt19937 gen;
26
  std::uniform_real_distribution<float> rd;
@@ -2366,7 +2368,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
2366
  file->write_u32(0);
2367
  file->write_u32(0);
2368
  file->write_u32(GGML_TYPE_F32);
2369
- file->seek(-file->tell() & 31, SEEK_CUR);
2370
  return;
2371
  }
2372
  const char * name = ggml_get_name(tensor);
@@ -2381,7 +2383,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
2381
  file->write_u32(tensor->type);
2382
  file->write_raw(ne, sizeof(ne[0]) * nd);
2383
  file->write_raw(name, name_len);
2384
- file->seek(-file->tell() & 31, SEEK_CUR);
2385
  file->write_raw(tensor->data, ggml_nbytes(tensor));
2386
  }
2387
 
@@ -2402,7 +2404,7 @@ void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
2402
  std::string name = file->read_string(name_len);
2403
  GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0);
2404
 
2405
- file->seek(-file->tell() & 31, SEEK_CUR);
2406
  file->read_raw(tensor->data, ggml_nbytes(tensor));
2407
  }
2408
 
@@ -2756,8 +2758,8 @@ struct train_params get_default_train_params() {
2756
 
2757
  params.lbfgs_n_iter = 16;
2758
  params.adam_n_iter = 16;
2759
- params.adam_alpha = 1e-3;
2760
- params.adam_decay = 1e-3;
2761
 
2762
  params.mem_model_gb = 2;
2763
  params.mem_compute_gb = 24;
@@ -3331,8 +3333,8 @@ int main(int argc, char ** argv) {
3331
  int n_gen = params.n_predict;
3332
  int sample_ctx = n_tokens - n_tokens/8;
3333
 
3334
- sampler.params.temp = 0.2;
3335
- sampler.params.repeat_penalty = 1.1;
3336
  sampler.params.mirostat = 2;
3337
  init_sampler(&sampler, lctx);
3338
 
 
12
  #include <algorithm>
13
  #include <string>
14
 
15
+ #if defined(_MSC_VER)
16
+ #pragma warning(disable: 4244 4267) // possible loss of data
17
+ #endif
18
 
19
  struct random_normal_distribution {
20
  std::mt19937 gen;
 
23
  float max;
24
  };
25
 
 
26
  struct random_uniform_distribution {
27
  std::mt19937 gen;
28
  std::uniform_real_distribution<float> rd;
 
2368
  file->write_u32(0);
2369
  file->write_u32(0);
2370
  file->write_u32(GGML_TYPE_F32);
2371
+ file->seek(0-file->tell() & 31, SEEK_CUR);
2372
  return;
2373
  }
2374
  const char * name = ggml_get_name(tensor);
 
2383
  file->write_u32(tensor->type);
2384
  file->write_raw(ne, sizeof(ne[0]) * nd);
2385
  file->write_raw(name, name_len);
2386
+ file->seek(0-file->tell() & 31, SEEK_CUR);
2387
  file->write_raw(tensor->data, ggml_nbytes(tensor));
2388
  }
2389
 
 
2404
  std::string name = file->read_string(name_len);
2405
  GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0);
2406
 
2407
+ file->seek(0-file->tell() & 31, SEEK_CUR);
2408
  file->read_raw(tensor->data, ggml_nbytes(tensor));
2409
  }
2410
 
 
2758
 
2759
  params.lbfgs_n_iter = 16;
2760
  params.adam_n_iter = 16;
2761
+ params.adam_alpha = 1e-3f;
2762
+ params.adam_decay = 1e-3f;
2763
 
2764
  params.mem_model_gb = 2;
2765
  params.mem_compute_gb = 24;
 
3333
  int n_gen = params.n_predict;
3334
  int sample_ctx = n_tokens - n_tokens/8;
3335
 
3336
+ sampler.params.temp = 0.2f;
3337
+ sampler.params.repeat_penalty = 1.1f;
3338
  sampler.params.mirostat = 2;
3339
  init_sampler(&sampler, lctx);
3340
 
ggml-cuda.cu CHANGED
@@ -1,5 +1,6 @@
1
  #include <cstddef>
2
  #include <cstdint>
 
3
  #include <stdint.h>
4
  #include <stdio.h>
5
  #include <atomic>
@@ -24,7 +25,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
24
  } \
25
  } while (0)
26
 
27
- #if CUDART_VERSION >= 12
28
  #define CUBLAS_CHECK(err) \
29
  do { \
30
  cublasStatus_t err_ = (err); \
@@ -48,6 +49,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
48
  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
49
  typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
50
  typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
 
51
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
52
  typedef void (*ggml_cuda_op_t)(
53
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
@@ -151,7 +153,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
151
  #define CUDA_ADD_BLOCK_SIZE 256
152
  #define CUDA_MUL_BLOCK_SIZE 256
153
  #define CUDA_SILU_BLOCK_SIZE 256
 
 
154
  #define CUDA_ROPE_BLOCK_SIZE 256
 
155
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
156
 
157
  // dmmv = dequantize_mul_mat_vec
@@ -162,6 +167,12 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
162
  #define GGML_CUDA_DMMV_Y 1
163
  #endif
164
 
 
 
 
 
 
 
165
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
166
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
167
 
@@ -321,37 +332,6 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
321
 
322
  }
323
 
324
- static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
325
-
326
- const block_q2_K * x = (const block_q2_K *) vx;
327
-
328
- // if n is 0, we want to do the lower 128, else the upper 128,
329
- // covering y[l+0], y[l+32], y[l+64], y[l+96] and
330
- // y[l+16], y[l+48], y[l+80], y[l+112]
331
- int n = iqs/128; // 0 or 1
332
- int r = iqs - 128*n; // 0...120 in steps of 8
333
- int l = r/8; // 0...15 in steps of 1
334
-
335
- const float * y = yy + 128*n + l;
336
- const uint8_t * q = x[ib].qs + 32*n + l;
337
- const uint8_t * s = x[ib].scales + 8*n;
338
-
339
- const float dall = x[ib].d;
340
- const float dmin = x[ib].dmin;
341
-
342
- float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
343
- + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
344
- + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
345
- + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
346
- + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
347
- + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
348
- + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
349
- + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
350
-
351
- result = sum;
352
-
353
- }
354
-
355
  static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
356
 
357
  int r = threadIdx.x/4;
@@ -383,51 +363,6 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
383
 
384
  }
385
 
386
- static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
387
-
388
- const block_q3_K * x = (const block_q3_K *) vx;
389
-
390
- const uint32_t kmask1 = 0x03030303;
391
- const uint32_t kmask2 = 0x0f0f0f0f;
392
-
393
- uint32_t aux[3];
394
- uint32_t utmp[4];
395
-
396
- // if n is 0, we want to do the lower 128, else the upper 128,
397
- // covering y[l+0], y[l+32], y[l+64], y[l+96] and
398
- // y[l+16], y[l+48], y[l+80], y[l+112]
399
- int n = iqs/128; // 0 or 1
400
- int r = iqs - 128*n; // 0...120 in steps of 8
401
- int l = r/8; // 0...15 in steps of 1
402
-
403
- const float * y = yy + 128*n + l;
404
- const uint8_t * q = x[ib].qs + 32*n + l;
405
- const uint8_t * hm = x[ib].hmask + l;
406
- const int8_t * s = (const int8_t *)utmp + 8*n;
407
-
408
- memcpy(aux, x[ib].scales, 12);
409
- utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
410
- utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
411
- utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
412
- utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
413
-
414
- const float dall = x[ib].d;
415
-
416
- const uint8_t m = 1 << (4*n);
417
-
418
- float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
419
- + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
420
- + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
421
- + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
422
- + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
423
- + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
424
- + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
425
- + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
426
-
427
- result = sum * dall;
428
-
429
- }
430
-
431
  static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
432
  if (j < 4) {
433
  d = q[j] & 63; m = q[j + 4] & 63;
@@ -474,38 +409,6 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
474
  }
475
  }
476
 
477
- static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
478
-
479
- const block_q4_K * x = (const block_q4_K *) vx;
480
-
481
- // iqs is in 0...248 in steps of 8 =>
482
- const int j = iqs / 64; // j is in 0...3
483
- const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
484
- const int is = 2*j; // is is in 0...6 in steps of 2
485
-
486
- const float * y = yy + 64*j + ir;
487
- const uint8_t * q = x[ib].qs + 32*j + ir;
488
-
489
- const float dall = x[ib].d;
490
- const float dmin = x[ib].dmin;
491
-
492
- uint8_t sc, m;
493
- get_scale_min_k4(is + 0, x[ib].scales, sc, m);
494
- const float d1 = dall * sc;
495
- const float m1 = dmin * m;
496
- get_scale_min_k4(is + 1, x[ib].scales, sc, m);
497
- const float d2 = dall * sc;
498
- const float m2 = dmin * m;
499
-
500
- float sum = 0;
501
- for (int k = 0; k < 4; ++k) {
502
- sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
503
- sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
504
- }
505
- result = sum;
506
-
507
- }
508
-
509
  static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
510
  const block_q5_K * x = (const block_q5_K *) vx;
511
 
@@ -539,43 +442,6 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
539
  y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
540
  }
541
 
542
- static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
543
-
544
- const block_q5_K * x = (const block_q5_K *) vx;
545
-
546
- // iqs is in 0...248 in steps of 8 =>
547
- const int j = iqs / 64; // j is in 0...3
548
- const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
549
- const int is = 2*j; // is is in 0...6 in steps of 2
550
-
551
- const float * y = yy + 64*j + ir;
552
- const uint8_t * ql = x[ib].qs + 32*j + ir;
553
- const uint8_t * qh = x[ib].qh + ir;
554
-
555
- const float dall = x[ib].d;
556
- const float dmin = x[ib].dmin;
557
-
558
- uint8_t sc, m;
559
- get_scale_min_k4(is + 0, x[ib].scales, sc, m);
560
- const float d1 = dall * sc;
561
- const float m1 = dmin * m;
562
- get_scale_min_k4(is + 1, x[ib].scales, sc, m);
563
- const float d2 = dall * sc;
564
- const float m2 = dmin * m;
565
-
566
- uint8_t hm = 1 << is;
567
- float sum = 0;
568
- for (int k = 0; k < 4; ++k) {
569
- sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
570
- }
571
- hm <<= 1;
572
- for (int k = 0; k < 4; ++k) {
573
- sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
574
- }
575
- result = sum;
576
-
577
- }
578
-
579
  static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
580
  const block_q6_K * x = (const block_q6_K *) vx;
581
 
@@ -601,31 +467,376 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
601
  y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
602
  }
603
 
604
- static __device__ void vec_dot_q6_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
605
 
606
- const block_q6_K * x = (const block_q6_K *) vx;
607
 
608
- const int ip = iqs / 128; // 0 or 1
609
- const int il = (iqs - 128*ip)/8; // 0...15
610
- const int is = 8*ip;
611
 
612
- const float * y = yy + 128*ip + il;
 
613
 
614
- const float d = x[ib].d;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
615
 
616
- const uint8_t * ql = x[ib].ql + 64*ip + il;
617
- const uint8_t * qh = x[ib].qh + 32*ip + il;
618
- const int8_t * sc = x[ib].scales + is;
619
 
620
- result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32)
621
- + y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32)
622
- + y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32)
623
- + y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32)
624
- + y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32)
625
- + y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32)
626
- + y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32)
627
- + y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32);
628
 
 
 
 
629
  }
630
 
631
  static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
@@ -655,10 +866,15 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
655
  }
656
 
657
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
658
- static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) {
659
  // qk = quantized weights per x block
660
  // qr = number of quantized weights per data value in x block
661
- const int row = blockIdx.x*blockDim.y + threadIdx.y;
 
 
 
 
 
662
  const int tid = threadIdx.x;
663
 
664
  const int iter_stride = 2*GGML_CUDA_DMMV_X;
@@ -702,27 +918,85 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
702
  }
703
  }
704
 
705
- template <int n_thread, dot_kernel_k_t dot_kernel>
706
- static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols) {
707
- const int row = blockIdx.x*blockDim.y + threadIdx.y;
708
- const int tid = threadIdx.x;
709
 
710
- const int iter_stride = QK_K;
711
- const int vals_per_iter = iter_stride / n_thread;
712
- const int num_blocks_per_row = ncols / QK_K;
713
- const int ib0 = row*num_blocks_per_row;
714
 
715
- float tmp = 0; // partial sum for thread in warp
 
 
716
 
717
- for (int i = 0; i < ncols; i += iter_stride) {
718
- const int col = i + vals_per_iter*tid;
719
- const int ib = ib0 + col/QK_K; // x block index
720
- const int iqs = col%QK_K; // x quant index
721
- const int iybs = col - col%QK_K; // y block start index
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
722
 
723
- float v;
724
- dot_kernel(vx, ib, iqs, y + iybs, v);
725
- tmp += v;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
726
  }
727
 
728
  // sum up partial sums and write back result
@@ -732,11 +1006,51 @@ static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y
732
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
733
  }
734
 
735
- if (tid == 0) {
736
- dst[row] = tmp;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
737
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
738
  }
739
 
 
740
  static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
741
  const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
742
 
@@ -758,6 +1072,72 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
758
  dst[i + 1] = x0*sin_theta + x1*cos_theta;
759
  }
760
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
761
  static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
762
  const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
763
  add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
@@ -831,73 +1211,83 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
831
 
832
  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
833
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
834
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
 
835
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
836
  dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
837
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
838
  }
839
 
840
  static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
841
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
842
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
 
843
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
844
  dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
845
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
846
  }
847
 
848
  static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
849
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
850
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
 
851
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
852
  dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
853
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
854
  }
855
 
856
  static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
857
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
858
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
 
859
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
860
  dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
861
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
862
  }
863
 
864
  static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
865
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
866
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
 
867
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
868
  dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
869
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
870
  }
871
 
872
  static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
873
  GGML_ASSERT(ncols % QK_K == 0);
874
  const int ny = 2;
 
 
875
  const dim3 block_dims(32, ny, 1);
876
- dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<(nrows + ny - 1)/ny, block_dims, 0, stream>>>(vx, y, dst, ncols);
877
  }
878
 
879
  static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
880
  GGML_ASSERT(ncols % QK_K == 0);
881
- const dim3 block_dims(32, 2, 1);
882
- dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
883
  }
884
 
885
  static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
886
  GGML_ASSERT(ncols % QK_K == 0);
887
- const dim3 block_dims(32, 2, 1);
888
- dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
889
  }
890
 
891
  static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
892
  GGML_ASSERT(ncols % QK_K == 0);
893
- const dim3 block_dims(32, 2, 1);
894
- dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
895
  }
896
 
897
  static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
898
  GGML_ASSERT(ncols % QK_K == 0);
899
- const dim3 block_dims(32, 2, 1);
900
- dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<<nrows/2, block_dims, 0, stream>>>(vx, y, dst, ncols);
 
 
 
901
  }
902
 
903
  static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -907,10 +1297,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
907
 
908
  static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
909
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
910
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
 
911
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
912
  dequantize_mul_mat_vec<1, 1, convert_f16>
913
- <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
914
  }
915
 
916
  static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
@@ -942,6 +1333,47 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
942
  }
943
  }
944
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
945
  static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) {
946
  GGML_ASSERT(nrows % 2 == 0);
947
  const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@@ -950,6 +1382,19 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
950
  rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
951
  }
952
 
 
 
 
 
 
 
 
 
 
 
 
 
 
953
  // buffer pool for cuda
954
  #define MAX_CUDA_BUFFERS 256
955
 
@@ -1120,10 +1565,25 @@ void ggml_cuda_host_free(void * ptr) {
1120
  CUDA_CHECK(cudaFreeHost(ptr));
1121
  }
1122
 
1123
- static cudaError_t ggml_cuda_h2d_tensor_2d(
1124
  void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
1125
 
1126
- char * dst_char = (char *) dst;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1127
  const int64_t ne0 = src->ne[0];
1128
  const int64_t nb0 = src->nb[0];
1129
  const int64_t nb1 = src->nb[1];
@@ -1134,17 +1594,17 @@ static cudaError_t ggml_cuda_h2d_tensor_2d(
1134
  const int64_t bs = ggml_blck_size(type);
1135
  int64_t i1_diff = i1_high - i1_low;
1136
 
1137
- const void * x = (const void *) ((const char *) src->data + i1_low*nb1 + i2*nb2 + i3*nb3);
1138
  if (nb0 == ts && nb1 == ts*ne0/bs) {
1139
- return cudaMemcpyAsync(dst_char, x, i1_diff*nb1, cudaMemcpyHostToDevice, stream);
1140
  } else if (nb0 == ts) {
1141
- return cudaMemcpy2DAsync(dst_char, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyHostToDevice, stream);
1142
  } else {
1143
  for (int64_t i1 = 0; i1 < i1_diff; i1++) {
1144
  const void * rx = (const void *) ((const char *) x + i1*nb1);
1145
- void * rd = (void *) (dst_char + i1*ts*ne0/bs);
1146
  // pretend the row is a matrix with cols=1
1147
- cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyHostToDevice, stream);
1148
  if (r != cudaSuccess) return r;
1149
  }
1150
  return cudaSuccess;
@@ -1380,8 +1840,81 @@ inline void ggml_cuda_op_rope(
1380
  (void) i1;
1381
  }
1382
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1383
  static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
1384
- ggml_cuda_op_t op, bool src0_needs_f32) {
1385
  const int64_t ne00 = src0->ne[0];
1386
  const int64_t ne01 = src0->ne[1];
1387
  const int64_t ne02 = src0->ne[2];
@@ -1404,21 +1937,27 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1404
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
1405
 
1406
  // strides for iteration over dims 3 and 2
1407
- const int64_t src0_stride = ne00 * ne01;
1408
- const int64_t src1_stride = ne10 * ne11;
1409
- const int64_t dst_stride = ne0 * ne1;
1410
- const int64_t num_iters = ne02 * ne03;
 
1411
 
1412
  const size_t src0_ts = ggml_type_size(src0->type);
1413
  const size_t src0_bs = ggml_blck_size(src0->type);
1414
 
1415
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
1416
  struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
1417
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
1418
 
1419
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
 
1420
  const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
1421
 
 
 
 
 
1422
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
1423
 
1424
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
@@ -1427,13 +1966,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1427
  char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
1428
  float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
1429
  float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
1430
- float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
1431
 
1432
  // asq = actual size quantized, asf = actual size float
1433
  size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
1434
  size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
1435
  size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
1436
- size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
1437
 
1438
  for (int id = 0; id < g_device_count; ++id) {
1439
  if (!split && id != g_main_device) {
@@ -1446,9 +1985,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1446
  int64_t row_low, row_high;
1447
  if (split) {
1448
  row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
1449
- row_low -= row_low % GGML_CUDA_DMMV_Y;
1450
  row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
1451
- row_high -= row_high % GGML_CUDA_DMMV_Y;
1452
  } else {
1453
  row_low = 0;
1454
  row_high = nrows0;
@@ -1461,7 +1998,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1461
 
1462
  cudaSetDevice(id);
1463
 
1464
- if (src0_on_device) {
1465
  if (src0_is_f32) {
1466
  src0_ddf[id] = (float *) src0_extra->data_device[id];
1467
  } else {
@@ -1479,8 +2016,8 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1479
  src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
1480
  }
1481
 
1482
- if (use_src1) {
1483
- if (src1_on_device) {
1484
  src1_ddf[id] = (float *) src1_extra->data_device[id];
1485
  } else {
1486
  src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
@@ -1493,26 +2030,32 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1493
  dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
1494
  }
1495
 
1496
- for (int64_t i03 = 0; i03 < ne03; i03++) {
 
 
 
 
1497
  const int64_t i13 = i03 % ne13;
1498
- for (int64_t i02 = 0; i02 < ne02; i02++) {
1499
  const int64_t i12 = i02 % ne12;
1500
 
1501
  const int64_t i0 = i03*ne02 + i02;
1502
- const int64_t i0_offset_low = row_low/ne01;
1503
- const int64_t i0_offset_high = row_high/ne01;
 
 
1504
 
1505
  int64_t i01_low = 0;
1506
- int64_t i01_high = ne01;
1507
  if (split) {
1508
  if (i0 < i0_offset_low || i0 > i0_offset_high) {
1509
  continue;
1510
  }
1511
  if (i0 == i0_offset_low) {
1512
- i01_low = row_low % ne01;
1513
  }
1514
  if (i0 == i0_offset_high) {
1515
- i01_high = row_high % ne01;
1516
  }
1517
  }
1518
 
@@ -1521,7 +2064,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1521
  // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
1522
  // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
1523
  GGML_ASSERT(i01_low == 0 || g_device_count > 1);
1524
- GGML_ASSERT(i01_high == ne01 || g_device_count > 1);
1525
 
1526
  const int64_t i01_diff = i01_high - i01_low;
1527
  if (i01_diff == 0) {
@@ -1529,24 +2072,23 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1529
  }
1530
  const int64_t i11 = i13*ne12 + i12;
1531
 
1532
- cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
1533
  cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
1534
- cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
1535
 
1536
  // for split tensors the data begins at i0 == i0_offset_low
1537
  char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
1538
  float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
1539
  float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
1540
- float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
1541
 
1542
  // for split tensors the data pointer needs to be rounded down
1543
  // to the bin edge for i03, i02 bins beyond the first
1544
  if (i0 - i0_offset_low > 0) {
 
1545
  src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
1546
  src0_ddf_i -= (row_low % ne01)*ne00;
1547
- }
1548
- if (i0 - i0_offset_low > 0) {
1549
- dst_ddf_i -= (row_low % ne0)*ne1;
1550
  }
1551
 
1552
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
@@ -1556,30 +2098,37 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1556
  }
1557
 
1558
  // copy src0, src1 to device if necessary
1559
- if (use_src1) {
1560
  if (src1->backend == GGML_BACKEND_CPU) {
1561
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_memcpy_src1));
1562
- } else if (src1->backend == GGML_BACKEND_GPU) {
 
 
1563
  if (id != g_main_device) {
 
1564
  float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
1565
  src1_ddf_i_source += i11*src1_stride;
1566
  CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
1567
  cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
1568
  }
 
 
 
1569
  } else {
1570
  GGML_ASSERT(false);
1571
  }
1572
  }
1573
  CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
1574
- if (!src0_on_device) {
 
1575
  if (src0_is_f32) {
1576
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
1577
  } else {
1578
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
1579
  }
1580
  }
1581
 
1582
- // convert src0 to f32 if it's necessary for the ggml_cuda_op
1583
  if (src0_needs_f32 && !src0_is_f32) {
1584
  to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
1585
  CUDA_CHECK(cudaGetLastError());
@@ -1644,39 +2193,30 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1644
 
1645
  void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1646
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1647
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true);
1648
  }
1649
 
1650
  void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1651
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1652
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true);
1653
  }
1654
 
1655
  void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1656
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1657
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true);
1658
  }
1659
 
1660
  void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1661
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1662
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true);
1663
  }
1664
 
1665
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
1666
- GGML_ASSERT(src0->backend != GGML_BACKEND_GPU);
1667
  const int64_t ne10 = src1->ne[0];
1668
 
1669
  const int64_t ne0 = dst->ne[0];
1670
  const int64_t ne1 = dst->ne[1];
1671
 
1672
- // if (strcmp(dst->name, "KQ") == 0 || strcmp(dst->name, "KQV") == 0) {
1673
- // fprintf(stderr, "(%ld, %ld, %ld, %ld) + (%ld, %ld, %ld, %ld) -> (%ld, %ld, %ld, %ld)\n",
1674
- // src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
1675
- // src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
1676
- // dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
1677
- // return false;
1678
- // }
1679
-
1680
  // TODO: find the optimal values for these
1681
  if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
1682
  src1->type == GGML_TYPE_F32 &&
@@ -1688,23 +2228,158 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
1688
  return false;
1689
  }
1690
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1691
  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1692
- if (src0->type == GGML_TYPE_F32) {
1693
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
 
 
 
 
 
 
 
1694
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
1695
- if (src1->ne[1] == 1) {
1696
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
1697
  } else {
1698
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
1699
  }
1700
  } else {
1701
  GGML_ASSERT(false);
1702
  }
1703
  }
1704
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1705
  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1706
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1707
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true);
1708
  }
1709
 
1710
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -1718,10 +2393,9 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
1718
  const size_t nb1 = tensor->nb[1];
1719
  ggml_backend backend = tensor->backend;
1720
  struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
 
1721
 
1722
  for (int id = 0; id < g_device_count; ++id) {
1723
- extra->data_device[id] = nullptr;
1724
-
1725
  if (backend == GGML_BACKEND_GPU && id != g_main_device) {
1726
  continue;
1727
  }
@@ -1734,10 +2408,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
1734
  row_high = nrows;
1735
  } else if (backend == GGML_BACKEND_GPU_SPLIT) {
1736
  row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
1737
- row_low -= row_low % GGML_CUDA_DMMV_Y;
1738
  row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
1739
- row_high -= row_high % GGML_CUDA_DMMV_Y;
1740
- GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
1741
  } else {
1742
  GGML_ASSERT(false);
1743
  }
@@ -1781,47 +2452,78 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
1781
  delete extra;
1782
  }
1783
 
1784
- void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
1785
- if (tensor->src0 != nullptr && tensor->src0->op == GGML_OP_RESHAPE) {
1786
- ggml_cuda_assign_buffers(tensor);
1787
  }
1788
 
1789
- const size_t size = ggml_nbytes(tensor);
1790
- GGML_ASSERT(size <= g_scratch_size);
1791
- if (g_scratch_offset + size > g_scratch_size) {
1792
- g_scratch_offset = 0;
 
 
 
 
 
1793
  }
1794
 
1795
  tensor->backend = GGML_BACKEND_GPU;
1796
  struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
1797
 
1798
- bool inplace = tensor->src0 != nullptr && tensor->src0->data == tensor->data;
 
 
1799
 
1800
  CUDA_CHECK(cudaSetDevice(g_main_device));
1801
  if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
1802
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
1803
- extra->data_device[g_main_device] = src0_extra->data_device;
1804
- GGML_ASSERT(false);
1805
- } else {
 
 
 
 
 
 
 
 
 
 
 
 
 
1806
  char * data = (char *) g_scratch_buffer;
1807
  if (data == nullptr) {
1808
  CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
1809
  g_scratch_buffer = data;
1810
  }
1811
  extra->data_device[g_main_device] = data + g_scratch_offset;
1812
- }
1813
 
1814
- // fprintf(stderr, "data=%p offset=%ld data_device=%p\n", data, g_scratch_offset, extra->data_device[0]);
1815
- g_scratch_offset += size;
1816
- // fprintf(stderr, "%s: scratch %d, %p - %p\n",
1817
- // tensor->name, g_scratch_index, data + g_scratch_offset, data + g_scratch_offset + size);
 
 
 
 
 
1818
 
1819
- GGML_ASSERT(g_scratch_offset <= g_scratch_size);
1820
  tensor->extra = extra;
1821
  }
1822
 
 
 
 
 
 
 
 
 
1823
  void ggml_cuda_set_main_device(int main_device) {
1824
- if (main_device > g_device_count) {
1825
  fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
1826
  main_device, g_device_count, g_main_device);
1827
  return;
@@ -1838,6 +2540,15 @@ void ggml_cuda_set_scratch_size(size_t scratch_size) {
1838
  g_scratch_size = scratch_size;
1839
  }
1840
 
 
 
 
 
 
 
 
 
 
1841
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
1842
  ggml_cuda_func_t func;
1843
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -1875,12 +2586,39 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
1875
  }
1876
  func = ggml_cuda_mul_mat;
1877
  break;
 
 
 
 
 
 
 
 
 
 
 
 
1878
  case GGML_OP_RESHAPE:
 
 
 
1879
  if (!any_on_device) {
1880
  return false;
1881
  }
1882
  func = ggml_cuda_nop;
1883
  break;
 
 
 
 
 
 
 
 
 
 
 
 
1884
  case GGML_OP_ROPE:
1885
  if (!any_on_device) {
1886
  return false;
 
1
  #include <cstddef>
2
  #include <cstdint>
3
+ #include <limits>
4
  #include <stdint.h>
5
  #include <stdio.h>
6
  #include <atomic>
 
25
  } \
26
  } while (0)
27
 
28
+ #if CUDART_VERSION >= 12000
29
  #define CUBLAS_CHECK(err) \
30
  do { \
31
  cublasStatus_t err_ = (err); \
 
49
  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
50
  typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
51
  typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
52
+ typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
53
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
54
  typedef void (*ggml_cuda_op_t)(
55
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
 
153
  #define CUDA_ADD_BLOCK_SIZE 256
154
  #define CUDA_MUL_BLOCK_SIZE 256
155
  #define CUDA_SILU_BLOCK_SIZE 256
156
+ #define CUDA_CPY_BLOCK_SIZE 32
157
+ #define CUDA_SCALE_BLOCK_SIZE 256
158
  #define CUDA_ROPE_BLOCK_SIZE 256
159
+ #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
160
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
161
 
162
  // dmmv = dequantize_mul_mat_vec
 
167
  #define GGML_CUDA_DMMV_Y 1
168
  #endif
169
 
170
+ #ifndef K_QUANTS_PER_ITERATION
171
+ #define K_QUANTS_PER_ITERATION 2
172
+ #else
173
+ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
174
+ #endif
175
+
176
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
177
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
178
 
 
332
 
333
  }
334
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
336
 
337
  int r = threadIdx.x/4;
 
363
 
364
  }
365
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
  static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
367
  if (j < 4) {
368
  d = q[j] & 63; m = q[j + 4] & 63;
 
409
  }
410
  }
411
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
  static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
413
  const block_q5_K * x = (const block_q5_K *) vx;
414
 
 
442
  y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
443
  }
444
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
  static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
446
  const block_q6_K * x = (const block_q6_K *) vx;
447
 
 
467
  y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
468
  }
469
 
470
// Matrix-vector product with Q2_K-quantized weights: dst[row] = sum_j dequant(x[row][j]) * yy[j].
//
// vx    : block_q2_K blocks, ncols/QK_K consecutive blocks per row
// yy    : float vector with ncols entries
// dst   : float output, one entry per row
// ncols : number of columns of x
// nrows : number of rows of x; threads mapped to rows >= nrows do nothing
//
// Each row is reduced with 32-lane warp shuffles, so the kernel expects blockDim.x == 32
// (this is how dequantize_mul_mat_vec_q2_K_cuda launches it), with rows laid out along y.
static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {

    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

    const int row = blockIdx.y*blockDim.y + threadIdx.y;
    // The launcher rounds the grid up to a multiple of blockDim.y, so row == nrows is reachable
    // whenever nrows is not a multiple of blockDim.y. Valid rows are 0...nrows-1.
    if (row >= nrows) return;

    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const block_q2_K * x = (const block_q2_K *)vx + ib0;

    // With blockDim.x == 32:
    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1: first super-block handled by this thread

    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8

    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
    const int in = tid - step*im;                        // 0...15 or 0...7

    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15, or 0...14 in steps of 2
    const int q_offset = 32*im + l0;
    const int s_offset = 8*im;
    const int y_offset = 128*im + l0;

    float tmp = 0; // partial sum for thread in warp

    // aux holds 16 unpacked bytes: d[0..7] are the low nibbles and m[0..7] the high nibbles
    // of the 8 scale bytes starting at s_offset.
    uint32_t aux[4];
    const uint8_t * d = (const uint8_t *)aux;
    const uint8_t * m = (const uint8_t *)(aux + 2);

    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

        const float   * y = yy + i * QK_K + y_offset;
        const uint8_t * q = x[i].qs + q_offset;

        const float dall = x[i].d;
        const float dmin = x[i].dmin;

        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
        aux[0] = a[0] & 0x0f0f0f0f;
        aux[1] = a[1] & 0x0f0f0f0f;
        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;

        float sum1 = 0, sum2 = 0;
        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];

        }
        tmp += dall * sum1 - dmin * sum2;

    }

    // Sum up the partial sums of the warp and write back the result.
    // No __syncthreads() here: the reduction only uses warp shuffles (no shared memory), and a block
    // barrier would be divergent when another warp of this block has already returned above.
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
    }

    if (tid == 0) {
        dst[row] = tmp;
    }
}
544
+
545
// Matrix-vector product with Q3_K-quantized weights: dst[row] = sum_j dequant(x[row][j]) * yy[j].
// One thread block handles one row (row == blockIdx.x). The final reduction uses 32-lane warp
// shuffles, so the kernel expects to be launched with 32 threads per block.
static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols) {

    const uint16_t hi_mask = 0x0303; // selects 2-bit groups of the upper scale bits
    const uint16_t lo_mask = 0x0f0f; // selects the lower 4 scale bits
    const int      n       = 2;      // values per thread and sub-block (inner loop length)

    const int row            = blockIdx.x;
    const int blocks_per_row = ncols / QK_K;

    const block_q3_K * blocks = (const block_q3_K *)vx + row*blocks_per_row;

    const int pair   = threadIdx.x/2;  // 0...15
    const int parity = threadIdx.x%2;  // 0, 1: first super-block handled by this thread

    const int half = pair/8;           // 0 or 1. 0 computes 0..., 1 computes 128...
    const int slot = pair - 8*half;    // 0...7

    const uint8_t hbit = 1 << (4*half);

    const int first    = n*slot;       // 0...14 in steps of 2
    const int q_offset = 32*half + first;
    const int y_offset = 128*half + first;

    const uint16_t scale_shift = 4*half;

    // eight 6-bit scales of the current half, reassembled from the packed scale bytes
    uint16_t packed[4];
    const int8_t * s = (const int8_t *)packed;

    float acc = 0; // partial sum for thread in warp

    for (int ib = parity; ib < blocks_per_row; ib += 2) {
        const block_q3_K & blk = blocks[ib];

        const float   * y = yy + ib * QK_K + y_offset;
        const uint8_t * q = blk.qs + q_offset;
        const uint8_t * h = blk.hmask + first;

        const uint16_t * a = (const uint16_t *)blk.scales;
        packed[0] = ((a[0] >> scale_shift) & lo_mask) | (((a[4] >> (scale_shift + 0)) & hi_mask) << 4);
        packed[1] = ((a[1] >> scale_shift) & lo_mask) | (((a[5] >> (scale_shift + 0)) & hi_mask) << 4);
        packed[2] = ((a[2] >> scale_shift) & lo_mask) | (((a[4] >> (scale_shift + 2)) & hi_mask) << 4);
        packed[3] = ((a[3] >> scale_shift) & lo_mask) | (((a[5] >> (scale_shift + 2)) & hi_mask) << 4);

        const float d = blk.d;

        float block_sum = 0;
        for (int l = 0; l < n; ++l) {
            block_sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (hbit << 0) ? 0 : 4))
                       + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (hbit << 1) ? 0 : 4))
                       + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (hbit << 2) ? 0 : 4))
                       + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (hbit << 3) ? 0 : 4));
            block_sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (hbit << 0) ? 0 : 4))
                       + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (hbit << 1) ? 0 : 4))
                       + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (hbit << 2) ? 0 : 4))
                       + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (hbit << 3) ? 0 : 4));
        }
        acc += d * block_sum;
    }

    // reduce the per-thread partial sums across the warp
    __syncthreads();
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        acc += __shfl_xor_sync(0xffffffff, acc, mask, 32);
    }

    // after the butterfly reduction every lane holds the full sum
    if (pair == 0) {
        dst[row] = acc;
    }
}
616
+
617
// Matrix-vector product with Q4_K-quantized weights: dst[row] = sum_j dequant(x[row][j]) * yy[j].
// One thread block handles one row (row == blockIdx.x). The final reduction uses 32-lane warp
// shuffles, so the kernel expects to be launched with 32 threads per block.
static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols) {

    const uint16_t mask6    = 0x3f3f;
    const uint16_t mask4    = 0x0f0f;
    const uint16_t mask_top = 0xc0c0;

    const int n = 4; // values per thread and sub-block (inner loop length)

    const int row            = blockIdx.x;
    const int blocks_per_row = ncols / QK_K;

    const block_q4_K * blocks = (const block_q4_K *)vx + row*blocks_per_row;

    const int pair   = threadIdx.x/2;  // 0...15
    const int parity = threadIdx.x%2;  // first super-block handled by this thread

    const int il = pair/4;             // 0...3
    const int ir = pair - 4*il;        // 0...3

    const int im = il/2;               // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
    const int in = il%2;

    const int first    = n*(2*ir + in);
    const int q_offset = 32*im + first;
    const int y_offset = 64*im + first;

    // unpacked scales (sc[0], sc[1], sc[4], sc[5]) and mins (sc[2], sc[3], sc[6], sc[7])
    uint16_t unpacked[4];
    const uint8_t * sc = (const uint8_t *)unpacked;

    float acc = 0; // partial sum for thread in warp

    for (int ib = parity; ib < blocks_per_row; ib += 2) {
        const block_q4_K & blk = blocks[ib];

        const uint8_t * q1 = blk.qs + q_offset;
        const uint8_t * q2 = q1 + 64;
        const float   * y1 = yy + ib*QK_K + y_offset;
        const float   * y2 = y1 + 128;

        const float dall = blk.d;
        const float dmin = blk.dmin;

        const uint16_t * a = (const uint16_t *)blk.scales;
        unpacked[0] = a[im+0] & mask6;
        unpacked[1] = a[im+2] & mask6;
        unpacked[2] = ((a[im+4] >> 0) & mask4) | ((a[im+0] & mask_top) >> 2);
        unpacked[3] = ((a[im+4] >> 4) & mask4) | ((a[im+2] & mask_top) >> 2);

        // four independent dot products (one per sub-block) plus the weighted sum for the mins
        float s0 = 0.f, s1 = 0.f, s2 = 0.f, s3 = 0.f;
        float smin = 0;
        for (int l = 0; l < n; ++l) {
            s0 += y1[l]    * (q1[l] & 0xF);
            s1 += y1[l+32] * (q1[l] >> 4);
            s2 += y2[l]    * (q2[l] & 0xF);
            s3 += y2[l+32] * (q2[l] >> 4);
            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
        }
        acc += dall * (s0 * sc[0] + s1 * sc[1] + s2 * sc[4] + s3 * sc[5]) - dmin * smin;
    }

    // reduce the per-thread partial sums across the warp
    __syncthreads();
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        acc += __shfl_xor_sync(0xffffffff, acc, mask, 32);
    }

    // after the butterfly reduction every lane holds the full sum
    if (pair == 0) {
        dst[row] = acc;
    }
}
686
+
687
// Matrix-vector product with Q5_K-quantized weights: dst[row] = sum_j dequant(x[row][j]) * yy[j].
// One thread block handles one row (row == blockIdx.x). The final reduction uses 32-lane warp
// shuffles, so the kernel expects to be launched with 32 threads per block.
static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {

    const uint16_t mask6    = 0x3f3f;
    const uint16_t mask4    = 0x0f0f;
    const uint16_t mask_top = 0xc0c0;

    const int n = 4; // values per thread and sub-block (inner loop length)

    const int row            = blockIdx.x;
    const int blocks_per_row = ncols / QK_K;

    const block_q5_K * blocks = (const block_q5_K *)vx + row*blocks_per_row;

    const int pair   = threadIdx.x/2;  // 0...15
    const int parity = threadIdx.x%2;  // first super-block handled by this thread

    const int il = pair/4;             // 0...3
    const int ir = pair - 4*il;        // 0...3

    const int im = il/2;               // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
    const int in = il%2;

    const int first    = n*(2*ir + in);
    const int q_offset = 32*im + first;
    const int y_offset = 64*im + first;

    // bits of qh that carry the 5th (high) bit for the sub-blocks read through y1 (hm1) and y2 (hm2)
    const uint8_t hm1 = 1 << (2*im);
    const uint8_t hm2 = hm1 << 4;

    // unpacked scales (sc[0], sc[1], sc[4], sc[5]) and mins (sc[2], sc[3], sc[6], sc[7])
    uint16_t unpacked[4];
    const uint8_t * sc = (const uint8_t *)unpacked;

    float acc = 0; // partial sum for thread in warp

    for (int ib = parity; ib < blocks_per_row; ib += 2) {
        const block_q5_K & blk = blocks[ib];

        const uint8_t * ql1 = blk.qs + q_offset;
        const uint8_t * ql2 = ql1 + 64;
        const uint8_t * qh  = blk.qh + first;
        const float   * y1  = yy + ib*QK_K + y_offset;
        const float   * y2  = y1 + 128;

        const float dall = blk.d;
        const float dmin = blk.dmin;

        const uint16_t * a = (const uint16_t *)blk.scales;
        unpacked[0] = a[im+0] & mask6;
        unpacked[1] = a[im+2] & mask6;
        unpacked[2] = ((a[im+4] >> 0) & mask4) | ((a[im+0] & mask_top) >> 2);
        unpacked[3] = ((a[im+4] >> 4) & mask4) | ((a[im+2] & mask_top) >> 2);

        // four independent dot products (one per sub-block) plus the weighted sum for the mins
        float s0 = 0.f, s1 = 0.f, s2 = 0.f, s3 = 0.f;
        float smin = 0;
        for (int l = 0; l < n; ++l) {
            s0 += y1[l+ 0] * ((ql1[l] & 0xF) + (qh[l] & (hm1 << 0) ? 16 : 0));
            s1 += y1[l+32] * ((ql1[l] >>  4) + (qh[l] & (hm1 << 1) ? 16 : 0));
            s2 += y2[l+ 0] * ((ql2[l] & 0xF) + (qh[l] & (hm2 << 0) ? 16 : 0));
            s3 += y2[l+32] * ((ql2[l] >>  4) + (qh[l] & (hm2 << 1) ? 16 : 0));
            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
        }
        acc += dall * (s0 * sc[0] + s1 * sc[1] + s2 * sc[4] + s3 * sc[5]) - dmin * smin;
    }

    // reduce the per-thread partial sums across the warp
    __syncthreads();
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        acc += __shfl_xor_sync(0xffffffff, acc, mask, 32);
    }

    // after the butterfly reduction every lane holds the full sum
    if (pair == 0) {
        dst[row] = acc;
    }
}
763
+
764
// Matrix-vector product with Q6_K-quantized weights: dst[row] = sum_j dequant(x[row][j]) * yy[j].
//
// vx    : block_q6_K blocks, ncols/QK_K consecutive blocks per row
// yy    : float vector with ncols entries
// dst   : float output, one entry per row
// ncols : number of columns of x
// nrows : number of rows of x; threads mapped to rows >= nrows do nothing
//
// Each row is reduced with 32-lane warp shuffles, so the kernel expects blockDim.x == 32
// (this is how dequantize_mul_mat_vec_q6_K_cuda launches it), with rows laid out along y.
static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {

    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

    const int row = blockIdx.y*blockDim.y + threadIdx.y;
    // The launcher rounds the grid up to a multiple of blockDim.y, so row == nrows is reachable
    // whenever nrows is not a multiple of blockDim.y. Valid rows are 0...nrows-1.
    if (row >= nrows) return;

    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const block_q6_K * x = (const block_q6_K *)vx + ib0;

    // With blockDim.x == 32:
    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1: first super-block handled by this thread

    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8

    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
    const int in = tid - step*im;                        // 0...15 or 0...7

#if K_QUANTS_PER_ITERATION == 1
    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
    const int is = 0;
#else
    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
    const int is = in / 4;                               // 0 or 1
#endif
    const int ql_offset = 64*im + l0;
    const int qh_offset = 32*im + l0;
    const int s_offset  =  8*im + is;
    const int y_offset = 128*im + l0;

    float tmp = 0; // partial sum for thread in warp

    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

        const float   * y  = yy + i * QK_K + y_offset;
        const uint8_t * ql = x[i].ql + ql_offset;
        const uint8_t * qh = x[i].qh + qh_offset;
        const int8_t  * s  = x[i].scales + s_offset;

        const float d = x[i].d;

        // Each 6-bit quant is assembled from a low nibble of ql and two bits of qh, then shifted by -32.
#if K_QUANTS_PER_ITERATION == 1
        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
        tmp += sum;
#else
        float sum = 0;
        for (int l = 0; l < 4; ++l) {
            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
        }
        tmp += sum;
#endif

    }

    // Sum up the partial sums of the warp and write back the result.
    // No __syncthreads() here: the reduction only uses warp shuffles (no shared memory), and a block
    // barrier would be divergent when another warp of this block has already returned above.
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
    }

    if (tid == 0) {
        dst[row] = tmp;
    }
}
841
 
842
  static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
 
866
  }
867
 
868
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
869
+ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols, const int nrows) {
870
  // qk = quantized weights per x block
871
  // qr = number of quantized weights per data value in x block
872
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
873
+
874
+ if (row >= nrows) {
875
+ return;
876
+ }
877
+
878
  const int tid = threadIdx.x;
879
 
880
  const int iter_stride = 2*GGML_CUDA_DMMV_X;
 
918
  }
919
  }
920
 
921
// Per-channel matrix-vector product of an f16 matrix x with an f32 vector y.
// x is read as x[row][channel][col] (transposed and permuted), y as y[channel][col],
// and the result is written to dst[channel][row].
// The threads along x split the columns of one row and are reduced with 32-lane warp shuffles.
static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
    const half * x = (const half *) vx;

    const int row_x   = blockDim.y*blockIdx.y + threadIdx.y;
    const int channel = blockDim.z*blockIdx.z + threadIdx.z;

    // y has one entry per column of x; dst has one entry per row of x
    const int nrows_y   = ncols_x;
    const int nrows_dst = nrows_x;

    float partial = 0.0f;

    // each thread visits columns threadIdx.x, threadIdx.x + blockDim.x, ...
    for (int col_x = threadIdx.x; col_x < ncols_x; col_x += blockDim.x) {
        // x is transposed and permuted
        const int   ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x;
        const float xi = __half2float(x[ix]);

        // y is not transposed but permuted
        const int iy = channel*nrows_y + col_x;

        partial += xi * y[iy];
    }

    // combine the partial sums of the warp
    __syncthreads();
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        partial += __shfl_xor_sync(0xffffffff, partial, mask, 32);
    }

    if (threadIdx.x != 0) {
        return;
    }

    // dst is not transposed and not permuted
    dst[channel*nrows_dst + row_x] = partial;
}
967
+
968
+ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
969
+ const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
970
+ const int row_stride_x, const int nchannels_x, const int channel_stride_x) {
971
+
972
+ const half * x = (half *) vx;
973
+
974
+ const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
975
+ const int channel = blockDim.z*blockIdx.z + threadIdx.z;
976
+
977
+ const int nrows_y = ncols_x;
978
+ const int nrows_dst = nrows_x;
979
+ const int row_dst = row_x;
980
 
981
+ const int idst = channel*nrows_dst + row_dst;
982
+
983
+ float tmp = 0.0f;
984
+
985
+ for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
986
+ const int col_x = col_x0 + threadIdx.x;
987
+
988
+ if (col_x >= ncols_x) {
989
+ break;
990
+ }
991
+
992
+ const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x;
993
+ const float xi = __half2float(x[ix]);
994
+
995
+ const int row_y = col_x;
996
+
997
+ const int iy = channel*nrows_y + row_y;
998
+
999
+ tmp += xi * y[iy];
1000
  }
1001
 
1002
  // sum up partial sums and write back result
 
1006
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
1007
  }
1008
 
1009
+ if (threadIdx.x == 0) {
1010
+ dst[idst] = tmp;
1011
+ }
1012
+ }
1013
+
1014
// Copies one float from the address cxi to the address cdsti.
static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
    const float value = *(const float *) cxi;
    *(float *) cdsti = value;
}
1020
+
1021
// Reads one float from cxi, converts it to half precision and stores it at cdsti.
static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
    const float value = *(const float *) cxi;
    *(half *) cdsti = __float2half(value);
}
1027
+
1028
// Element-wise strided copy: thread i copies element i of the flattened source tensor to
// element i of the flattened destination tensor using the per-element routine cpy_1.
// ne is the total element count; ne0x/ne1x are the extents of the two innermost dimensions
// and nb0x/nb1x/nb2x the byte strides of source (x = 0) and destination (x = 1).
template <cpy_kernel_t cpy_1>
static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
                                   const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
                                   const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= ne) {
        return;
    }

    // source: split the flat index into (i02, i01, i00) and turn it into a byte offset
    const int i02      = i / (ne00*ne01);
    const int rem_src  = i - i02*ne01*ne00;
    const int i01      = rem_src / ne00;
    const int i00      = rem_src - i01*ne00;
    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;

    // destination: same decomposition with the destination extents and strides
    const int i12        = i / (ne10*ne11);
    const int rem_dst    = i - i12*ne10*ne11;
    const int i11        = rem_dst / ne10;
    const int i10        = rem_dst - i11*ne10;
    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;

    cpy_1(cx + x_offset, cdst + dst_offset);
}
1052
 
1053
+ // rope == RoPE == rotary positional embedding
1054
  static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
1055
  const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
1056
 
 
1072
  dst[i + 1] = x0*sin_theta + x1*cos_theta;
1073
  }
1074
 
1075
// Causal mask: copies x to dst and subtracts INT_MAX from every element whose column index is
// greater than n_past + (row % rows_per_channel). Columns run along x, rows along y.
static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
    const int col = blockDim.x*blockIdx.x + threadIdx.x;
    const int row = blockDim.y*blockIdx.y + threadIdx.y;

    if (col < ncols) {
        const int i = row*ncols + col;

        // Branch-free stand-in for "col > n_past + row ? -INFINITY : x[i]":
        // equivalent within rounding error but slightly faster on GPU.
        const int masked = col > n_past + row % rows_per_channel;
        dst[i] = x[i] - masked * INT_MAX;
    }
}
1087
+
1088
// Row-wise soft max: dst[row][col] = exp(x[row][col]) / sum_c exp(x[row][c]).
// This differs from the CPU implementation: floats are used instead of doubles, and the
// values are not normalized by subtracting the row maximum inside the exponential.
// In theory this can cause problems with rounding error and arithmetic overflow,
// but for LLaMa it seems to be fine.
// The threads along x split the columns of one row and are reduced with 32-lane warp shuffles.
static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
    const int row    = blockDim.y*blockIdx.y + threadIdx.y;
    const int stride = blockDim.x;
    const int lane   = threadIdx.x;

    float denom = 0.0;

    // pass 1: store the exponentials and accumulate this thread's share of the row sum
    for (int col = lane; col < ncols; col += stride) {
        const int   i = row*ncols + col;
        const float e = expf(x[i]);
        denom += e;
        dst[i] = e;
    }

    // combine the partial sums of the warp
    __syncthreads();
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        denom += __shfl_xor_sync(0xffffffff, denom, mask, 32);
    }

    // pass 2: normalize the entries this thread wrote in pass 1
    for (int col = lane; col < ncols; col += stride) {
        dst[row*ncols + col] /= denom;
    }
}
1130
+
1131
// Element-wise scaling: dst[i] = scale * x[i] for the first k elements.
static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    // threads past the end of the array have nothing to do
    if (i < k) {
        dst[i] = scale * x[i];
    }
}
1140
+
1141
  static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
1142
  const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
1143
  add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 
1211
 
1212
// Launches dequantize_mul_mat_vec for Q4_0 weights on the given stream.
// Blocks hold GGML_CUDA_DMMV_Y rows of WARP_SIZE threads each; the grid is rounded up along y to cover nrows.
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);

    const dim3 block(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
    const dim3 grid(1, (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y, 1);

    dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0><<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
1220
 
1221
// Launches dequantize_mul_mat_vec for Q4_1 weights on the given stream.
// Blocks hold GGML_CUDA_DMMV_Y rows of WARP_SIZE threads each; the grid is rounded up along y to cover nrows.
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);

    const dim3 block(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
    const dim3 grid(1, (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y, 1);

    dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1><<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
1229
 
1230
// Launches dequantize_mul_mat_vec for Q5_0 weights on the given stream.
// Blocks hold GGML_CUDA_DMMV_Y rows of WARP_SIZE threads each; the grid is rounded up along y to cover nrows.
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);

    const dim3 block(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
    const dim3 grid(1, (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y, 1);

    dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0><<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
1238
 
1239
// Launches dequantize_mul_mat_vec for Q5_1 weights on the given stream.
// Blocks hold GGML_CUDA_DMMV_Y rows of WARP_SIZE threads each; the grid is rounded up along y to cover nrows.
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);

    const dim3 block(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
    const dim3 grid(1, (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y, 1);

    dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1><<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
1247
 
1248
// Launches dequantize_mul_mat_vec for Q8_0 weights on the given stream.
// Blocks hold GGML_CUDA_DMMV_Y rows of WARP_SIZE threads each; the grid is rounded up along y to cover nrows.
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);

    const dim3 block(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
    const dim3 grid(1, (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y, 1);

    dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0><<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
1256
 
1257
// Launches dequantize_mul_mat_vec_q2_k on the given stream.
// Each block holds two rows of 32 threads; the grid is rounded up along y to cover nrows.
static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);

    const int rows_per_block = 2;

    const dim3 block(32, rows_per_block, 1);
    const dim3 grid(1, (nrows + rows_per_block - 1) / rows_per_block, 1);

    dequantize_mul_mat_vec_q2_k<<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
1265
 
1266
// Launches dequantize_mul_mat_vec_q3_k on the given stream: one block of 32 threads per row.
static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);

    const dim3 grid(nrows, 1, 1);
    const dim3 block(32, 1, 1);

    dequantize_mul_mat_vec_q3_k<<<grid, block, 0, stream>>>(vx, y, dst, ncols);
}
1271
 
1272
// Launches dequantize_mul_mat_vec_q4_k on the given stream: one block of 32 threads per row.
static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);

    const dim3 grid(nrows, 1, 1);
    const dim3 block(32, 1, 1);

    dequantize_mul_mat_vec_q4_k<<<grid, block, 0, stream>>>(vx, y, dst, ncols);
}
1277
 
1278
// Launches dequantize_mul_mat_vec_q5_k on the given stream: one block of 32 threads per row.
static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);

    const dim3 grid(nrows, 1, 1);
    const dim3 block(32, 1, 1);

    dequantize_mul_mat_vec_q5_k<<<grid, block, 0, stream>>>(vx, y, dst, ncols);
}
1283
 
1284
// Launches dequantize_mul_mat_vec_q6_k on the given stream.
// Each block holds 2 / K_QUANTS_PER_ITERATION rows of 32 threads; the grid is rounded up along y to cover nrows.
static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);

    const int rows_per_block = 2 / K_QUANTS_PER_ITERATION;

    const dim3 block(32, rows_per_block, 1);
    const dim3 grid(1, (nrows + rows_per_block - 1) / rows_per_block, 1);

    dequantize_mul_mat_vec_q6_k<<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
1292
 
1293
  static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
 
1297
 
1298
// Launches dequantize_mul_mat_vec with the f16 conversion routine (convert_f16) on the given stream.
// Blocks hold GGML_CUDA_DMMV_Y rows of WARP_SIZE threads each; the grid is rounded up along y to cover nrows.
static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);

    const dim3 block(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
    const dim3 grid(1, (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y, 1);

    dequantize_mul_mat_vec<1, 1, convert_f16><<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
}
1306
 
1307
  static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
 
1333
  }
1334
  }
1335
 
1336
// Launch mul_mat_p021_f16_f32 on `stream` with a (1, nrows_x, nchannels_x) grid
// of WARP_SIZE-thread blocks.
static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) {
    const dim3 threads(WARP_SIZE, 1, 1);
    const dim3 grid(1, nrows_x, nchannels_x);
    mul_mat_p021_f16_f32<<<grid, threads, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
}
1341
+
1342
// Launch mul_mat_vec_nc_f16_f32 on `stream` with a (1, nrows_x, nchannels_x) grid
// of WARP_SIZE-thread blocks. The row and channel strides of x are forwarded
// to the kernel unchanged.
static void ggml_mul_mat_vec_nc_f16_f32_cuda(
    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
    const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {

    const dim3 threads(WARP_SIZE, 1, 1);
    const dim3 grid(1, nrows_x, nchannels_x);
    mul_mat_vec_nc_f16_f32<<<grid, threads, 0, stream>>>
        (vx, y, dst, ncols_x, nrows_x, row_stride_x, nchannels_x, channel_stride_x);
}
1351
+
1352
// Launch the cpy_f32_f16 kernel template instantiated with cpy_1_f32_f32 on `stream`.
// The grid has ne / CUDA_CPY_BLOCK_SIZE blocks (rounded up) of
// CUDA_CPY_BLOCK_SIZE threads each.
static void ggml_cpy_f32_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {

    const int grid_size = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_f32><<<grid_size, CUDA_CPY_BLOCK_SIZE, 0, stream>>>(
        cx, cdst, ne,
        ne00, ne01, nb00, nb01, nb02,
        ne10, ne11, nb10, nb11, nb12);
}
1361
+
1362
// Launch the cpy_f32_f16 kernel template instantiated with cpy_1_f32_f16 on `stream`.
// The grid has ne / CUDA_CPY_BLOCK_SIZE blocks (rounded up) of
// CUDA_CPY_BLOCK_SIZE threads each.
static void ggml_cpy_f32_f16_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {

    const int grid_size = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_f16><<<grid_size, CUDA_CPY_BLOCK_SIZE, 0, stream>>>(
        cx, cdst, ne,
        ne00, ne01, nb00, nb01, nb02,
        ne10, ne11, nb10, nb11, nb12);
}
1371
+
1372
// Launch scale_f32 over k elements on `stream`, using k / CUDA_SCALE_BLOCK_SIZE
// blocks (rounded up) of CUDA_SCALE_BLOCK_SIZE threads.
static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
    const int grid_size = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
    scale_f32<<<grid_size, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
}
1376
+
1377
  static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) {
1378
  GGML_ASSERT(nrows % 2 == 0);
1379
  const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
 
1382
  rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
1383
  }
1384
 
1385
// Launch diag_mask_inf_f32 on `stream`. The grid's x dimension covers ncols_x in
// chunks of CUDA_DIAG_MASK_INF_BLOCK_SIZE (rounded up); its y dimension is nrows_x.
static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
    const int grid_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
    const dim3 grid(grid_x, nrows_x, 1);
    const dim3 threads(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
    diag_mask_inf_f32<<<grid, threads, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
}
1391
+
1392
// Launch soft_max_f32 on `stream` with a (1, nrows_x, 1) grid of
// WARP_SIZE-thread blocks.
static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
    const dim3 grid(1, nrows_x, 1);
    const dim3 threads(WARP_SIZE, 1, 1);
    soft_max_f32<<<grid, threads, 0, stream>>>(x, dst, ncols_x);
}
1397
+
1398
  // buffer pool for cuda
1399
  #define MAX_CUDA_BUFFERS 256
1400
 
 
1565
  CUDA_CHECK(cudaFreeHost(ptr));
1566
  }
1567
 
1568
+ static cudaError_t ggml_cuda_cpy_tensor_2d(
1569
  void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
1570
 
1571
+ cudaMemcpyKind kind;
1572
+ char * src_ptr;
1573
+ if (src->backend == GGML_BACKEND_CPU) {
1574
+ kind = cudaMemcpyHostToDevice;
1575
+ src_ptr = (char *) src->data;
1576
+ } else if (src->backend == GGML_BACKEND_GPU) {
1577
+ kind = cudaMemcpyDeviceToDevice;
1578
+ struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
1579
+ int id;
1580
+ CUDA_CHECK(cudaGetDevice(&id));
1581
+ src_ptr = (char *) extra->data_device[id];
1582
+ } else {
1583
+ GGML_ASSERT(false);
1584
+ }
1585
+ char * dst_ptr = (char *) dst;
1586
+
1587
  const int64_t ne0 = src->ne[0];
1588
  const int64_t nb0 = src->nb[0];
1589
  const int64_t nb1 = src->nb[1];
 
1594
  const int64_t bs = ggml_blck_size(type);
1595
  int64_t i1_diff = i1_high - i1_low;
1596
 
1597
+ const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
1598
  if (nb0 == ts && nb1 == ts*ne0/bs) {
1599
+ return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
1600
  } else if (nb0 == ts) {
1601
+ return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
1602
  } else {
1603
  for (int64_t i1 = 0; i1 < i1_diff; i1++) {
1604
  const void * rx = (const void *) ((const char *) x + i1*nb1);
1605
+ void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
1606
  // pretend the row is a matrix with cols=1
1607
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
1608
  if (r != cudaSuccess) return r;
1609
  }
1610
  return cudaSuccess;
 
1840
  (void) i1;
1841
  }
1842
 
1843
// Op callback handed to ggml_cuda_op for GGML_OP_DIAG_MASK_INF.
// Runs diag_mask_inf_f32_cuda on rows [i01_low, i01_high) of src0 (as f32 device
// data) and writes to dst_ddf_i. n_past is read on the host from src1->data.
inline void ggml_cuda_op_diag_mask_inf(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
    cudaStream_t & cudaStream_main){

    // part of the common op signature, unused by this op
    (void) dst;
    (void) src0_ddq_i;
    (void) src1_ddf_i;
    (void) i02;
    (void) i1;

    GGML_ASSERT(src0_ddf_i != nullptr);
    GGML_ASSERT(dst_ddf_i != nullptr);

    const int64_t ncols            = src0->ne[0];
    const int64_t rows_per_channel = src0->ne[1];
    const int64_t nrows            = i01_high - i01_low;

    // first int32 stored in src1 is the n_past parameter
    const int n_past = ((int32_t *) src1->data)[0];

    diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ncols, nrows, rows_per_channel, n_past, cudaStream_main);
    CUDA_CHECK(cudaGetLastError());
}
1867
+
1868
// Op callback handed to ggml_cuda_op for GGML_OP_SOFT_MAX.
// Runs soft_max_f32_cuda on rows [i01_low, i01_high) of the f32 device data in
// src0_ddf_i and writes to dst_ddf_i.
inline void ggml_cuda_op_soft_max(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
    cudaStream_t & cudaStream_main){

    // part of the common op signature, unused by this op
    (void) src1;
    (void) dst;
    (void) src0_ddq_i;
    (void) src1_ddf_i;
    (void) i02;
    (void) i1;

    GGML_ASSERT(src0_ddf_i != nullptr);
    GGML_ASSERT(dst_ddf_i != nullptr);

    const int64_t ncols = src0->ne[0];
    const int64_t nrows = i01_high - i01_low;

    soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ncols, nrows, cudaStream_main);
    CUDA_CHECK(cudaGetLastError());
}
1890
+
1891
// Op callback handed to ggml_cuda_op for GGML_OP_SCALE.
// Multiplier is read on the host from the first float in src1->data; the kernel
// is run over ne00 * (i01_high - i01_low) elements of src0_ddf_i into dst_ddf_i.
inline void ggml_cuda_op_scale(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
    cudaStream_t & cudaStream_main){

    // part of the common op signature, unused by this op
    (void) dst;
    (void) src0_ddq_i;
    (void) src1_ddf_i;
    (void) i02;
    (void) i1;

    GGML_ASSERT(src0_ddf_i != nullptr);
    GGML_ASSERT(dst_ddf_i != nullptr);

    const float scale = ((float *) src1->data)[0];

    const int64_t ncols      = src0->ne[0];
    const int64_t nrows      = i01_high - i01_low;
    const int64_t n_elements = ncols*nrows;

    scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, n_elements, cudaStream_main);
    CUDA_CHECK(cudaGetLastError());
}
1915
+
1916
  static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
1917
+ ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
1918
  const int64_t ne00 = src0->ne[0];
1919
  const int64_t ne01 = src0->ne[1];
1920
  const int64_t ne02 = src0->ne[2];
 
1937
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
1938
 
1939
  // strides for iteration over dims 3 and 2
1940
+ const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03;
1941
+ const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1;
1942
+ const int64_t src0_stride = ne00 * ne01 * stride_mod;
1943
+ const int64_t src1_stride = ne10 * ne11 * stride_mod;
1944
+ const int64_t dst_stride = ne0 * ne1 * stride_mod;
1945
 
1946
  const size_t src0_ts = ggml_type_size(src0->type);
1947
  const size_t src0_bs = ggml_blck_size(src0->type);
1948
 
1949
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
1950
  struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
1951
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
1952
 
1953
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
1954
+ const bool src0_is_contiguous = ggml_is_contiguous(src0);
1955
  const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
1956
 
1957
+ const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
1958
+ const bool src1_stays_on_host = use_src1 && (
1959
+ dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
1960
+
1961
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
1962
 
1963
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
 
1966
  char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
1967
  float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
1968
  float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
1969
+ float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
1970
 
1971
  // asq = actual size quantized, asf = actual size float
1972
  size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
1973
  size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
1974
  size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
1975
+ size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
1976
 
1977
  for (int id = 0; id < g_device_count; ++id) {
1978
  if (!split && id != g_main_device) {
 
1985
  int64_t row_low, row_high;
1986
  if (split) {
1987
  row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
 
1988
  row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
 
1989
  } else {
1990
  row_low = 0;
1991
  row_high = nrows0;
 
1998
 
1999
  cudaSetDevice(id);
2000
 
2001
+ if (src0_on_device && src0_is_contiguous) {
2002
  if (src0_is_f32) {
2003
  src0_ddf[id] = (float *) src0_extra->data_device[id];
2004
  } else {
 
2016
  src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
2017
  }
2018
 
2019
+ if (use_src1 && !src1_stays_on_host) {
2020
+ if (src1_on_device && src1_is_contiguous) {
2021
  src1_ddf[id] = (float *) src1_extra->data_device[id];
2022
  } else {
2023
  src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
 
2030
  dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
2031
  }
2032
 
2033
+ const int64_t i03_max = flatten_rows ? 1 : ne03;
2034
+ const int64_t i02_max = flatten_rows ? 1 : ne02;
2035
+ const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
2036
+
2037
+ for (int64_t i03 = 0; i03 < i03_max; i03++) {
2038
  const int64_t i13 = i03 % ne13;
2039
+ for (int64_t i02 = 0; i02 < i02_max; i02++) {
2040
  const int64_t i12 = i02 % ne12;
2041
 
2042
  const int64_t i0 = i03*ne02 + i02;
2043
+
2044
+ // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
2045
+ const int64_t i0_offset_low = row_low/rows_per_iter;
2046
+ const int64_t i0_offset_high = row_high/rows_per_iter;
2047
 
2048
  int64_t i01_low = 0;
2049
+ int64_t i01_high = rows_per_iter;
2050
  if (split) {
2051
  if (i0 < i0_offset_low || i0 > i0_offset_high) {
2052
  continue;
2053
  }
2054
  if (i0 == i0_offset_low) {
2055
+ i01_low = row_low % rows_per_iter;
2056
  }
2057
  if (i0 == i0_offset_high) {
2058
+ i01_high = row_high % rows_per_iter;
2059
  }
2060
  }
2061
 
 
2064
  // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
2065
  // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
2066
  GGML_ASSERT(i01_low == 0 || g_device_count > 1);
2067
+ GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
2068
 
2069
  const int64_t i01_diff = i01_high - i01_low;
2070
  if (i01_diff == 0) {
 
2072
  }
2073
  const int64_t i11 = i13*ne12 + i12;
2074
 
2075
+ cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
2076
  cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
2077
+ cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
2078
 
2079
  // for split tensors the data begins at i0 == i0_offset_low
2080
  char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
2081
  float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
2082
  float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
2083
+ float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
2084
 
2085
  // for split tensors the data pointer needs to be rounded down
2086
  // to the bin edge for i03, i02 bins beyond the first
2087
  if (i0 - i0_offset_low > 0) {
2088
+ GGML_ASSERT(!flatten_rows);
2089
  src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
2090
  src0_ddf_i -= (row_low % ne01)*ne00;
2091
+ dst_ddf_i -= (row_low % ne0)*ne1;
 
 
2092
  }
2093
 
2094
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
 
2098
  }
2099
 
2100
  // copy src0, src1 to device if necessary
2101
+ if (use_src1 && !src1_stays_on_host) {
2102
  if (src1->backend == GGML_BACKEND_CPU) {
2103
+ GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
2104
+ int64_t nrows1 = flatten_rows ? nrows0 : ne11;
2105
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_memcpy_src1));
2106
+ } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
2107
  if (id != g_main_device) {
2108
+ GGML_ASSERT(!flatten_rows);
2109
  float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
2110
  src1_ddf_i_source += i11*src1_stride;
2111
  CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
2112
  cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
2113
  }
2114
+ } else if (src1_on_device && !src1_is_contiguous) {
2115
+ GGML_ASSERT(!split);
2116
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
2117
  } else {
2118
  GGML_ASSERT(false);
2119
  }
2120
  }
2121
  CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
2122
+
2123
+ if (!src0_on_device || !src0_is_contiguous) {
2124
  if (src0_is_f32) {
2125
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
2126
  } else {
2127
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
2128
  }
2129
  }
2130
 
2131
+ // convert src0 to f32 if it is necessary for the ggml_cuda_op
2132
  if (src0_needs_f32 && !src0_is_f32) {
2133
  to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
2134
  CUDA_CHECK(cudaGetLastError());
 
2193
 
2194
  void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2195
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2196
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
2197
  }
2198
 
2199
  void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2200
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2201
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
2202
  }
2203
 
2204
  void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2205
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2206
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
2207
  }
2208
 
2209
  void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2210
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2211
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
2212
  }
2213
 
2214
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
 
2215
  const int64_t ne10 = src1->ne[0];
2216
 
2217
  const int64_t ne0 = dst->ne[0];
2218
  const int64_t ne1 = dst->ne[1];
2219
 
 
 
 
 
 
 
 
 
2220
  // TODO: find the optimal values for these
2221
  if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
2222
  src1->type == GGML_TYPE_F32 &&
 
2228
  return false;
2229
  }
2230
 
2231
// Matrix-vector product for permuted operands, run entirely on the main device.
//
// Requirements (asserted): src0 and src1 are both permuted, src0 is not split
// across devices, src0 is F16 and src1 is F32. All three tensors are read
// through their ggml_tensor_extra_gpu device pointers for g_main_device.
//
// The call blocks until the device is idle (cudaDeviceSynchronize).
void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];

    CUDA_CHECK(cudaSetDevice(g_main_device));
    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];

    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    void * src0_ddq = src0_extra->data_device[g_main_device];

    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
    // surface launch failures (e.g. an invalid grid configuration) right here,
    // matching the ggml_cuda_op_* callbacks in this file
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaDeviceSynchronize());
}
2259
+
2260
// Matrix-vector product for a non-contiguous (but not permuted) F16 src0 and a
// contiguous F32 src1, run entirely on the main device.
//
// Row and channel strides of src0 are derived from its byte strides nb[1] and
// nb[2] divided by sizeof(half) and passed to the kernel as element strides.
//
// The call blocks until the device is idle (cudaDeviceSynchronize).
void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
    GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
    GGML_ASSERT(!ggml_is_permuted(src0));
    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];

    const int64_t nb01 = src0->nb[1];
    const int64_t nb02 = src0->nb[2];

    CUDA_CHECK(cudaSetDevice(g_main_device));
    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];

    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    void * src0_ddq = src0_extra->data_device[g_main_device];

    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

    // byte strides -> strides in units of half elements
    const int row_stride_x     = nb01 / sizeof(half);
    const int channel_stride_x = nb02 / sizeof(half);

    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
    // surface launch failures (e.g. an invalid grid configuration) right here,
    // matching the ggml_cuda_op_* callbacks in this file
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaDeviceSynchronize());
}
2293
+
2294
  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2295
+ bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
2296
+ src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
2297
+
2298
+ if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
2299
+ ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
2300
+ } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
2301
+ ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
2302
+ }else if (src0->type == GGML_TYPE_F32) {
2303
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
2304
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
2305
+ if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
2306
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
2307
  } else {
2308
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
2309
  }
2310
  } else {
2311
  GGML_ASSERT(false);
2312
  }
2313
  }
2314
 
2315
// Scale entry point: src0 and dst must be F32.
// Delegates to ggml_cuda_op with ggml_cuda_op_scale.
void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
    const bool src0_needs_f32 = true;
    const bool flatten_rows   = true;
    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, src0_needs_f32, flatten_rows);
}
2319
+
2320
// Copy the contents of src0 into the device buffer of src1 on the main device.
//
// Both tensors must already live on the GPU, hold the same number of elements,
// have ne[3] == 1 and be at most INT_MAX bytes (the kernels take int sizes and
// strides). Supported type pairs: F32 -> F32 and F32 -> F16; anything else is
// a hard error. `dst` is not used.
//
// The call blocks until the device is idle (cudaDeviceSynchronize).
void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const int64_t ne = ggml_nelements(src0);
    GGML_ASSERT(ne == ggml_nelements(src1));

    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);

    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    GGML_ASSERT(src0->ne[3] == 1);

    const int64_t nb00 = src0->nb[0];
    const int64_t nb01 = src0->nb[1];
    const int64_t nb02 = src0->nb[2];

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    GGML_ASSERT(src1->ne[3] == 1);

    const int64_t nb10 = src1->nb[0];
    const int64_t nb11 = src1->nb[1];
    const int64_t nb12 = src1->nb[2];

    CUDA_CHECK(cudaSetDevice(g_main_device));
    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];

    const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;

    char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
    char * src1_ddc = (char *) src1_extra->data_device[g_main_device];

    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
                              ne10, ne11, nb10, nb11, nb12, cudaStream_main);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
        ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
                              ne10, ne11, nb10, nb11, nb12, cudaStream_main);
    } else {
        GGML_ASSERT(false);
    }
    // surface launch failures (e.g. an invalid grid configuration) right here,
    // matching the ggml_cuda_op_* callbacks in this file
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaDeviceSynchronize());

    (void) dst;
}
2369
+
2370
// Diagonal-mask entry point: src0 and dst must be F32.
// Delegates to ggml_cuda_op with ggml_cuda_op_diag_mask_inf.
void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
    const bool src0_needs_f32 = true;
    const bool flatten_rows   = true;
    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, src0_needs_f32, flatten_rows);
}
2374
+
2375
// Softmax entry point: src0 and dst must be F32.
// Delegates to ggml_cuda_op with ggml_cuda_op_soft_max.
void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
    const bool src0_needs_f32 = true;
    const bool flatten_rows   = true;
    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, src0_needs_f32, flatten_rows);
}
2379
+
2380
  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2381
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2382
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, false); // FIXME flatten changes results
2383
  }
2384
 
2385
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
2393
  const size_t nb1 = tensor->nb[1];
2394
  ggml_backend backend = tensor->backend;
2395
  struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
2396
+ memset(extra, 0, sizeof(*extra));
2397
 
2398
  for (int id = 0; id < g_device_count; ++id) {
 
 
2399
  if (backend == GGML_BACKEND_GPU && id != g_main_device) {
2400
  continue;
2401
  }
 
2408
  row_high = nrows;
2409
  } else if (backend == GGML_BACKEND_GPU_SPLIT) {
2410
  row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
 
2411
  row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
 
 
2412
  } else {
2413
  GGML_ASSERT(false);
2414
  }
 
2452
  delete extra;
2453
  }
2454
 
2455
// Mark `tensor` as living on the GPU and give it a device data pointer on the
// main device.
//
// scratch == true : place the data in the shared scratch buffer (no-op when
//                   g_scratch_size is 0).
// scratch == false: give the tensor its own zero-filled cudaMalloc allocation.
//
// Where the pointer comes from, in priority order:
//   1. in-place ops / views whose src0 is on the GPU reuse src0's buffer
//      (views add the byte offset stored in opt[0]);
//   2. GGML_OP_CPY reuses src1's buffer;
//   3. the scratch buffer, wrapping to offset 0 when the tensor would not fit;
//   4. a fresh allocation.
void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
    if (scratch && g_scratch_size == 0) {
        return;
    }

    // recursively assign CUDA buffers until a compute tensor is found
    if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
        const ggml_op src0_op = tensor->src0->op;
        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
            ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
        }
    }
    if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
        ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
    }

    tensor->backend = GGML_BACKEND_GPU;
    struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
    // Zero the whole struct so data_device[] entries for devices other than the
    // main one are null instead of indeterminate (same as ggml_cuda_transform_tensor).
    memset(extra, 0, sizeof(*extra));

    const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
        tensor->op == GGML_OP_VIEW;
    const size_t size = ggml_nbytes(tensor);

    CUDA_CHECK(cudaSetDevice(g_main_device));
    if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
        size_t offset = 0;
        if (tensor->op == GGML_OP_VIEW) {
            // the view's byte offset into src0 is stored in opt[0]
            memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
        }
        extra->data_device[g_main_device] = src0_ddc + offset;
    } else if (tensor->op == GGML_OP_CPY) {
        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
        void * src1_ddv = src1_extra->data_device[g_main_device];
        extra->data_device[g_main_device] = src1_ddv;
    } else if (scratch) {
        GGML_ASSERT(size <= g_scratch_size);
        if (g_scratch_offset + size > g_scratch_size) {
            // not enough room left: start over at the beginning of the scratch buffer
            g_scratch_offset = 0;
        }

        char * data = (char *) g_scratch_buffer;
        if (data == nullptr) {
            // scratch buffer is allocated lazily on first use
            CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
            g_scratch_buffer = data;
        }
        extra->data_device[g_main_device] = data + g_scratch_offset;

        g_scratch_offset += size;

        GGML_ASSERT(g_scratch_offset <= g_scratch_size);
    } else { // allocate new buffers outside of scratch
        void * data;
        CUDA_CHECK(cudaMalloc(&data, size));
        CUDA_CHECK(cudaMemset(data, 0, size));
        extra->data_device[g_main_device] = data;
    }

    tensor->extra = extra;
}
2516
 
2517
// Assign GPU buffers to `tensor`, placing its data in the scratch buffer.
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
    const bool use_scratch = true;
    ggml_cuda_assign_buffers_impl(tensor, use_scratch);
}
2520
+
2521
// Assign GPU buffers to `tensor` without using the scratch buffer.
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
    const bool use_scratch = false;
    ggml_cuda_assign_buffers_impl(tensor, use_scratch);
}
2524
+
2525
  void ggml_cuda_set_main_device(int main_device) {
2526
+ if (main_device >= g_device_count) {
2527
  fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
2528
  main_device, g_device_count, g_main_device);
2529
  return;
 
2540
  g_scratch_size = scratch_size;
2541
  }
2542
 
2543
// Release the scratch buffer if one has been allocated and clear the pointer.
// Does nothing when g_scratch_buffer is already null.
void ggml_cuda_free_scratch() {
    if (g_scratch_buffer != nullptr) {
        CUDA_CHECK(cudaFree(g_scratch_buffer));
        g_scratch_buffer = nullptr;
    }
}
2551
+
2552
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
2553
  ggml_cuda_func_t func;
2554
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
 
2586
  }
2587
  func = ggml_cuda_mul_mat;
2588
  break;
2589
+ case GGML_OP_SCALE:
2590
+ if (!any_on_device) {
2591
+ return false;
2592
+ }
2593
+ func = ggml_cuda_scale;
2594
+ break;
2595
+ case GGML_OP_CPY:
2596
+ if (!any_on_device) {
2597
+ return false;
2598
+ }
2599
+ func = ggml_cuda_cpy;
2600
+ break;
2601
  case GGML_OP_RESHAPE:
2602
+ case GGML_OP_VIEW:
2603
+ case GGML_OP_PERMUTE:
2604
+ case GGML_OP_TRANSPOSE:
2605
  if (!any_on_device) {
2606
  return false;
2607
  }
2608
  func = ggml_cuda_nop;
2609
  break;
2610
+ case GGML_OP_DIAG_MASK_INF:
2611
+ if (!any_on_device) {
2612
+ return false;
2613
+ }
2614
+ func = ggml_cuda_diag_mask_inf;
2615
+ break;
2616
+ case GGML_OP_SOFT_MAX:
2617
+ if (!any_on_device) {
2618
+ return false;
2619
+ }
2620
+ func = ggml_cuda_soft_max;
2621
+ break;
2622
  case GGML_OP_ROPE:
2623
  if (!any_on_device) {
2624
  return false;
ggml-cuda.h CHANGED
@@ -28,8 +28,10 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
28
 
29
  void ggml_cuda_free_data(struct ggml_tensor * tensor);
30
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 
31
  void ggml_cuda_set_main_device(int main_device);
32
  void ggml_cuda_set_scratch_size(size_t scratch_size);
 
33
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
34
 
35
  #ifdef __cplusplus
 
28
 
29
  void ggml_cuda_free_data(struct ggml_tensor * tensor);
30
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
31
+ void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
32
  void ggml_cuda_set_main_device(int main_device);
33
  void ggml_cuda_set_scratch_size(size_t scratch_size);
34
+ void ggml_cuda_free_scratch(void);
35
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
36
 
37
  #ifdef __cplusplus
ggml-metal.h CHANGED
@@ -55,6 +55,7 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
55
  void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
56
 
57
  // same as ggml_graph_compute but uses Metal
 
58
  void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
59
 
60
  #ifdef __cplusplus
 
55
  void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
56
 
57
  // same as ggml_graph_compute but uses Metal
58
+ // creates gf->n_threads command buffers in parallel
59
  void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
60
 
61
  #ifdef __cplusplus
ggml-metal.m CHANGED
@@ -57,6 +57,7 @@ struct ggml_metal_context {
57
  GGML_METAL_DECL_KERNEL(get_rows_q5_k);
58
  GGML_METAL_DECL_KERNEL(get_rows_q6_k);
59
  GGML_METAL_DECL_KERNEL(rms_norm);
 
60
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
61
  GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
62
  GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
@@ -66,8 +67,10 @@ struct ggml_metal_context {
66
  GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
67
  GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
68
  GGML_METAL_DECL_KERNEL(rope);
 
69
  GGML_METAL_DECL_KERNEL(cpy_f32_f16);
70
  GGML_METAL_DECL_KERNEL(cpy_f32_f32);
 
71
 
72
  #undef GGML_METAL_DECL_KERNEL
73
  };
@@ -162,6 +165,7 @@ struct ggml_metal_context * ggml_metal_init(void) {
162
  GGML_METAL_ADD_KERNEL(get_rows_q5_k);
163
  GGML_METAL_ADD_KERNEL(get_rows_q6_k);
164
  GGML_METAL_ADD_KERNEL(rms_norm);
 
165
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
166
  GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
167
  GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
@@ -171,8 +175,10 @@ struct ggml_metal_context * ggml_metal_init(void) {
171
  GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
172
  GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
173
  GGML_METAL_ADD_KERNEL(rope);
 
174
  GGML_METAL_ADD_KERNEL(cpy_f32_f16);
175
  GGML_METAL_ADD_KERNEL(cpy_f32_f32);
 
176
 
177
  #undef GGML_METAL_ADD_KERNEL
178
  }
@@ -284,528 +290,618 @@ void ggml_metal_get_tensor(
284
 
285
  void ggml_metal_graph_compute(
286
  struct ggml_metal_context * ctx,
287
- struct ggml_cgraph * gf) {
288
  metal_printf("%s: evaluating graph\n", __func__);
289
 
290
- size_t offs_src0 = 0;
291
- size_t offs_src1 = 0;
292
- size_t offs_dst = 0;
293
-
294
- id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
295
- id<MTLComputeCommandEncoder> encoder = nil;
296
-
297
- for (int i = 0; i < gf->n_nodes; ++i) {
298
- //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
299
-
300
- struct ggml_tensor * src0 = gf->nodes[i]->src0;
301
- struct ggml_tensor * src1 = gf->nodes[i]->src1;
302
- struct ggml_tensor * dst = gf->nodes[i];
303
-
304
- const int64_t ne00 = src0 ? src0->ne[0] : 0;
305
- const int64_t ne01 = src0 ? src0->ne[1] : 0;
306
- const int64_t ne02 = src0 ? src0->ne[2] : 0;
307
- const int64_t ne03 = src0 ? src0->ne[3] : 0;
308
-
309
- const uint64_t nb00 = src0 ? src0->nb[0] : 0;
310
- const uint64_t nb01 = src0 ? src0->nb[1] : 0;
311
- const uint64_t nb02 = src0 ? src0->nb[2] : 0;
312
- const uint64_t nb03 = src0 ? src0->nb[3] : 0;
313
-
314
- const int64_t ne10 = src1 ? src1->ne[0] : 0;
315
- const int64_t ne11 = src1 ? src1->ne[1] : 0;
316
- const int64_t ne12 = src1 ? src1->ne[2] : 0;
317
- const int64_t ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
318
-
319
- const uint64_t nb10 = src1 ? src1->nb[0] : 0;
320
- const uint64_t nb11 = src1 ? src1->nb[1] : 0;
321
- const uint64_t nb12 = src1 ? src1->nb[2] : 0;
322
- const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
323
-
324
- const int64_t ne0 = dst ? dst->ne[0] : 0;
325
- const int64_t ne1 = dst ? dst->ne[1] : 0;
326
- const int64_t ne2 = dst ? dst->ne[2] : 0;
327
- const int64_t ne3 = dst ? dst->ne[3] : 0;
328
-
329
- const uint64_t nb0 = dst ? dst->nb[0] : 0;
330
- const uint64_t nb1 = dst ? dst->nb[1] : 0;
331
- const uint64_t nb2 = dst ? dst->nb[2] : 0;
332
- const uint64_t nb3 = dst ? dst->nb[3] : 0;
333
-
334
- const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
335
- const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
336
- const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
337
-
338
- id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
339
- id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
340
- id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil;
341
-
342
- //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
343
- //if (src0) {
344
- // metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
345
- // ggml_is_contiguous(src0), src0->name);
346
- //}
347
- //if (src1) {
348
- // metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
349
- // ggml_is_contiguous(src1), src1->name);
350
- //}
351
- //if (dst) {
352
- // metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
353
- // dst->name);
354
- //}
355
-
356
- switch (dst->op) {
357
- case GGML_OP_RESHAPE:
358
- case GGML_OP_VIEW:
359
- case GGML_OP_TRANSPOSE:
360
- case GGML_OP_PERMUTE:
361
- {
362
- // noop
363
- } break;
364
- case GGML_OP_ADD:
365
- {
366
- if (encoder == nil) {
367
- encoder = [command_buffer computeCommandEncoder];
368
- }
369
-
370
- [encoder setComputePipelineState:ctx->pipeline_add];
371
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
372
- [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
373
- [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
374
-
375
- const int64_t n = ggml_nelements(dst);
376
-
377
- [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
378
- } break;
379
- case GGML_OP_MUL:
380
- {
381
- if (encoder == nil) {
382
- encoder = [command_buffer computeCommandEncoder];
383
- }
384
-
385
- if (ggml_nelements(src1) == ne10) {
386
- // src1 is a row
387
- [encoder setComputePipelineState:ctx->pipeline_mul_row];
388
- } else {
389
- [encoder setComputePipelineState:ctx->pipeline_mul];
390
- }
391
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
392
- [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
393
- [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
394
- [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
395
-
396
- const int64_t n = ggml_nelements(dst);
397
-
398
- [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
399
- } break;
400
- case GGML_OP_SCALE:
401
- {
402
- if (encoder == nil) {
403
- encoder = [command_buffer computeCommandEncoder];
404
- }
405
-
406
- const float scale = *(const float *) src1->data;
407
-
408
- [encoder setComputePipelineState:ctx->pipeline_scale];
409
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
410
- [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
411
- [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
412
-
413
- const int64_t n = ggml_nelements(dst);
414
-
415
- [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
416
- } break;
417
- case GGML_OP_SILU:
418
- {
419
- if (encoder == nil) {
420
- encoder = [command_buffer computeCommandEncoder];
421
- }
422
-
423
- [encoder setComputePipelineState:ctx->pipeline_silu];
424
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
425
- [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
426
-
427
- const int64_t n = ggml_nelements(dst);
428
-
429
- [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
430
- } break;
431
- case GGML_OP_RELU:
432
- {
433
- if (encoder == nil) {
434
- encoder = [command_buffer computeCommandEncoder];
435
- }
436
-
437
- [encoder setComputePipelineState:ctx->pipeline_relu];
438
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
439
- [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
440
-
441
- const int64_t n = ggml_nelements(dst);
442
-
443
- [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
444
- } break;
445
- case GGML_OP_GELU:
446
- {
447
- if (encoder == nil) {
448
- encoder = [command_buffer computeCommandEncoder];
449
- }
450
-
451
- [encoder setComputePipelineState:ctx->pipeline_gelu];
452
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
453
- [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
454
-
455
- const int64_t n = ggml_nelements(dst);
456
-
457
- [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
458
- } break;
459
- case GGML_OP_SOFT_MAX:
460
- {
461
- if (encoder == nil) {
462
- encoder = [command_buffer computeCommandEncoder];
463
- }
464
-
465
- const int nth = 32;
466
-
467
- [encoder setComputePipelineState:ctx->pipeline_soft_max];
468
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
469
- [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
470
- [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
471
- [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
472
- [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
473
- [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
474
-
475
- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
476
- } break;
477
- case GGML_OP_DIAG_MASK_INF:
478
- {
479
- if (encoder == nil) {
480
- encoder = [command_buffer computeCommandEncoder];
481
- }
482
-
483
- const int n_past = ((int32_t *)(src1->data))[0];
484
-
485
- [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
486
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
487
- [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
488
- [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
489
- [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
490
- [encoder setBytes:&n_past length:sizeof(int) atIndex:4];
491
-
492
- [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
493
- } break;
494
- case GGML_OP_MUL_MAT:
495
- {
496
- // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
497
-
498
- GGML_ASSERT(ne00 == ne10);
499
- GGML_ASSERT(ne02 == ne12);
500
-
501
- if (ggml_is_contiguous(src0) &&
502
- ggml_is_contiguous(src1) &&
503
- (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) {
504
-
505
- if (encoder != nil) {
506
- [encoder endEncoding];
507
- encoder = nil;
508
- }
509
-
510
- MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
511
- MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
512
-
513
- // for F32 x F32 we use MPS
514
- MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor
515
- matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt];
516
-
517
- MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor
518
- matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt];
519
-
520
- MPSMatrixDescriptor * desc = [MPSMatrixDescriptor
521
- matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32];
522
-
523
- MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc]
524
- initWithDevice:ctx->device transposeLeft:false transposeRight:true
525
- resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
526
-
527
- // we need to do ne02 multiplications
528
- // TODO: is there a way to do this in parallel - currently very slow ..
529
- // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
530
- for (int64_t i02 = 0; i02 < ne02; ++i02) {
531
- size_t offs_src0_cur = offs_src0 + i02*nb02;
532
- size_t offs_src1_cur = offs_src1 + i02*nb12;
533
- size_t offs_dst_cur = offs_dst + i02*nb2;
534
-
535
- MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0];
536
- MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1];
537
- MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst_cur descriptor:desc ];
538
-
539
- [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
540
- }
541
- } else {
542
- if (encoder == nil) {
543
- encoder = [command_buffer computeCommandEncoder];
544
- }
545
-
546
- int nth0 = 32;
547
- int nth1 = 1;
548
-
549
- // use custom matrix x vector kernel
550
- switch (src0t) {
551
- case GGML_TYPE_F16:
552
- {
553
- GGML_ASSERT(ne02 == ne12);
554
-
555
- nth0 = 64;
556
- nth1 = 1;
557
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
558
- } break;
559
- case GGML_TYPE_Q4_0:
560
- {
561
- GGML_ASSERT(ne02 == 1);
562
- GGML_ASSERT(ne12 == 1);
563
-
564
- nth0 = 8;
565
- nth1 = 8;
566
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
567
- } break;
568
- case GGML_TYPE_Q4_1:
569
- {
570
- GGML_ASSERT(ne02 == 1);
571
- GGML_ASSERT(ne12 == 1);
572
-
573
- nth0 = 8;
574
- nth1 = 8;
575
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
576
- } break;
577
- case GGML_TYPE_Q2_K:
578
- {
579
- GGML_ASSERT(ne02 == 1);
580
- GGML_ASSERT(ne12 == 1);
581
-
582
- nth0 = 4;
583
- nth1 = 16;
584
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
585
- } break;
586
- case GGML_TYPE_Q3_K:
587
- {
588
- GGML_ASSERT(ne02 == 1);
589
- GGML_ASSERT(ne12 == 1);
590
-
591
- nth0 = 4;
592
- nth1 = 16;
593
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
594
- } break;
595
- case GGML_TYPE_Q4_K:
596
- {
597
- GGML_ASSERT(ne02 == 1);
598
- GGML_ASSERT(ne12 == 1);
599
-
600
- nth0 = 4;
601
- nth1 = 16;
602
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
603
- } break;
604
- case GGML_TYPE_Q5_K:
605
- {
606
- GGML_ASSERT(ne02 == 1);
607
- GGML_ASSERT(ne12 == 1);
608
-
609
- nth0 = 4;
610
- nth1 = 16;
611
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
612
- } break;
613
- case GGML_TYPE_Q6_K:
614
- {
615
- GGML_ASSERT(ne02 == 1);
616
- GGML_ASSERT(ne12 == 1);
617
-
618
- nth0 = 4;
619
- nth1 = 16;
620
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
621
- } break;
622
- default:
623
- {
624
- fprintf(stderr, "Asserting on type %d\n",(int)src0t);
625
- GGML_ASSERT(false && "not implemented");
626
- }
627
- };
628
-
629
-
630
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
631
- [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
632
- [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
633
- [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
634
- [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
635
- [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
636
- [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
637
- [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
638
- [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
639
- [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
640
- [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
641
- [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
642
- [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
643
- [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
644
- [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
645
-
646
- if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
647
- [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
648
- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
649
- }
650
- else if (src0t == GGML_TYPE_Q2_K ||
651
- src0t == GGML_TYPE_Q3_K ||
652
- src0t == GGML_TYPE_Q4_K ||
653
- src0t == GGML_TYPE_Q5_K ||
654
- src0t == GGML_TYPE_Q6_K) {
655
- [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
656
- [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
657
- } else {
658
- [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
659
- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
660
- }
661
- }
662
- } break;
663
- case GGML_OP_GET_ROWS:
664
- {
665
- if (encoder == nil) {
666
- encoder = [command_buffer computeCommandEncoder];
667
- }
668
-
669
- switch (src0->type) {
670
- case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
671
- case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
672
- case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
673
- case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
674
- case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
675
- case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
676
- case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
677
- case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
678
- default: GGML_ASSERT(false && "not implemented");
679
- }
680
-
681
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
682
- [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
683
- [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
684
- [encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3];
685
- [encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4];
686
- [encoder setBytes:&(dst->nb[1]) length:sizeof(uint64_t) atIndex:5];
687
-
688
- const int64_t n = ggml_nelements(src1);
689
-
690
- [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
691
- } break;
692
- case GGML_OP_RMS_NORM:
693
- {
694
- if (encoder == nil) {
695
- encoder = [command_buffer computeCommandEncoder];
696
- }
697
-
698
- const float eps = 1e-6f;
699
-
700
- const int nth = 256;
701
-
702
- [encoder setComputePipelineState:ctx->pipeline_rms_norm];
703
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
704
- [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
705
- [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
706
- [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
707
- [encoder setBytes:&eps length:sizeof( float) atIndex:4];
708
- [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
709
-
710
- const int64_t nrows = ggml_nrows(src0);
711
-
712
- [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
713
- } break;
714
- case GGML_OP_ROPE:
715
- {
716
- if (encoder == nil) {
717
- encoder = [command_buffer computeCommandEncoder];
718
- }
719
-
720
- const int n_dims = ((int32_t *) src1->data)[1];
721
- const int mode = ((int32_t *) src1->data)[2];
722
-
723
- const int n_past = ((int32_t *)(src1->data))[0];
724
-
725
- [encoder setComputePipelineState:ctx->pipeline_rope];
726
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
727
- [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
728
- [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
729
- [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
730
- [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
731
- [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
732
- [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
733
- [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
734
- [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
735
- [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
736
- [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
737
- [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
738
- [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
739
- [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
740
- [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
741
- [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
742
- [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
743
- [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
744
- [encoder setBytes:&n_past length:sizeof( int) atIndex:18];
745
- [encoder setBytes:&n_dims length:sizeof( int) atIndex:19];
746
- [encoder setBytes:&mode length:sizeof( int) atIndex:20];
747
-
748
- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
749
- } break;
750
- case GGML_OP_CPY:
751
- {
752
- if (encoder == nil) {
753
- encoder = [command_buffer computeCommandEncoder];
754
- }
755
-
756
- const int nth = 32;
757
-
758
- switch (src0t) {
759
- case GGML_TYPE_F32:
760
- {
761
- switch (dstt) {
762
- case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break;
763
- case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break;
764
- default: GGML_ASSERT(false && "not implemented");
765
- };
766
- } break;
767
- default: GGML_ASSERT(false && "not implemented");
768
- }
769
-
770
- [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
771
- [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
772
- [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
773
- [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
774
- [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
775
- [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
776
- [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
777
- [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
778
- [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
779
- [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
780
- [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
781
- [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
782
- [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
783
- [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
784
- [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
785
- [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
786
- [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
787
- [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
788
-
789
- [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
790
- } break;
791
- default:
792
- fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
793
- GGML_ASSERT(false);
794
- }
795
- }
796
 
797
- if (encoder != nil) {
798
- [encoder endEncoding];
799
- encoder = nil;
800
  }
801
 
802
- [command_buffer commit];
803
- [command_buffer waitUntilCompleted];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
804
 
805
- {
806
- const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime];
807
- UNUSED(time_elapsed);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
808
 
809
- metal_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
810
  }
 
 
 
 
 
811
  }
 
57
  GGML_METAL_DECL_KERNEL(get_rows_q5_k);
58
  GGML_METAL_DECL_KERNEL(get_rows_q6_k);
59
  GGML_METAL_DECL_KERNEL(rms_norm);
60
+ GGML_METAL_DECL_KERNEL(norm);
61
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
62
  GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
63
  GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
 
67
  GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
68
  GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
69
  GGML_METAL_DECL_KERNEL(rope);
70
+ GGML_METAL_DECL_KERNEL(alibi_f32);
71
  GGML_METAL_DECL_KERNEL(cpy_f32_f16);
72
  GGML_METAL_DECL_KERNEL(cpy_f32_f32);
73
+ GGML_METAL_DECL_KERNEL(cpy_f16_f16);
74
 
75
  #undef GGML_METAL_DECL_KERNEL
76
  };
 
165
  GGML_METAL_ADD_KERNEL(get_rows_q5_k);
166
  GGML_METAL_ADD_KERNEL(get_rows_q6_k);
167
  GGML_METAL_ADD_KERNEL(rms_norm);
168
+ GGML_METAL_ADD_KERNEL(norm);
169
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
170
  GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
171
  GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
 
175
  GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
176
  GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
177
  GGML_METAL_ADD_KERNEL(rope);
178
+ GGML_METAL_ADD_KERNEL(alibi_f32);
179
  GGML_METAL_ADD_KERNEL(cpy_f32_f16);
180
  GGML_METAL_ADD_KERNEL(cpy_f32_f32);
181
+ GGML_METAL_ADD_KERNEL(cpy_f16_f16);
182
 
183
  #undef GGML_METAL_ADD_KERNEL
184
  }
 
290
 
291
  void ggml_metal_graph_compute(
292
  struct ggml_metal_context * ctx,
293
+ struct ggml_cgraph * gf) {
294
  metal_printf("%s: evaluating graph\n", __func__);
295
 
296
+ // create multiple command buffers and enqueue them
297
+ // then, we encode the graph into the command buffers in parallel
298
+
299
+ const int n_cb = gf->n_threads;
300
+
301
+ NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
302
+
303
+ for (int i = 0; i < n_cb; ++i) {
304
+ command_buffers[i] = [ctx->queue commandBuffer];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
+ // enqueue the command buffers in order to specify their execution order
307
+ [command_buffers[i] enqueue];
 
308
  }
309
 
310
+ // TODO: is this the best way to start threads?
311
+ dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
312
+
313
+ for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
314
+ const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
315
+
316
+ dispatch_async(queue, ^{
317
+ size_t offs_src0 = 0;
318
+ size_t offs_src1 = 0;
319
+ size_t offs_dst = 0;
320
+
321
+ id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
322
+
323
+ id<MTLComputeCommandEncoder> encoder = nil;
324
+
325
+ const int node_start = (cb_idx + 0) * n_nodes_per_cb;
326
+ const int node_end = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
327
+
328
+ for (int i = node_start; i < node_end; ++i) {
329
+ metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
330
+
331
+ struct ggml_tensor * src0 = gf->nodes[i]->src0;
332
+ struct ggml_tensor * src1 = gf->nodes[i]->src1;
333
+ struct ggml_tensor * dst = gf->nodes[i];
334
+
335
+ const int64_t ne00 = src0 ? src0->ne[0] : 0;
336
+ const int64_t ne01 = src0 ? src0->ne[1] : 0;
337
+ const int64_t ne02 = src0 ? src0->ne[2] : 0;
338
+ const int64_t ne03 = src0 ? src0->ne[3] : 0;
339
+
340
+ const uint64_t nb00 = src0 ? src0->nb[0] : 0;
341
+ const uint64_t nb01 = src0 ? src0->nb[1] : 0;
342
+ const uint64_t nb02 = src0 ? src0->nb[2] : 0;
343
+ const uint64_t nb03 = src0 ? src0->nb[3] : 0;
344
+
345
+ const int64_t ne10 = src1 ? src1->ne[0] : 0;
346
+ const int64_t ne11 = src1 ? src1->ne[1] : 0;
347
+ const int64_t ne12 = src1 ? src1->ne[2] : 0;
348
+ const int64_t ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
349
+
350
+ const uint64_t nb10 = src1 ? src1->nb[0] : 0;
351
+ const uint64_t nb11 = src1 ? src1->nb[1] : 0;
352
+ const uint64_t nb12 = src1 ? src1->nb[2] : 0;
353
+ const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
354
+
355
+ const int64_t ne0 = dst ? dst->ne[0] : 0;
356
+ const int64_t ne1 = dst ? dst->ne[1] : 0;
357
+ const int64_t ne2 = dst ? dst->ne[2] : 0;
358
+ const int64_t ne3 = dst ? dst->ne[3] : 0;
359
+
360
+ const uint64_t nb0 = dst ? dst->nb[0] : 0;
361
+ const uint64_t nb1 = dst ? dst->nb[1] : 0;
362
+ const uint64_t nb2 = dst ? dst->nb[2] : 0;
363
+ const uint64_t nb3 = dst ? dst->nb[3] : 0;
364
+
365
+ const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
366
+ const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
367
+ const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
368
+
369
+ id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
370
+ id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
371
+ id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil;
372
+
373
+ //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
374
+ //if (src0) {
375
+ // metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
376
+ // ggml_is_contiguous(src0), src0->name);
377
+ //}
378
+ //if (src1) {
379
+ // metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
380
+ // ggml_is_contiguous(src1), src1->name);
381
+ //}
382
+ //if (dst) {
383
+ // metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
384
+ // dst->name);
385
+ //}
386
+
387
+ switch (dst->op) {
388
+ case GGML_OP_RESHAPE:
389
+ case GGML_OP_VIEW:
390
+ case GGML_OP_TRANSPOSE:
391
+ case GGML_OP_PERMUTE:
392
+ {
393
+ // noop
394
+ } break;
395
+ case GGML_OP_ADD:
396
+ {
397
+ if (encoder == nil) {
398
+ encoder = [command_buffer computeCommandEncoder];
399
+ }
400
+
401
+ [encoder setComputePipelineState:ctx->pipeline_add];
402
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
403
+ [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
404
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
405
+
406
+ const int64_t n = ggml_nelements(dst);
407
+
408
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
409
+ } break;
410
+ case GGML_OP_MUL:
411
+ {
412
+ if (encoder == nil) {
413
+ encoder = [command_buffer computeCommandEncoder];
414
+ }
415
+
416
+ if (ggml_nelements(src1) == ne10) {
417
+ // src1 is a row
418
+ [encoder setComputePipelineState:ctx->pipeline_mul_row];
419
+ } else {
420
+ [encoder setComputePipelineState:ctx->pipeline_mul];
421
+ }
422
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
423
+ [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
424
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
425
+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
426
+
427
+ const int64_t n = ggml_nelements(dst);
428
+
429
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
430
+ } break;
431
+ case GGML_OP_SCALE:
432
+ {
433
+ if (encoder == nil) {
434
+ encoder = [command_buffer computeCommandEncoder];
435
+ }
436
+
437
+ const float scale = *(const float *) src1->data;
438
+
439
+ [encoder setComputePipelineState:ctx->pipeline_scale];
440
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
441
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
442
+ [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
443
+
444
+ const int64_t n = ggml_nelements(dst);
445
+
446
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
447
+ } break;
448
+ case GGML_OP_SILU:
449
+ {
450
+ if (encoder == nil) {
451
+ encoder = [command_buffer computeCommandEncoder];
452
+ }
453
+
454
+ [encoder setComputePipelineState:ctx->pipeline_silu];
455
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
456
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
457
+
458
+ const int64_t n = ggml_nelements(dst);
459
+
460
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
461
+ } break;
462
+ case GGML_OP_RELU:
463
+ {
464
+ if (encoder == nil) {
465
+ encoder = [command_buffer computeCommandEncoder];
466
+ }
467
+
468
+ [encoder setComputePipelineState:ctx->pipeline_relu];
469
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
470
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
471
+
472
+ const int64_t n = ggml_nelements(dst);
473
+
474
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
475
+ } break;
476
+ case GGML_OP_GELU:
477
+ {
478
+ if (encoder == nil) {
479
+ encoder = [command_buffer computeCommandEncoder];
480
+ }
481
+
482
+ [encoder setComputePipelineState:ctx->pipeline_gelu];
483
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
484
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
485
+
486
+ const int64_t n = ggml_nelements(dst);
487
+
488
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
489
+ } break;
490
+ case GGML_OP_SOFT_MAX:
491
+ {
492
+ if (encoder == nil) {
493
+ encoder = [command_buffer computeCommandEncoder];
494
+ }
495
+
496
+ const int nth = 32;
497
+
498
+ [encoder setComputePipelineState:ctx->pipeline_soft_max];
499
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
500
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
501
+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
502
+ [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
503
+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
504
+ [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
505
+
506
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
507
+ } break;
508
+ case GGML_OP_DIAG_MASK_INF:
509
+ {
510
+ if (encoder == nil) {
511
+ encoder = [command_buffer computeCommandEncoder];
512
+ }
513
+
514
+ const int n_past = ((int32_t *)(src1->data))[0];
515
+
516
+ [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
517
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
518
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
519
+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
520
+ [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
521
+ [encoder setBytes:&n_past length:sizeof(int) atIndex:4];
522
+
523
+ [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
524
+ } break;
525
+ case GGML_OP_MUL_MAT:
526
+ {
527
+ // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
528
+
529
+ GGML_ASSERT(ne00 == ne10);
530
+ GGML_ASSERT(ne02 == ne12);
531
+
532
+ if (ggml_is_contiguous(src0) &&
533
+ ggml_is_contiguous(src1) &&
534
+ (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) {
535
+
536
+ if (encoder != nil) {
537
+ [encoder endEncoding];
538
+ encoder = nil;
539
+ }
540
 
541
+ MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
542
+ MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
543
+
544
+ // for F32 x F32 we use MPS
545
+ MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor
546
+ matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt];
547
+
548
+ MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor
549
+ matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt];
550
+
551
+ MPSMatrixDescriptor * desc = [MPSMatrixDescriptor
552
+ matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32];
553
+
554
+ MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc]
555
+ initWithDevice:ctx->device transposeLeft:false transposeRight:true
556
+ resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
557
+
558
+ // we need to do ne02 multiplications
559
+ // TODO: is there a way to do this in parallel - currently very slow ..
560
+ // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
561
+ for (int64_t i02 = 0; i02 < ne02; ++i02) {
562
+ size_t offs_src0_cur = offs_src0 + i02*nb02;
563
+ size_t offs_src1_cur = offs_src1 + i02*nb12;
564
+ size_t offs_dst_cur = offs_dst + i02*nb2;
565
 
566
+ MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0];
567
+ MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1];
568
+ MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst_cur descriptor:desc ];
569
+
570
+ [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
571
+ }
572
+ } else {
573
+ if (encoder == nil) {
574
+ encoder = [command_buffer computeCommandEncoder];
575
+ }
576
+
577
+ int nth0 = 32;
578
+ int nth1 = 1;
579
+
580
+ // use custom matrix x vector kernel
581
+ switch (src0t) {
582
+ case GGML_TYPE_F16:
583
+ {
584
+ GGML_ASSERT(ne02 == ne12);
585
+
586
+ nth0 = 64;
587
+ nth1 = 1;
588
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
589
+ } break;
590
+ case GGML_TYPE_Q4_0:
591
+ {
592
+ GGML_ASSERT(ne02 == 1);
593
+ GGML_ASSERT(ne12 == 1);
594
+
595
+ nth0 = 8;
596
+ nth1 = 8;
597
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
598
+ } break;
599
+ case GGML_TYPE_Q4_1:
600
+ {
601
+ GGML_ASSERT(ne02 == 1);
602
+ GGML_ASSERT(ne12 == 1);
603
+
604
+ nth0 = 8;
605
+ nth1 = 8;
606
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
607
+ } break;
608
+ case GGML_TYPE_Q2_K:
609
+ {
610
+ GGML_ASSERT(ne02 == 1);
611
+ GGML_ASSERT(ne12 == 1);
612
+
613
+ nth0 = 4;
614
+ nth1 = 16;
615
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
616
+ } break;
617
+ case GGML_TYPE_Q3_K:
618
+ {
619
+ GGML_ASSERT(ne02 == 1);
620
+ GGML_ASSERT(ne12 == 1);
621
+
622
+ nth0 = 4;
623
+ nth1 = 16;
624
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
625
+ } break;
626
+ case GGML_TYPE_Q4_K:
627
+ {
628
+ GGML_ASSERT(ne02 == 1);
629
+ GGML_ASSERT(ne12 == 1);
630
+
631
+ nth0 = 4;
632
+ nth1 = 16;
633
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
634
+ } break;
635
+ case GGML_TYPE_Q5_K:
636
+ {
637
+ GGML_ASSERT(ne02 == 1);
638
+ GGML_ASSERT(ne12 == 1);
639
+
640
+ nth0 = 4;
641
+ nth1 = 16;
642
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
643
+ } break;
644
+ case GGML_TYPE_Q6_K:
645
+ {
646
+ GGML_ASSERT(ne02 == 1);
647
+ GGML_ASSERT(ne12 == 1);
648
+
649
+ nth0 = 4;
650
+ nth1 = 16;
651
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
652
+ } break;
653
+ default:
654
+ {
655
+ fprintf(stderr, "Asserting on type %d\n",(int)src0t);
656
+ GGML_ASSERT(false && "not implemented");
657
+ }
658
+ };
659
+
660
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
661
+ [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
662
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
663
+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
664
+ [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
665
+ [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
666
+ [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
667
+ [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
668
+ [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
669
+ [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
670
+ [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
671
+ [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
672
+ [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
673
+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
674
+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
675
+
676
+ if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
677
+ [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
678
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
679
+ }
680
+ else if (src0t == GGML_TYPE_Q2_K ||
681
+ src0t == GGML_TYPE_Q3_K ||
682
+ src0t == GGML_TYPE_Q4_K ||
683
+ src0t == GGML_TYPE_Q5_K ||
684
+ src0t == GGML_TYPE_Q6_K) {
685
+ [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
686
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
687
+ } else {
688
+ [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
689
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
690
+ }
691
+ }
692
+ } break;
693
+ case GGML_OP_GET_ROWS:
694
+ {
695
+ if (encoder == nil) {
696
+ encoder = [command_buffer computeCommandEncoder];
697
+ }
698
+
699
+ switch (src0->type) {
700
+ case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
701
+ case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
702
+ case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
703
+ case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
704
+ case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
705
+ case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
706
+ case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
707
+ case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
708
+ default: GGML_ASSERT(false && "not implemented");
709
+ }
710
+
711
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
712
+ [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
713
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
714
+ [encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3];
715
+ [encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4];
716
+ [encoder setBytes:&(dst->nb[1]) length:sizeof(uint64_t) atIndex:5];
717
+
718
+ const int64_t n = ggml_nelements(src1);
719
+
720
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
721
+ } break;
722
+ case GGML_OP_RMS_NORM:
723
+ {
724
+ if (encoder == nil) {
725
+ encoder = [command_buffer computeCommandEncoder];
726
+ }
727
+
728
+ const float eps = 1e-6f;
729
+
730
+ const int nth = 256;
731
+
732
+ [encoder setComputePipelineState:ctx->pipeline_rms_norm];
733
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
734
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
735
+ [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
736
+ [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
737
+ [encoder setBytes:&eps length:sizeof( float) atIndex:4];
738
+ [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
739
+
740
+ const int64_t nrows = ggml_nrows(src0);
741
+
742
+ [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
743
+ } break;
744
+ case GGML_OP_NORM:
745
+ {
746
+ if (encoder == nil) {
747
+ encoder = [command_buffer computeCommandEncoder];
748
+ }
749
+
750
+ const float eps = 1e-5f;
751
+
752
+ const int nth = 256;
753
+
754
+ [encoder setComputePipelineState:ctx->pipeline_norm];
755
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
756
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
757
+ [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
758
+ [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
759
+ [encoder setBytes:&eps length:sizeof( float) atIndex:4];
760
+ [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
761
+
762
+ const int64_t nrows = ggml_nrows(src0);
763
+
764
+ [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
765
+ } break;
766
+ case GGML_OP_ALIBI:
767
+ {
768
+ GGML_ASSERT((src0t == GGML_TYPE_F32));
769
+ const int n_past = ((int32_t *) src1->data)[0];
770
+ const int n_head = ((int32_t *) src1->data)[1];
771
+ const float max_bias = ((float *) src1->data)[2];
772
+ if (__builtin_popcount(n_head) != 1) {
773
+ GGML_ASSERT(false && "only power-of-two n_head implemented");
774
+ }
775
+ const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
776
+ const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
777
+ if (encoder == nil) {
778
+ encoder = [command_buffer computeCommandEncoder];
779
+ }
780
+ [encoder setComputePipelineState:ctx->pipeline_alibi_f32];
781
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
782
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
783
+ [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
784
+ [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
785
+ [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
786
+ [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
787
+ [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
788
+ [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
789
+ [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
790
+ [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
791
+ [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
792
+ [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
793
+ [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
794
+ [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
795
+ [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
796
+ [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
797
+ [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
798
+ [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
799
+ [encoder setBytes:&m0 length:sizeof( float) atIndex:18];
800
+ const int nth = 32;
801
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
802
+ } break;
803
+ case GGML_OP_ROPE:
804
+ {
805
+ if (encoder == nil) {
806
+ encoder = [command_buffer computeCommandEncoder];
807
+ }
808
+
809
+ const int n_dims = ((int32_t *) src1->data)[1];
810
+ const int mode = ((int32_t *) src1->data)[2];
811
+
812
+ const int n_past = ((int32_t *)(src1->data))[0];
813
+
814
+ [encoder setComputePipelineState:ctx->pipeline_rope];
815
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
816
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
817
+ [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
818
+ [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
819
+ [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
820
+ [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
821
+ [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
822
+ [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
823
+ [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
824
+ [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
825
+ [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
826
+ [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
827
+ [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
828
+ [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
829
+ [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
830
+ [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
831
+ [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
832
+ [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
833
+ [encoder setBytes:&n_past length:sizeof( int) atIndex:18];
834
+ [encoder setBytes:&n_dims length:sizeof( int) atIndex:19];
835
+ [encoder setBytes:&mode length:sizeof( int) atIndex:20];
836
+
837
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
838
+ } break;
839
+ case GGML_OP_CPY:
840
+ {
841
+ if (encoder == nil) {
842
+ encoder = [command_buffer computeCommandEncoder];
843
+ }
844
+
845
+ const int nth = 32;
846
+
847
+ switch (src0t) {
848
+ case GGML_TYPE_F32:
849
+ {
850
+ switch (dstt) {
851
+ case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break;
852
+ case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break;
853
+ default: GGML_ASSERT(false && "not implemented");
854
+ };
855
+ } break;
856
+ case GGML_TYPE_F16:
857
+ {
858
+ switch (dstt) {
859
+ case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break;
860
+ case GGML_TYPE_F32: GGML_ASSERT(false && "cpy_f16_f32 not implemented"); break;
861
+ default: GGML_ASSERT(false && "not implemented");
862
+ };
863
+ } break;
864
+ default: GGML_ASSERT(false && "not implemented");
865
+ }
866
+
867
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
868
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
869
+ [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
870
+ [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
871
+ [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
872
+ [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
873
+ [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
874
+ [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
875
+ [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
876
+ [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
877
+ [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
878
+ [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
879
+ [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
880
+ [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
881
+ [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
882
+ [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
883
+ [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
884
+ [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
885
+
886
+ [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
887
+ } break;
888
+ default:
889
+ fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
890
+ GGML_ASSERT(false);
891
+ }
892
+ }
893
+
894
+ if (encoder != nil) {
895
+ [encoder endEncoding];
896
+ encoder = nil;
897
+ }
898
+
899
+ [command_buffer commit];
900
+ });
901
  }
902
+
903
+ // wait for all threads to finish
904
+ dispatch_barrier_sync(queue, ^{});
905
+
906
+ [command_buffers[n_cb - 1] waitUntilCompleted];
907
  }
ggml-metal.metal CHANGED
@@ -256,6 +256,72 @@ kernel void kernel_get_rows_q4_1(
256
  (device float *) ((device char *) dst + i*nb1), ne00);
257
  }
258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  kernel void kernel_rms_norm(
260
  device const void * src0,
261
  device float * dst,
@@ -485,6 +551,48 @@ kernel void kernel_mul_mat_f16_f32(
485
  }
486
  }
487
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
488
  kernel void kernel_rope(
489
  device const void * src0,
490
  device float * dst,
@@ -540,6 +648,47 @@ kernel void kernel_rope(
540
  }
541
  }
542
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
543
  kernel void kernel_cpy_f32_f16(
544
  device const float * src0,
545
  device half * dst,
 
256
  (device float *) ((device char *) dst + i*nb1), ne00);
257
  }
258
 
259
// Layer normalization of one row per threadgroup:
//     y = (x - mean(x)) / sqrt(var(x) + eps)
//
// src0 : input rows, read as float; row `tgpig` starts at byte offset tgpig*nb01
// dst  : output; row `tgpig` is written starting at element offset tgpig*ne00
// ne00 : number of elements in a row
// nb01 : byte stride between consecutive input rows
// eps  : added to the variance before taking the square root
// sum  : threadgroup scratch, indexed by thread position (one float per thread)
//
// NOTE: the tree reductions start at ntg/2 and halve the stride each step, so they
// only fold in every slot of sum[] when ntg is a power of two.
kernel void kernel_norm(
        device const  void * src0,
        device       float * dst,
        constant   int64_t & ne00,
        constant  uint64_t & nb01,
        constant     float & eps,
        threadgroup float  * sum [[threadgroup(0)]],
        uint tgpig[[threadgroup_position_in_grid]],
        uint tpitg[[thread_position_in_threadgroup]],
        uint   ntg[[threads_per_threadgroup]]) {
    device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01);

    // MEAN
    // parallel sum: each thread accumulates a strided slice of the row
    sum[tpitg] = 0.0f;
    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
        sum[tpitg] += x[i00];
    }
    // reduce: pairwise tree reduction into sum[0]
    threadgroup_barrier(mem_flags::mem_threadgroup);
    for (uint i = ntg/2; i > 0; i /= 2) {
        if (tpitg < i) {
            sum[tpitg] += sum[tpitg + i];
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }
    // broadcast: thread 0 turns the total into the mean, everyone reads it back
    if (tpitg == 0) {
        sum[0] /= ne00;
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    const float mean = sum[0];

    // sum[] is reused for the variance below. Every thread must have loaded the
    // mean before any thread (in particular thread 0) resets its slot to zero,
    // otherwise a slower thread can observe 0 instead of the mean.
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // recenter
    device float * y = dst + tgpig*ne00;
    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
        y[i00] = x[i00] - mean;
    }

    // VARIANCE
    // parallel sum: each thread only re-reads the y[] elements it wrote itself
    // (same start and stride as the recenter loop above)
    sum[tpitg] = 0.0f;
    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
        sum[tpitg] += y[i00] * y[i00];
    }
    // reduce
    threadgroup_barrier(mem_flags::mem_threadgroup);
    for (uint i = ntg/2; i > 0; i /= 2) {
        if (tpitg < i) {
            sum[tpitg] += sum[tpitg + i];
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }
    // broadcast
    if (tpitg == 0) {
        sum[0] /= ne00;
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    const float variance = sum[0];

    // normalize in place
    const float scale = 1.0f/sqrt(variance + eps);
    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
        y[i00] = y[i00] * scale;
    }
}
323
+
324
+
325
  kernel void kernel_rms_norm(
326
  device const void * src0,
327
  device float * dst,
 
551
  }
552
  }
553
 
554
// Adds a linear, per-index bias to one source row per threadgroup:
//     dst_row[i00] = src_row[i00] + m0^(i2 + 1) * (i00 - ne00 + 1)
//
// The threadgroup position (x, y, z) selects the source row (i01, i02, i03).
// The row's flat element index is unravelled with the destination extents
// ne0..ne3 to find where the row lands in dst; i2 is the resulting index along
// destination dimension 2. Threads of the group stride over the row's elements.
kernel void kernel_alibi_f32(
        device const float * src0,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant   int64_t & ne03,
        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant  uint64_t & nb03,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   int64_t & ne2,
        constant   int64_t & ne3,
        constant  uint64_t & nb0,
        constant  uint64_t & nb1,
        constant  uint64_t & nb2,
        constant  uint64_t & nb3,
        constant     float & m0,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3   ntg[[threads_per_threadgroup]]) {
    const int64_t i01 = tgpig[0];
    const int64_t i02 = tgpig[1];
    const int64_t i03 = tgpig[2];

    // flat element index of the first element of this source row
    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

    // unravel the flat index into destination coordinates, peeling one
    // dimension at a time and carrying the remainder forward
    const int64_t i3   = n / (ne2*ne1*ne0);
    const int64_t rem3 = n - i3*ne2*ne1*ne0;
    const int64_t i2   = rem3 / (ne1*ne0);
    const int64_t rem2 = rem3 - i2*ne1*ne0;
    const int64_t i1   = rem2 / ne0;
    const int64_t i0   = rem2 - i1*ne0;

    device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

    // byte address of the start of the source row; elements are nb00 bytes apart
    device const char * src_row = (device const char *) src0 + i03*nb03 + i02*nb02 + i01*nb01;

    float m_k = pow(m0, i2 + 1);
    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
        const float v = *((device const float *) (src_row + i00*nb00));
        dst_data[i00] = v + m_k * (i00 - ne00 + 1);
    }
}
595
+
596
  kernel void kernel_rope(
597
  device const void * src0,
598
  device float * dst,
 
648
  }
649
  }
650
 
651
// Copies one row of half-precision values per threadgroup from src0 to dst.
//
// The threadgroup position (x, y, z) selects the source row (i01, i02, i03).
// The row's flat element index is unravelled with the destination extents
// ne0..ne3 and combined with the destination byte strides nb0..nb3 to find the
// output location. Threads of the group stride over the row's elements; source
// elements are addressed with the byte stride nb00.
kernel void kernel_cpy_f16_f16(
        device const half * src0,
        device       half * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant   int64_t & ne03,
        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant  uint64_t & nb03,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   int64_t & ne2,
        constant   int64_t & ne3,
        constant  uint64_t & nb0,
        constant  uint64_t & nb1,
        constant  uint64_t & nb2,
        constant  uint64_t & nb3,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3   ntg[[threads_per_threadgroup]]) {
    const int64_t i01 = tgpig[0];
    const int64_t i02 = tgpig[1];
    const int64_t i03 = tgpig[2];

    // flat element index of the first element of this source row
    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

    // unravel the flat index into destination coordinates, peeling one
    // dimension at a time and carrying the remainder forward
    const int64_t i3   = n / (ne2*ne1*ne0);
    const int64_t rem3 = n - i3*ne2*ne1*ne0;
    const int64_t i2   = rem3 / (ne1*ne0);
    const int64_t rem2 = rem3 - i2*ne1*ne0;
    const int64_t i1   = rem2 / ne0;
    const int64_t i0   = rem2 - i1*ne0;

    device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

    // byte address of the start of the source row
    device const char * src_row = (device const char *) src0 + i03*nb03 + i02*nb02 + i01*nb01;

    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
        dst_data[i00] = *((device const half *) (src_row + i00*nb00));
    }
}
691
+
692
  kernel void kernel_cpy_f32_f16(
693
  device const float * src0,
694
  device half * dst,
ggml.c CHANGED
@@ -35,6 +35,12 @@
35
  #define static_assert(cond, msg) struct global_scope_noop_trick
36
  #endif
37
 
 
 
 
 
 
 
38
  #if defined(_WIN32)
39
 
40
  #include <windows.h>
@@ -3939,6 +3945,12 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
3939
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
3940
  }
3941
 
 
 
 
 
 
 
3942
  static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
3943
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3944
 
 
35
  #define static_assert(cond, msg) struct global_scope_noop_trick
36
  #endif
37
 
38
+ #if defined(_MSC_VER)
39
+ // disable "possible loss of data" to avoid hundreds of casts
40
+ // we should just be careful :)
41
+ #pragma warning(disable: 4244 4267)
42
+ #endif
43
+
44
  #if defined(_WIN32)
45
 
46
  #include <windows.h>
 
3945
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
3946
  }
3947
 
3948
+ bool ggml_is_permuted(const struct ggml_tensor * tensor) {
3949
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3950
+
3951
+ return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
3952
+ }
3953
+
3954
  static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
3955
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3956
 
ggml.h CHANGED
@@ -485,6 +485,7 @@ extern "C" {
485
 
486
  GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
487
  GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
 
488
 
489
  // use this to compute the memory overhead of a tensor
490
  GGML_API size_t ggml_tensor_overhead(void);
 
485
 
486
  GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
487
  GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
488
+ GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
489
 
490
  // use this to compute the memory overhead of a tensor
491
  GGML_API size_t ggml_tensor_overhead(void);
gpttype_adapter.cpp CHANGED
@@ -1173,6 +1173,16 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
1173
  int topid = std::min_element(logits.begin(),logits.end())-logits.begin();
1174
  logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
1175
  }
 
 
 
 
 
 
 
 
 
 
1176
  }
1177
 
1178
  // set the logit of the eos token (0) to minimum to avoid sampling it
@@ -1280,7 +1290,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
1280
  float pt1 = (time1*1000.0/(embd_inp.size()==0?1:embd_inp.size()));
1281
  int realnpredict = params.n_predict-stopper_unused_tokens;
1282
  float pt2 = (time2*1000.0/(realnpredict==0?1:realnpredict));
1283
- printf("\nTime Taken - Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs", time1, pt1, time2, pt2, (time1 + time2));
 
1284
  fflush(stdout);
1285
  output.status = 1;
1286
  generation_finished = true;
 
1173
  int topid = std::min_element(logits.begin(),logits.end())-logits.begin();
1174
  logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
1175
  }
1176
+ else
1177
+ {
1178
+ //special case, starcoder models use ID 0 for EOS
1179
+ if (file_format == FileFormat::GPT2_3 || file_format == FileFormat::GPT2_4)
1180
+ {
1181
+ eosID = 0;
1182
+ int topid = std::min_element(logits.begin(), logits.end()) - logits.begin();
1183
+ logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
1184
+ }
1185
+ }
1186
  }
1187
 
1188
  // set the logit of the eos token (0) to minimum to avoid sampling it
 
1290
  float pt1 = (time1*1000.0/(embd_inp.size()==0?1:embd_inp.size()));
1291
  int realnpredict = params.n_predict-stopper_unused_tokens;
1292
  float pt2 = (time2*1000.0/(realnpredict==0?1:realnpredict));
1293
+ float tokens_per_second = (realnpredict == 0 ? 0 : realnpredict / (time1 + time2));
1294
+ printf("\nTime Taken - Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs (%.1fT/s)", time1, pt1, time2, pt2, (time1 + time2), tokens_per_second);
1295
  fflush(stdout);
1296
  output.status = 1;
1297
  generation_finished = true;
klite.embd CHANGED
The diff for this file is too large to render. See raw diff
 
koboldcpp.py CHANGED
@@ -224,7 +224,7 @@ maxctx = 2048
224
  maxlen = 256
225
  modelbusy = False
226
  defaultport = 5001
227
- KcppVersion = "1.30.3"
228
 
229
  class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
230
  sys_version = ""
@@ -415,6 +415,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
415
  self.end_headers()
416
  self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode())
417
  print("Generation Aborted")
 
418
  return
419
 
420
  if self.path.endswith('/api/extra/generate/check'):
 
224
  maxlen = 256
225
  modelbusy = False
226
  defaultport = 5001
227
+ KcppVersion = "1.31"
228
 
229
  class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
230
  sys_version = ""
 
415
  self.end_headers()
416
  self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode())
417
  print("Generation Aborted")
418
+ modelbusy = False
419
  return
420
 
421
  if self.path.endswith('/api/extra/generate/check'):
llama.cpp CHANGED
@@ -40,6 +40,10 @@
40
  #include <sstream>
41
  #include <numeric>
42
 
 
 
 
 
43
  #define LLAMA_USE_SCRATCH
44
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
45
 
@@ -165,6 +169,11 @@ struct llama_kv_cache {
165
  if (ctx) {
166
  ggml_free(ctx);
167
  }
 
 
 
 
 
168
  }
169
  };
170
 
@@ -210,6 +219,7 @@ struct llama_model {
210
  for (size_t i = 0; i < tensors_by_name.size(); ++i) {
211
  ggml_cuda_free_data(tensors_by_name[i].second);
212
  }
 
213
  #elif defined(GGML_USE_CLBLAST)
214
  for (size_t i = 0; i < tensors_by_name.size(); ++i) {
215
  ggml_cl_free_data(tensors_by_name[i].second);
@@ -867,7 +877,8 @@ static bool kv_cache_init(
867
  const struct llama_hparams & hparams,
868
  struct llama_kv_cache & cache,
869
  ggml_type wtype,
870
- int n_ctx) {
 
871
  const int n_embd = hparams.n_embd;
872
  const int n_layer = hparams.n_layer;
873
 
@@ -893,6 +904,15 @@ static bool kv_cache_init(
893
  ggml_set_name(cache.k, "cache_k");
894
  ggml_set_name(cache.v, "cache_v");
895
 
 
 
 
 
 
 
 
 
 
896
  return true;
897
  }
898
 
@@ -903,6 +923,7 @@ struct llama_context_params llama_context_default_params() {
903
  /*.gpu_layers =*/ 0,
904
  /*.main_gpu =*/ 0,
905
  /*.tensor_split =*/ {0},
 
906
  /*.seed =*/ -1,
907
  /*.f16_kv =*/ true,
908
  /*.logits_all =*/ false,
@@ -1011,6 +1032,7 @@ static void llama_model_load_internal(
1011
  int n_gpu_layers,
1012
  int main_gpu,
1013
  const float * tensor_split,
 
1014
  ggml_type memory_type,
1015
  bool use_mmap,
1016
  bool use_mlock,
@@ -1137,18 +1159,34 @@ static void llama_model_load_internal(
1137
  ml->ggml_ctx = ctx;
1138
 
1139
  model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
1140
- model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
1141
 
1142
  // "output" tensor
1143
  {
 
1144
  ggml_backend backend_output;
1145
  if (n_gpu_layers > int(n_layer)) { // NOLINT
 
 
 
 
 
 
 
 
1146
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
1147
  } else {
 
1148
  backend_output = GGML_BACKEND_CPU;
1149
  }
1150
 
 
1151
  model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
 
 
 
 
 
 
1152
  }
1153
 
1154
  const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1208,22 +1246,47 @@ static void llama_model_load_internal(
1208
  (void) vram_scratch;
1209
  (void) n_batch;
1210
  #ifdef GGML_USE_CUBLAS
1211
- vram_scratch = n_batch * MB;
1212
- ggml_cuda_set_scratch_size(vram_scratch);
1213
- if (n_gpu_layers > 0) {
1214
- fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
1215
- __func__, vram_scratch / MB);
 
 
 
 
 
1216
  }
1217
  #endif // GGML_USE_CUBLAS
1218
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
1219
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
1220
 
1221
- fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
1222
  if (n_gpu_layers > (int) hparams.n_layer) {
1223
- fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1224
  }
 
 
 
1225
  fprintf(stderr, "%s: total VRAM used: %zu MB\n",
1226
- __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
1227
  #else
1228
  (void) n_gpu_layers;
1229
  #endif
@@ -1262,6 +1325,7 @@ static bool llama_model_load(
1262
  int n_gpu_layers,
1263
  int main_gpu,
1264
  float * tensor_split,
 
1265
  ggml_type memory_type,
1266
  bool use_mmap,
1267
  bool use_mlock,
@@ -1269,7 +1333,7 @@ static bool llama_model_load(
1269
  llama_progress_callback progress_callback,
1270
  void *progress_callback_user_data) {
1271
  try {
1272
- llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
1273
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
1274
  return true;
1275
  } catch (const std::exception & err) {
@@ -1345,12 +1409,33 @@ static bool llama_eval_internal(
1345
  const int i_gpu_start = n_layer - n_gpu_layers;
1346
  (void) i_gpu_start;
1347
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1348
  for (int il = 0; il < n_layer; ++il) {
1349
  offload_func_t offload_func = llama_nop;
1350
 
1351
  #ifdef GGML_USE_CUBLAS
1352
  if (il >= i_gpu_start) {
1353
- offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
1354
  }
1355
  #endif // GGML_USE_CUBLAS
1356
 
@@ -1373,31 +1458,42 @@ static bool llama_eval_internal(
1373
  // self-attention
1374
  {
1375
  // compute Q and K and RoPE them
1376
- struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
1377
- // offload_func(tmpq);
1378
- ggml_set_name(tmpq, "tmpq");
1379
-
1380
  struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
1381
- // offload_func(tmpk);
1382
  ggml_set_name(tmpk, "tmpk");
1383
 
 
 
 
 
1384
  struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
 
1385
  ggml_set_name(Kcur, "Kcur");
1386
 
1387
  struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
 
1388
  ggml_set_name(Qcur, "Qcur");
1389
 
1390
  // store key and value to memory
1391
  {
1392
  // compute the transposed [N, n_embd] V matrix
1393
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
 
 
 
 
 
 
1394
  ggml_set_name(Vcur, "Vcur");
1395
 
1396
  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
 
1397
  ggml_set_name(k, "k");
 
1398
  struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
1399
  ( n_ctx)*ggml_element_size(kv_self.v),
1400
  (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
 
1401
  ggml_set_name(v, "v");
1402
 
1403
  // important: storing RoPE-ed version of K in the KV cache!
@@ -1409,6 +1505,7 @@ static bool llama_eval_internal(
1409
  ggml_permute(ctx0,
1410
  Qcur,
1411
  0, 2, 1, 3);
 
1412
  ggml_set_name(Q, "Q");
1413
 
1414
  struct ggml_tensor * K =
@@ -1417,10 +1514,12 @@ static bool llama_eval_internal(
1417
  ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
1418
  n_embd/n_head, n_head, n_past + N),
1419
  0, 2, 1, 3);
 
1420
  ggml_set_name(K, "K");
1421
 
1422
  // K * Q
1423
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
1424
  ggml_set_name(KQ, "KQ");
1425
 
1426
  // KQ_scaled = KQ / sqrt(n_embd/n_head)
@@ -1429,14 +1528,17 @@ static bool llama_eval_internal(
1429
 
1430
  // KQ_scaled shape [n_past + N, N, n_head, 1]
1431
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
 
1432
  ggml_set_name(KQ_scaled, "KQ_scaled");
1433
 
1434
  // KQ_masked = mask_past(KQ_scaled)
1435
  struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
 
1436
  ggml_set_name(KQ_masked, "KQ_masked");
1437
 
1438
  // KQ = soft_max(KQ_masked)
1439
  struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
 
1440
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
1441
 
1442
  // split cached V into n_head heads
@@ -1446,10 +1548,12 @@ static bool llama_eval_internal(
1446
  n_ctx*ggml_element_size(kv_self.v),
1447
  n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
1448
  il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
 
1449
  ggml_set_name(V, "V");
1450
 
1451
  #if 1
1452
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 
1453
  ggml_set_name(KQV, "KQV");
1454
  #else
1455
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
@@ -1461,12 +1565,14 @@ static bool llama_eval_internal(
1461
 
1462
  // KQV_merged = KQV.permute(0, 2, 1, 3)
1463
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
1464
  ggml_set_name(KQV_merged, "KQV_merged");
1465
 
1466
  // cur = KQV_merged.contiguous().view(n_embd, N)
1467
  cur = ggml_cpy(ctx0,
1468
  KQV_merged,
1469
  ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
 
1470
  ggml_set_name(cur, "KQV_merged_contiguous");
1471
 
1472
  // projection (no bias)
@@ -1478,7 +1584,6 @@ static bool llama_eval_internal(
1478
  }
1479
 
1480
  lctx.use_buf(ctx0, 1);
1481
- //ggml_cuda_set_scratch(1);
1482
 
1483
  struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
1484
  offload_func(inpFF);
@@ -1536,32 +1641,24 @@ static bool llama_eval_internal(
1536
  }
1537
 
1538
  lctx.use_buf(ctx0, 0);
1539
- //ggml_cuda_set_scratch(0);
1540
 
1541
  // used at the end to optionally extract the embeddings
1542
  struct ggml_tensor * embeddings = NULL;
1543
 
1544
- offload_func_t offload_func = llama_nop;
1545
-
1546
- #ifdef GGML_USE_CUBLAS
1547
- if (n_gpu_layers > n_layer) {
1548
- offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
1549
- }
1550
- #endif // GGML_USE_CUBLAS
1551
 
1552
  // norm
1553
  {
1554
  cur = ggml_rms_norm(ctx0, inpL);
1555
- offload_func(cur);
1556
  ggml_set_name(cur, "rms_norm_inpL");
1557
 
1558
  cur = ggml_rms_norm(ctx0, cur);
1559
- offload_func(cur);
1560
  ggml_set_name(cur, "rms_norm_after");
1561
 
1562
  // cur = cur*norm(broadcasted)
1563
  cur = ggml_mul(ctx0, cur, model.norm);
1564
- offload_func(cur);
1565
  ggml_set_name(cur, "result_norm");
1566
 
1567
  embeddings = cur;
@@ -2552,8 +2649,8 @@ struct llama_context * llama_init_from_file(
2552
 
2553
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
2554
 
2555
- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
2556
- params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
2557
  params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
2558
  fprintf(stderr, "%s: failed to load model\n", __func__);
2559
  llama_free(ctx);
@@ -2562,7 +2659,7 @@ struct llama_context * llama_init_from_file(
2562
 
2563
  // reserve memory for context buffers
2564
  if (!params.vocab_only) {
2565
- if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
2566
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
2567
  llama_free(ctx);
2568
  return nullptr;
 
40
  #include <sstream>
41
  #include <numeric>
42
 
43
+ #if defined(_MSC_VER)
44
+ #pragma warning(disable: 4244 4267) // possible loss of data
45
+ #endif
46
+
47
  #define LLAMA_USE_SCRATCH
48
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
49
 
 
169
  if (ctx) {
170
  ggml_free(ctx);
171
  }
172
+
173
+ #ifdef GGML_USE_CUBLAS
174
+ ggml_cuda_free_data(k);
175
+ ggml_cuda_free_data(v);
176
+ #endif // GGML_USE_CUBLAS
177
  }
178
  };
179
 
 
219
  for (size_t i = 0; i < tensors_by_name.size(); ++i) {
220
  ggml_cuda_free_data(tensors_by_name[i].second);
221
  }
222
+ ggml_cuda_free_scratch();
223
  #elif defined(GGML_USE_CLBLAST)
224
  for (size_t i = 0; i < tensors_by_name.size(); ++i) {
225
  ggml_cl_free_data(tensors_by_name[i].second);
 
877
  const struct llama_hparams & hparams,
878
  struct llama_kv_cache & cache,
879
  ggml_type wtype,
880
+ int n_ctx,
881
+ int n_gpu_layers) {
882
  const int n_embd = hparams.n_embd;
883
  const int n_layer = hparams.n_layer;
884
 
 
904
  ggml_set_name(cache.k, "cache_k");
905
  ggml_set_name(cache.v, "cache_v");
906
 
907
+ #ifdef GGML_USE_CUBLAS
908
+ if (n_gpu_layers > n_layer + 1) {
909
+ ggml_cuda_assign_buffers_no_scratch(cache.v);
910
+ }
911
+ if (n_gpu_layers > n_layer + 2) {
912
+ ggml_cuda_assign_buffers_no_scratch(cache.k);
913
+ }
914
+ #endif // GGML_USE_CUBLAS
915
+
916
  return true;
917
  }
918
 
 
923
  /*.gpu_layers =*/ 0,
924
  /*.main_gpu =*/ 0,
925
  /*.tensor_split =*/ {0},
926
+ /*.low_vram =*/ false,
927
  /*.seed =*/ -1,
928
  /*.f16_kv =*/ true,
929
  /*.logits_all =*/ false,
 
1032
  int n_gpu_layers,
1033
  int main_gpu,
1034
  const float * tensor_split,
1035
+ bool low_vram,
1036
  ggml_type memory_type,
1037
  bool use_mmap,
1038
  bool use_mlock,
 
1159
  ml->ggml_ctx = ctx;
1160
 
1161
  model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
1162
 
1163
  // "output" tensor
1164
  {
1165
+ ggml_backend backend_norm;
1166
  ggml_backend backend_output;
1167
  if (n_gpu_layers > int(n_layer)) { // NOLINT
1168
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
1169
+ // on Windows however this is detrimental unless everything is on the GPU
1170
+ #ifndef _WIN32
1171
+ backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
1172
+ #else
1173
+ backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
1174
+ #endif // _WIN32
1175
+
1176
  backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
1177
  } else {
1178
+ backend_norm = GGML_BACKEND_CPU;
1179
  backend_output = GGML_BACKEND_CPU;
1180
  }
1181
 
1182
+ model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm);
1183
  model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
1184
+ if (backend_norm == GGML_BACKEND_GPU) {
1185
+ vram_weights += ggml_nbytes(model.norm);
1186
+ }
1187
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
1188
+ vram_weights += ggml_nbytes(model.output);
1189
+ }
1190
  }
1191
 
1192
  const int i_gpu_start = n_layer - n_gpu_layers;
 
1246
  (void) vram_scratch;
1247
  (void) n_batch;
1248
  #ifdef GGML_USE_CUBLAS
1249
+ if (low_vram) {
1250
+ fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
1251
+ ggml_cuda_set_scratch_size(0); // disable scratch
1252
+ } else {
1253
+ vram_scratch = n_batch * MB;
1254
+ ggml_cuda_set_scratch_size(vram_scratch);
1255
+ if (n_gpu_layers > 0) {
1256
+ fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
1257
+ __func__, vram_scratch / MB);
1258
+ }
1259
  }
1260
  #endif // GGML_USE_CUBLAS
1261
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
1262
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
1263
 
1264
+ fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
1265
  if (n_gpu_layers > (int) hparams.n_layer) {
1266
+ fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
1267
+ }
1268
+ size_t vram_kv_cache = 0;
1269
+ if (n_gpu_layers > (int) hparams.n_layer + 1) {
1270
+ if (low_vram) {
1271
+ fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
1272
+ } else {
1273
+ fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
1274
+ vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
1275
+ }
1276
+ }
1277
+ if (n_gpu_layers > (int) hparams.n_layer + 2) {
1278
+ if (low_vram) {
1279
+ fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
1280
+ } else {
1281
+ fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
1282
+ vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
1283
+ }
1284
  }
1285
+ const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
1286
+ fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
1287
+ __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
1288
  fprintf(stderr, "%s: total VRAM used: %zu MB\n",
1289
+ __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
1290
  #else
1291
  (void) n_gpu_layers;
1292
  #endif
 
1325
  int n_gpu_layers,
1326
  int main_gpu,
1327
  float * tensor_split,
1328
+ bool low_vram,
1329
  ggml_type memory_type,
1330
  bool use_mmap,
1331
  bool use_mlock,
 
1333
  llama_progress_callback progress_callback,
1334
  void *progress_callback_user_data) {
1335
  try {
1336
+ llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
1337
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
1338
  return true;
1339
  } catch (const std::exception & err) {
 
1409
  const int i_gpu_start = n_layer - n_gpu_layers;
1410
  (void) i_gpu_start;
1411
 
1412
+ // offload functions set the tensor output backend to GPU
1413
+ // tensors are GPU-accelerated if any input or the output has been offloaded
1414
+ //
1415
+ // with the low VRAM option VRAM scratch is disabled in llama_model_load_internal
1416
+ // in that case ggml_cuda_assign_buffers has no effect
1417
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
1418
+ offload_func_t offload_func_kq = llama_nop;
1419
+ offload_func_t offload_func_v = llama_nop;
1420
+
1421
+ #ifdef GGML_USE_CUBLAS
1422
+ if (n_gpu_layers > n_layer) {
1423
+ offload_func_nr = ggml_cuda_assign_buffers;
1424
+ }
1425
+ if (n_gpu_layers > n_layer + 1) {
1426
+ offload_func_v = ggml_cuda_assign_buffers;
1427
+ }
1428
+ if (n_gpu_layers > n_layer + 2) {
1429
+ offload_func_kq = ggml_cuda_assign_buffers;
1430
+ }
1431
+ #endif // GGML_USE_CUBLAS
1432
+
1433
  for (int il = 0; il < n_layer; ++il) {
1434
  offload_func_t offload_func = llama_nop;
1435
 
1436
  #ifdef GGML_USE_CUBLAS
1437
  if (il >= i_gpu_start) {
1438
+ offload_func = ggml_cuda_assign_buffers;
1439
  }
1440
  #endif // GGML_USE_CUBLAS
1441
 
 
1458
  // self-attention
1459
  {
1460
  // compute Q and K and RoPE them
 
 
 
 
1461
  struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
1462
+ offload_func_kq(tmpk);
1463
  ggml_set_name(tmpk, "tmpk");
1464
 
1465
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
1466
+ offload_func_kq(tmpq);
1467
+ ggml_set_name(tmpq, "tmpq");
1468
+
1469
  struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
1470
+ offload_func_kq(Kcur);
1471
  ggml_set_name(Kcur, "Kcur");
1472
 
1473
  struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
1474
+ offload_func_kq(Qcur);
1475
  ggml_set_name(Qcur, "Qcur");
1476
 
1477
  // store key and value to memory
1478
  {
1479
  // compute the transposed [N, n_embd] V matrix
1480
+
1481
+ struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
1482
+ offload_func_v(tmpv);
1483
+ ggml_set_name(tmpv, "tmpv");
1484
+
1485
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
1486
+ offload_func_v(Vcur);
1487
  ggml_set_name(Vcur, "Vcur");
1488
 
1489
  struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
1490
+ offload_func_kq(k);
1491
  ggml_set_name(k, "k");
1492
+
1493
  struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
1494
  ( n_ctx)*ggml_element_size(kv_self.v),
1495
  (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
1496
+ offload_func_v(v);
1497
  ggml_set_name(v, "v");
1498
 
1499
  // important: storing RoPE-ed version of K in the KV cache!
 
1505
  ggml_permute(ctx0,
1506
  Qcur,
1507
  0, 2, 1, 3);
1508
+ offload_func_kq(Q);
1509
  ggml_set_name(Q, "Q");
1510
 
1511
  struct ggml_tensor * K =
 
1514
  ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
1515
  n_embd/n_head, n_head, n_past + N),
1516
  0, 2, 1, 3);
1517
+ offload_func_kq(K);
1518
  ggml_set_name(K, "K");
1519
 
1520
  // K * Q
1521
  struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1522
+ offload_func_kq(KQ);
1523
  ggml_set_name(KQ, "KQ");
1524
 
1525
  // KQ_scaled = KQ / sqrt(n_embd/n_head)
 
1528
 
1529
  // KQ_scaled shape [n_past + N, N, n_head, 1]
1530
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
1531
+ offload_func_kq(KQ_scaled);
1532
  ggml_set_name(KQ_scaled, "KQ_scaled");
1533
 
1534
  // KQ_masked = mask_past(KQ_scaled)
1535
  struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
1536
+ offload_func_kq(KQ_masked);
1537
  ggml_set_name(KQ_masked, "KQ_masked");
1538
 
1539
  // KQ = soft_max(KQ_masked)
1540
  struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
1541
+ offload_func_v(KQ_soft_max);
1542
  ggml_set_name(KQ_soft_max, "KQ_soft_max");
1543
 
1544
  // split cached V into n_head heads
 
1548
  n_ctx*ggml_element_size(kv_self.v),
1549
  n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
1550
  il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
1551
+ offload_func_v(V);
1552
  ggml_set_name(V, "V");
1553
 
1554
  #if 1
1555
  struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
1556
+ offload_func_v(KQV);
1557
  ggml_set_name(KQV, "KQV");
1558
  #else
1559
  // make V contiguous in memory to speed up the matmul, however we waste time on the copy
 
1565
 
1566
  // KQV_merged = KQV.permute(0, 2, 1, 3)
1567
  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
1568
+ offload_func_v(KQV_merged);
1569
  ggml_set_name(KQV_merged, "KQV_merged");
1570
 
1571
  // cur = KQV_merged.contiguous().view(n_embd, N)
1572
  cur = ggml_cpy(ctx0,
1573
  KQV_merged,
1574
  ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
1575
+ offload_func_v(cur);
1576
  ggml_set_name(cur, "KQV_merged_contiguous");
1577
 
1578
  // projection (no bias)
 
1584
  }
1585
 
1586
  lctx.use_buf(ctx0, 1);
 
1587
 
1588
  struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
1589
  offload_func(inpFF);
 
1641
  }
1642
 
1643
  lctx.use_buf(ctx0, 0);
 
1644
 
1645
  // used at the end to optionally extract the embeddings
1646
  struct ggml_tensor * embeddings = NULL;
1647
 
 
 
 
 
 
 
 
1648
 
1649
  // norm
1650
  {
1651
  cur = ggml_rms_norm(ctx0, inpL);
1652
+ offload_func_nr(cur);
1653
  ggml_set_name(cur, "rms_norm_inpL");
1654
 
1655
  cur = ggml_rms_norm(ctx0, cur);
1656
+ offload_func_nr(cur);
1657
  ggml_set_name(cur, "rms_norm_after");
1658
 
1659
  // cur = cur*norm(broadcasted)
1660
  cur = ggml_mul(ctx0, cur, model.norm);
1661
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
1662
  ggml_set_name(cur, "result_norm");
1663
 
1664
  embeddings = cur;
 
2649
 
2650
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
2651
 
2652
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
2653
+ params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
2654
  params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
2655
  fprintf(stderr, "%s: failed to load model\n", __func__);
2656
  llama_free(ctx);
 
2659
 
2660
  // reserve memory for context buffers
2661
  if (!params.vocab_only) {
2662
+ if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
2663
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
2664
  llama_free(ctx);
2665
  return nullptr;
llama.h CHANGED
@@ -77,6 +77,7 @@ extern "C" {
77
  int n_gpu_layers; // number of layers to store in VRAM
78
  int main_gpu; // the GPU that is used for scratch and small tensors
79
  float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
 
80
  int seed; // RNG seed, -1 for random
81
 
82
  bool f16_kv; // use fp16 for KV cache
@@ -243,9 +244,9 @@ extern "C" {
243
  LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
244
 
245
  // Special tokens
246
- LLAMA_API llama_token llama_token_bos();
247
- LLAMA_API llama_token llama_token_eos();
248
- LLAMA_API llama_token llama_token_nl();
249
 
250
  // Sampling functions
251
 
 
77
  int n_gpu_layers; // number of layers to store in VRAM
78
  int main_gpu; // the GPU that is used for scratch and small tensors
79
  float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
80
+ bool low_vram; // if true, reduce VRAM usage at the cost of performance
81
  int seed; // RNG seed, -1 for random
82
 
83
  bool f16_kv; // use fp16 for KV cache
 
244
  LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
245
 
246
  // Special tokens
247
+ LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
248
+ LLAMA_API llama_token llama_token_eos(); // end-of-sentence
249
+ LLAMA_API llama_token llama_token_nl(); // next-line
250
 
251
  // Sampling functions
252
 
spm-headers/ggml.h ADDED
@@ -0,0 +1,1319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ //
4
+ // GGML Tensor Library
5
+ //
6
+ // This documentation is still a work in progress.
7
+ // If you would like specific topics to be covered, feel free to drop a comment:
8
+ //
9
+ // https://github.com/ggerganov/whisper.cpp/issues/40
10
+ //
11
+ // ## Overview
12
+ //
13
+ // This library implements:
14
+ //
15
+ // - a set of tensor operations
16
+ // - automatic differentiation
17
+ // - basic optimization algorithms
18
+ //
19
+ // The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
20
+ // but is not limited to, the following:
21
+ //
22
+ // - linear regression
23
+ // - support vector machines
24
+ // - neural networks
25
+ //
26
+ // The library allows the user to define a certain function using the available tensor operations. This function
27
+ // definition is represented internally via a computation graph. Each tensor operation in the function definition
28
+ // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
29
+ // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
30
+ // using one of the available optimization algorithms.
31
+ //
32
+ // For example, here we define the function: f(x) = a*x^2 + b
33
+ //
34
+ // {
35
+ // struct ggml_init_params params = {
36
+ // .mem_size = 16*1024*1024,
37
+ // .mem_buffer = NULL,
38
+ // };
39
+ //
40
+ // // memory allocation happens here
41
+ // struct ggml_context * ctx = ggml_init(params);
42
+ //
43
+ // struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
44
+ //
45
+ // ggml_set_param(ctx, x); // x is an input variable
46
+ //
47
+ // struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
48
+ // struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
49
+ // struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
50
+ // struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
51
+ //
52
+ // ...
53
+ // }
54
+ //
55
+ // Notice that the function definition above does not involve any actual computation. The computation is performed only
56
+ // when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
57
+ //
58
+ // {
59
+ // ...
60
+ //
61
+ // struct ggml_cgraph gf = ggml_build_forward(f);
62
+ //
63
+ // // set the input variable and parameter values
64
+ // ggml_set_f32(x, 2.0f);
65
+ // ggml_set_f32(a, 3.0f);
66
+ // ggml_set_f32(b, 4.0f);
67
+ //
68
+ // ggml_graph_compute(ctx, &gf);
69
+ //
70
+ // printf("f = %f\n", ggml_get_f32_1d(f, 0));
71
+ //
72
+ // ...
73
+ // }
74
+ //
75
+ // The actual computation is performed in the ggml_graph_compute() function.
76
+ //
77
+ // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
78
+ // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
79
+ // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
80
+ // and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
81
+ // actually needed.
82
+ //
83
+ // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
84
+ // differentiation and optimization algorithms.
85
+ //
86
+ // The described approach allows the user to define the function graph once and then compute its forward or backward graphs
87
+ // multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
88
+ // the user can avoid the memory allocation overhead at runtime.
89
+ //
90
+ // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
91
+ // citizens, but in theory the library can be extended to support FP8 and integer data types.
92
+ //
93
+ // Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
94
+ // and binary operations. Most of the available operations fall into one of these two categories. With time, it became
95
+ // clear that the library needs to support more complex operations. The way to support these operations is not clear
96
+ // yet, but a few examples are demonstrated in the following operations:
97
+ //
98
+ // - ggml_permute()
99
+ // - ggml_conv_1d_1s()
100
+ // - ggml_conv_1d_2s()
101
+ //
102
+ // For each tensor operator, the library implements a forward and backward computation function. The forward function
103
+ // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
104
+ // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
105
+ // calculus class, or watch the following video:
106
+ //
107
+ // What is Automatic Differentiation?
108
+ // https://www.youtube.com/watch?v=wG_nF1awSSY
109
+ //
110
+ //
111
+ // ## Tensor data (struct ggml_tensor)
112
+ //
113
+ // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
114
+ // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
115
+ // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
116
+ //
117
+ // {
118
+ // struct ggml_tensor * c = ggml_add(ctx, a, b);
119
+ //
120
+ // assert(c->src[0] == a);
121
+ // assert(c->src[1] == b);
122
+ // }
123
+ //
124
+ // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
125
+ // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This makes it possible
126
+ // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
127
+ // permutation. All tensor operations have to take the stride into account and not assume that the tensor is
128
+ // contiguous in memory.
129
+ //
130
+ // The data of the tensor is accessed via the "data" pointer. For example:
131
+ //
132
+ // {
133
+ // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
134
+ //
135
+ // // a[1, 2] = 1.0f;
136
+ // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
137
+ //
138
+ // // a[0, 2] = 2.0f;
139
+ // *(float *) ((char *) a->data + 2*a->nb[1] + 0*a->nb[0]) = 2.0f;
140
+ //
141
+ // ...
142
+ // }
143
+ //
144
+ // Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
145
+ //
146
+ // ## The matrix multiplication operator (ggml_mul_mat)
147
+ //
148
+ // TODO
149
+ //
150
+ //
151
+ // ## Multi-threading
152
+ //
153
+ // TODO
154
+ //
155
+ //
156
+ // ## Overview of ggml.c
157
+ //
158
+ // TODO
159
+ //
160
+ //
161
+ // ## SIMD optimizations
162
+ //
163
+ // TODO
164
+ //
165
+ //
166
+ // ## Debugging ggml
167
+ //
168
+ // TODO
169
+ //
170
+ //
171
+
172
+ #ifdef GGML_SHARED
173
+ # if defined(_WIN32) && !defined(__MINGW32__)
174
+ # ifdef GGML_BUILD
175
+ # define GGML_API __declspec(dllexport)
176
+ # else
177
+ # define GGML_API __declspec(dllimport)
178
+ # endif
179
+ # else
180
+ # define GGML_API __attribute__ ((visibility ("default")))
181
+ # endif
182
+ #else
183
+ # define GGML_API
184
+ #endif
185
+
186
+ #include <stdint.h>
187
+ #include <stddef.h>
188
+ #include <stdbool.h>
189
+
190
+ #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
191
+ #define GGML_FILE_VERSION 1
192
+
193
+ #define GGML_QNT_VERSION 2 // bump this on quantization format changes
194
+ #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
195
+
196
+ #define GGML_MAX_DIMS 4
197
+ #define GGML_MAX_NODES 4096
198
+ #define GGML_MAX_PARAMS 256
199
+ #define GGML_MAX_CONTEXTS 64
200
+ #define GGML_MAX_OPT 4
201
+ #define GGML_MAX_NAME 32
202
+ #define GGML_DEFAULT_N_THREADS 4
203
+
204
+ #define GGML_ASSERT(x) \
205
+ do { \
206
+ if (!(x)) { \
207
+ fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
208
+ abort(); \
209
+ } \
210
+ } while (0) // on failure: prints file:line and the failed expression to stderr, then aborts; expands to fprintf/abort, so <stdio.h> and <stdlib.h> must be in scope where the macro is used
211
+
212
+ #ifdef __cplusplus
213
+ extern "C" {
214
+ #endif
215
+
216
+ #ifdef __ARM_NEON
217
+ // we use the built-in 16-bit float type
218
+ typedef __fp16 ggml_fp16_t;
219
+ #else
220
+ typedef uint16_t ggml_fp16_t;
221
+ #endif
222
+
223
+ // convert FP16 <-> FP32
224
+ GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
225
+ GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
226
+
227
+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
228
+ GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
229
+
230
+ struct ggml_object;
231
+ struct ggml_context;
232
+
233
+ enum ggml_type { // type ids stored in ggml_tensor.type; ids 4 and 5 are intentionally unused (see below)
234
+ GGML_TYPE_F32 = 0,
235
+ GGML_TYPE_F16 = 1,
236
+ GGML_TYPE_Q4_0 = 2,
237
+ GGML_TYPE_Q4_1 = 3,
238
+ // GGML_TYPE_Q4_2 = 4, support has been removed
239
+ // GGML_TYPE_Q4_3 (5) support has been removed
240
+ GGML_TYPE_Q5_0 = 6,
241
+ GGML_TYPE_Q5_1 = 7,
242
+ GGML_TYPE_Q8_0 = 8,
243
+ GGML_TYPE_Q8_1 = 9,
244
+ // k-quantizations
245
+ GGML_TYPE_Q2_K = 10,
246
+ GGML_TYPE_Q3_K = 11,
247
+ GGML_TYPE_Q4_K = 12,
248
+ GGML_TYPE_Q5_K = 13,
249
+ GGML_TYPE_Q6_K = 14,
250
+ GGML_TYPE_Q8_K = 15,
251
+ GGML_TYPE_I8, // = 16 (implicit: previous enumerator + 1)
252
+ GGML_TYPE_I16, // = 17 (implicit)
253
+ GGML_TYPE_I32, // = 18 (implicit)
254
+ GGML_TYPE_COUNT, // = 19 (implicit); one past the highest id, not the number of usable types since ids 4 and 5 are unused
255
+ };
256
+
257
+ enum ggml_backend {
258
+ GGML_BACKEND_CPU = 0,
259
+ GGML_BACKEND_GPU = 10,
260
+ GGML_BACKEND_GPU_SPLIT = 20,
261
+ };
262
+
263
+ // model file types
264
+ enum ggml_ftype {
265
+ GGML_FTYPE_UNKNOWN = -1,
266
+ GGML_FTYPE_ALL_F32 = 0,
267
+ GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
268
+ GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
269
+ GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
270
+ GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
271
+ GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
272
+ GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
273
+ GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
274
+ GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
275
+ GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
276
+ GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
277
+ GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
278
+ GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
279
+ };
280
+
281
+ // available tensor operations:
282
+ enum ggml_op {
283
+ GGML_OP_NONE = 0,
284
+
285
+ GGML_OP_DUP,
286
+ GGML_OP_ADD,
287
+ GGML_OP_ADD1,
288
+ GGML_OP_ACC,
289
+ GGML_OP_SUB,
290
+ GGML_OP_MUL,
291
+ GGML_OP_DIV,
292
+ GGML_OP_SQR,
293
+ GGML_OP_SQRT,
294
+ GGML_OP_LOG,
295
+ GGML_OP_SUM,
296
+ GGML_OP_SUM_ROWS,
297
+ GGML_OP_MEAN,
298
+ GGML_OP_REPEAT,
299
+ GGML_OP_REPEAT_BACK,
300
+ GGML_OP_ABS,
301
+ GGML_OP_SGN,
302
+ GGML_OP_NEG,
303
+ GGML_OP_STEP,
304
+ GGML_OP_RELU,
305
+ GGML_OP_GELU,
306
+ GGML_OP_SILU,
307
+ GGML_OP_SILU_BACK,
308
+ GGML_OP_NORM, // normalize
309
+ GGML_OP_RMS_NORM,
310
+ GGML_OP_RMS_NORM_BACK,
311
+
312
+ GGML_OP_MUL_MAT,
313
+ GGML_OP_OUT_PROD,
314
+
315
+ GGML_OP_SCALE,
316
+ GGML_OP_SET,
317
+ GGML_OP_CPY,
318
+ GGML_OP_CONT,
319
+ GGML_OP_RESHAPE,
320
+ GGML_OP_VIEW,
321
+ GGML_OP_PERMUTE,
322
+ GGML_OP_TRANSPOSE,
323
+ GGML_OP_GET_ROWS,
324
+ GGML_OP_GET_ROWS_BACK,
325
+ GGML_OP_DIAG,
326
+ GGML_OP_DIAG_MASK_INF,
327
+ GGML_OP_DIAG_MASK_ZERO,
328
+ GGML_OP_SOFT_MAX,
329
+ GGML_OP_SOFT_MAX_BACK,
330
+ GGML_OP_ROPE,
331
+ GGML_OP_ROPE_BACK,
332
+ GGML_OP_ALIBI,
333
+ GGML_OP_CLAMP,
334
+ GGML_OP_CONV_1D_1S,
335
+ GGML_OP_CONV_1D_2S,
336
+
337
+ GGML_OP_FLASH_ATTN,
338
+ GGML_OP_FLASH_FF,
339
+ GGML_OP_FLASH_ATTN_BACK,
340
+
341
+ GGML_OP_MAP_UNARY,
342
+ GGML_OP_MAP_BINARY,
343
+
344
+ GGML_OP_CROSS_ENTROPY_LOSS,
345
+ GGML_OP_CROSS_ENTROPY_LOSS_BACK,
346
+
347
+ GGML_OP_COUNT,
348
+ };
349
+
350
+
351
+ // ggml object
352
+ struct ggml_object {
353
+ size_t offs;
354
+ size_t size;
355
+
356
+ struct ggml_object * next;
357
+
358
+ char padding[8];
359
+ };
360
+
361
+ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
362
+
363
+ // n-dimensional tensor
364
+ struct ggml_tensor {
365
+ enum ggml_type type;
366
+ enum ggml_backend backend;
367
+
368
+ int n_dims;
369
+ int64_t ne[GGML_MAX_DIMS]; // number of elements
370
+ size_t nb[GGML_MAX_DIMS]; // stride in bytes:
371
+ // nb[0] = sizeof(type)
372
+ // nb[1] = nb[0] * ne[0] + padding
373
+ // nb[i] = nb[i-1] * ne[i-1]
374
+
375
+ // compute data
376
+ enum ggml_op op;
377
+
378
+ bool is_param;
379
+
380
+ struct ggml_tensor * grad;
381
+ struct ggml_tensor * src0;
382
+ struct ggml_tensor * src1;
383
+ struct ggml_tensor * opt[GGML_MAX_OPT];
384
+
385
+ // thread scheduling
386
+ int n_tasks;
387
+
388
+ // performance
389
+ int perf_runs;
390
+ int64_t perf_cycles;
391
+ int64_t perf_time_us;
392
+
393
+ void * data;
394
+
395
+ char name[GGML_MAX_NAME];
396
+
397
+ void * extra; // extra things e.g. for ggml-cuda.cu
398
+
399
+ char padding[4];
400
+ };
401
+
402
+ static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
403
+
404
+ // computation graph
405
+ struct ggml_cgraph {
406
+ int n_nodes;
407
+ int n_leafs;
408
+ int n_threads;
409
+
410
+ size_t work_size;
411
+ struct ggml_tensor * work;
412
+
413
+ struct ggml_tensor * nodes[GGML_MAX_NODES];
414
+ struct ggml_tensor * grads[GGML_MAX_NODES];
415
+ struct ggml_tensor * leafs[GGML_MAX_NODES];
416
+
417
+ // performance
418
+ int perf_runs;
419
+ int64_t perf_cycles;
420
+ int64_t perf_time_us;
421
+ };
422
+
423
+ // scratch buffer
424
+ struct ggml_scratch {
425
+ size_t offs;
426
+ size_t size;
427
+ void * data;
428
+ };
429
+
430
+ struct ggml_init_params {
431
+ // memory pool
432
+ size_t mem_size; // bytes
433
+ void * mem_buffer; // if NULL, memory will be allocated internally
434
+ bool no_alloc; // don't allocate memory for the tensor data
435
+ };
436
+
437
+
438
+ // compute types
439
+ enum ggml_task_type {
440
+ GGML_TASK_INIT = 0,
441
+ GGML_TASK_COMPUTE,
442
+ GGML_TASK_FINALIZE,
443
+ };
444
+
445
+ struct ggml_compute_params {
446
+ enum ggml_task_type type;
447
+
448
+ // ith = thread index, nth = number of threads
449
+ int ith, nth;
450
+
451
+ // work buffer for all threads
452
+ size_t wsize;
453
+ void * wdata;
454
+ };
455
+
456
+ // misc
457
+
458
+ GGML_API void ggml_time_init(void); // call this once at the beginning of the program
459
+ GGML_API int64_t ggml_time_ms(void);
460
+ GGML_API int64_t ggml_time_us(void);
461
+ GGML_API int64_t ggml_cycles(void);
462
+ GGML_API int64_t ggml_cycles_per_ms(void);
463
+
464
+ GGML_API void ggml_print_object (const struct ggml_object * obj);
465
+ GGML_API void ggml_print_objects(const struct ggml_context * ctx);
466
+
467
+ GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
468
+ GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
469
+ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
470
+ GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
471
+
472
+ GGML_API int ggml_blck_size (enum ggml_type type);
473
+ GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
474
+ GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
475
+
476
+ GGML_API const char * ggml_type_name(enum ggml_type type);
477
+ GGML_API const char * ggml_op_name (enum ggml_op op);
478
+
479
+ GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
480
+
481
+ GGML_API bool ggml_is_quantized(enum ggml_type type);
482
+
483
+ // TODO: temporary until model loading of ggml examples is refactored
484
+ GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
485
+
486
+ GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
487
+ GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
488
+ GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
489
+
490
+ // use this to compute the memory overhead of a tensor
491
+ GGML_API size_t ggml_tensor_overhead(void);
492
+
493
+ // main
494
+
495
+ GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
496
+ GGML_API void ggml_free(struct ggml_context * ctx);
497
+
498
+ GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
499
+
500
+ GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
501
+ GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
502
+
503
+ GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
504
+ GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
505
+
506
+ GGML_API struct ggml_tensor * ggml_new_tensor(
507
+ struct ggml_context * ctx,
508
+ enum ggml_type type,
509
+ int n_dims,
510
+ const int64_t *ne);
511
+
512
+ GGML_API struct ggml_tensor * ggml_new_tensor_1d(
513
+ struct ggml_context * ctx,
514
+ enum ggml_type type,
515
+ int64_t ne0);
516
+
517
+ GGML_API struct ggml_tensor * ggml_new_tensor_2d(
518
+ struct ggml_context * ctx,
519
+ enum ggml_type type,
520
+ int64_t ne0,
521
+ int64_t ne1);
522
+
523
+ GGML_API struct ggml_tensor * ggml_new_tensor_3d(
524
+ struct ggml_context * ctx,
525
+ enum ggml_type type,
526
+ int64_t ne0,
527
+ int64_t ne1,
528
+ int64_t ne2);
529
+
530
+ GGML_API struct ggml_tensor * ggml_new_tensor_4d(
531
+ struct ggml_context * ctx,
532
+ enum ggml_type type,
533
+ int64_t ne0,
534
+ int64_t ne1,
535
+ int64_t ne2,
536
+ int64_t ne3);
537
+
538
+ GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
539
+ GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
540
+
541
+ GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
542
+ GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
543
+
544
+ GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
545
+
546
+ GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
547
+ GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
548
+ GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
549
+
550
+ GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
551
+ GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
552
+
553
+ GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
554
+ GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
555
+
556
+ GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
557
+ GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
558
+
559
+ GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
560
+ GGML_API void ggml_set_name(struct ggml_tensor * tensor, const char * name);
561
+
562
+ //
563
+ // operations on tensors with backpropagation
564
+ //
565
+
566
+ GGML_API struct ggml_tensor * ggml_dup(
567
+ struct ggml_context * ctx,
568
+ struct ggml_tensor * a);
569
+
570
+ GGML_API struct ggml_tensor * ggml_add(
571
+ struct ggml_context * ctx,
572
+ struct ggml_tensor * a,
573
+ struct ggml_tensor * b);
574
+
575
+ GGML_API struct ggml_tensor * ggml_add_inplace(
576
+ struct ggml_context * ctx,
577
+ struct ggml_tensor * a,
578
+ struct ggml_tensor * b);
579
+
580
+ GGML_API struct ggml_tensor * ggml_add1(
581
+ struct ggml_context * ctx,
582
+ struct ggml_tensor * a,
583
+ struct ggml_tensor * b);
584
+
585
+ GGML_API struct ggml_tensor * ggml_add1_inplace(
586
+ struct ggml_context * ctx,
587
+ struct ggml_tensor * a,
588
+ struct ggml_tensor * b);
589
+
590
+ GGML_API struct ggml_tensor * ggml_acc(
591
+ struct ggml_context * ctx,
592
+ struct ggml_tensor * a,
593
+ struct ggml_tensor * b,
594
+ size_t nb1,
595
+ size_t nb2,
596
+ size_t nb3,
597
+ size_t offset);
598
+
599
+ GGML_API struct ggml_tensor * ggml_acc_inplace(
600
+ struct ggml_context * ctx,
601
+ struct ggml_tensor * a,
602
+ struct ggml_tensor * b,
603
+ size_t nb1,
604
+ size_t nb2,
605
+ size_t nb3,
606
+ size_t offset);
607
+
608
+ GGML_API struct ggml_tensor * ggml_sub(
609
+ struct ggml_context * ctx,
610
+ struct ggml_tensor * a,
611
+ struct ggml_tensor * b);
612
+
613
+ GGML_API struct ggml_tensor * ggml_mul(
614
+ struct ggml_context * ctx,
615
+ struct ggml_tensor * a,
616
+ struct ggml_tensor * b);
617
+
618
+ GGML_API struct ggml_tensor * ggml_div(
619
+ struct ggml_context * ctx,
620
+ struct ggml_tensor * a,
621
+ struct ggml_tensor * b);
622
+
623
+ GGML_API struct ggml_tensor * ggml_sqr(
624
+ struct ggml_context * ctx,
625
+ struct ggml_tensor * a);
626
+
627
+ GGML_API struct ggml_tensor * ggml_sqrt(
628
+ struct ggml_context * ctx,
629
+ struct ggml_tensor * a);
630
+
631
+ GGML_API struct ggml_tensor * ggml_log(
632
+ struct ggml_context * ctx,
633
+ struct ggml_tensor * a);
634
+
635
+ GGML_API struct ggml_tensor * ggml_log_inplace(
636
+ struct ggml_context * ctx,
637
+ struct ggml_tensor * a);
638
+
639
+ // return scalar
640
+ GGML_API struct ggml_tensor * ggml_sum(
641
+ struct ggml_context * ctx,
642
+ struct ggml_tensor * a);
643
+
644
+ // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
645
+ GGML_API struct ggml_tensor * ggml_sum_rows(
646
+ struct ggml_context * ctx,
647
+ struct ggml_tensor * a);
648
+
649
+ // mean along rows
650
+ GGML_API struct ggml_tensor * ggml_mean(
651
+ struct ggml_context * ctx,
652
+ struct ggml_tensor * a);
653
+
654
+ // if a is the same shape as b, and a is not a parameter, return a
655
+ // otherwise, return a new tensor: repeat(a) to fit in b
656
+ GGML_API struct ggml_tensor * ggml_repeat(
657
+ struct ggml_context * ctx,
658
+ struct ggml_tensor * a,
659
+ struct ggml_tensor * b);
660
+
661
+ GGML_API struct ggml_tensor * ggml_repeat_back(
662
+ struct ggml_context * ctx,
663
+ struct ggml_tensor * a,
664
+ struct ggml_tensor * b);
665
+
666
+ GGML_API struct ggml_tensor * ggml_abs(
667
+ struct ggml_context * ctx,
668
+ struct ggml_tensor * a);
669
+
670
+ GGML_API struct ggml_tensor * ggml_sgn(
671
+ struct ggml_context * ctx,
672
+ struct ggml_tensor * a);
673
+
674
+ GGML_API struct ggml_tensor * ggml_neg(
675
+ struct ggml_context * ctx,
676
+ struct ggml_tensor * a);
677
+
678
+ GGML_API struct ggml_tensor * ggml_step(
679
+ struct ggml_context * ctx,
680
+ struct ggml_tensor * a);
681
+
682
+ GGML_API struct ggml_tensor * ggml_relu(
683
+ struct ggml_context * ctx,
684
+ struct ggml_tensor * a);
685
+
686
+ // TODO: double-check this computation is correct
687
+ GGML_API struct ggml_tensor * ggml_gelu(
688
+ struct ggml_context * ctx,
689
+ struct ggml_tensor * a);
690
+
691
+ GGML_API struct ggml_tensor * ggml_silu(
692
+ struct ggml_context * ctx,
693
+ struct ggml_tensor * a);
694
+
695
+ // a - x
696
+ // b - dy
697
+ GGML_API struct ggml_tensor * ggml_silu_back(
698
+ struct ggml_context * ctx,
699
+ struct ggml_tensor * a,
700
+ struct ggml_tensor * b);
701
+
702
+ // normalize along rows
703
+ // TODO: eps is hardcoded to 1e-5 for now
704
+ GGML_API struct ggml_tensor * ggml_norm(
705
+ struct ggml_context * ctx,
706
+ struct ggml_tensor * a);
707
+
708
+ GGML_API struct ggml_tensor * ggml_rms_norm(
709
+ struct ggml_context * ctx,
710
+ struct ggml_tensor * a);
711
+
712
+ // a - x
713
+ // b - dy
714
+ GGML_API struct ggml_tensor * ggml_rms_norm_back(
715
+ struct ggml_context * ctx,
716
+ struct ggml_tensor * a,
717
+ struct ggml_tensor * b);
718
+
719
+ // A: n columns, m rows
720
+ // B: n columns, p rows (i.e. we transpose it internally)
721
+ // result is m columns, p rows
722
+ GGML_API struct ggml_tensor * ggml_mul_mat(
723
+ struct ggml_context * ctx,
724
+ struct ggml_tensor * a,
725
+ struct ggml_tensor * b);
726
+
727
+ // A: m columns, n rows,
728
+ // B: p columns, n rows,
729
+ // result is m columns, p rows
730
+ GGML_API struct ggml_tensor * ggml_out_prod(
731
+ struct ggml_context * ctx,
732
+ struct ggml_tensor * a,
733
+ struct ggml_tensor * b);
734
+
735
+ //
736
+ // operations on tensors without backpropagation
737
+ //
738
+
739
+ GGML_API struct ggml_tensor * ggml_scale(
740
+ struct ggml_context * ctx,
741
+ struct ggml_tensor * a,
742
+ struct ggml_tensor * b);
743
+
744
+ // in-place, returns view(a)
745
+ GGML_API struct ggml_tensor * ggml_scale_inplace(
746
+ struct ggml_context * ctx,
747
+ struct ggml_tensor * a,
748
+ struct ggml_tensor * b);
749
+
750
+ // b -> view(a,offset,nb1,nb2,nb3), return modified a
751
+ GGML_API struct ggml_tensor * ggml_set(
752
+ struct ggml_context * ctx,
753
+ struct ggml_tensor * a,
754
+ struct ggml_tensor * b,
755
+ size_t nb1,
756
+ size_t nb2,
757
+ size_t nb3,
758
+ size_t offset);
759
+
760
+ // b -> view(a,offset,nb1,nb2,nb3), return view(a)
761
+ GGML_API struct ggml_tensor * ggml_set_inplace(
762
+ struct ggml_context * ctx,
763
+ struct ggml_tensor * a,
764
+ struct ggml_tensor * b,
765
+ size_t nb1,
766
+ size_t nb2,
767
+ size_t nb3,
768
+ size_t offset);
769
+
770
+ GGML_API struct ggml_tensor * ggml_set_1d(
771
+ struct ggml_context * ctx,
772
+ struct ggml_tensor * a,
773
+ struct ggml_tensor * b,
774
+ size_t offset);
775
+
776
+ GGML_API struct ggml_tensor * ggml_set_1d_inplace(
777
+ struct ggml_context * ctx,
778
+ struct ggml_tensor * a,
779
+ struct ggml_tensor * b,
780
+ size_t offset);
781
+
782
+ // b -> view(a,offset,nb1,nb2,nb3), return modified a
783
+ GGML_API struct ggml_tensor * ggml_set_2d(
784
+ struct ggml_context * ctx,
785
+ struct ggml_tensor * a,
786
+ struct ggml_tensor * b,
787
+ size_t nb1,
788
+ size_t offset);
789
+
790
+ // b -> view(a,offset,nb1,nb2,nb3), return view(a)
791
+ GGML_API struct ggml_tensor * ggml_set_2d_inplace(
792
+ struct ggml_context * ctx,
793
+ struct ggml_tensor * a,
794
+ struct ggml_tensor * b,
795
+ size_t nb1,
796
+ size_t offset);
797
+
798
+
799
+ // a -> b, return view(b)
800
+ GGML_API struct ggml_tensor * ggml_cpy(
801
+ struct ggml_context * ctx,
802
+ struct ggml_tensor * a,
803
+ struct ggml_tensor * b);
804
+
805
+ // make contiguous
806
+ GGML_API struct ggml_tensor * ggml_cont(
807
+ struct ggml_context * ctx,
808
+ struct ggml_tensor * a);
809
+
810
+ // return view(a), b specifies the new shape
811
+ // TODO: when we start computing gradient, make a copy instead of view
812
+ GGML_API struct ggml_tensor * ggml_reshape(
813
+ struct ggml_context * ctx,
814
+ struct ggml_tensor * a,
815
+ struct ggml_tensor * b);
816
+
817
+ // return view(a)
818
+ // TODO: when we start computing gradient, make a copy instead of view
819
+ GGML_API struct ggml_tensor * ggml_reshape_1d(
820
+ struct ggml_context * ctx,
821
+ struct ggml_tensor * a,
822
+ int64_t ne0);
823
+
824
+ GGML_API struct ggml_tensor * ggml_reshape_2d(
825
+ struct ggml_context * ctx,
826
+ struct ggml_tensor * a,
827
+ int64_t ne0,
828
+ int64_t ne1);
829
+
830
+ // return view(a)
831
+ // TODO: when we start computing gradient, make a copy instead of view
832
+ GGML_API struct ggml_tensor * ggml_reshape_3d(
833
+ struct ggml_context * ctx,
834
+ struct ggml_tensor * a,
835
+ int64_t ne0,
836
+ int64_t ne1,
837
+ int64_t ne2);
838
+
839
+ GGML_API struct ggml_tensor * ggml_reshape_4d(
840
+ struct ggml_context * ctx,
841
+ struct ggml_tensor * a,
842
+ int64_t ne0,
843
+ int64_t ne1,
844
+ int64_t ne2,
845
+ int64_t ne3);
846
+
847
+ // offset in bytes
848
+ GGML_API struct ggml_tensor * ggml_view_1d(
849
+ struct ggml_context * ctx,
850
+ struct ggml_tensor * a,
851
+ int64_t ne0,
852
+ size_t offset);
853
+
854
+ GGML_API struct ggml_tensor * ggml_view_2d(
855
+ struct ggml_context * ctx,
856
+ struct ggml_tensor * a,
857
+ int64_t ne0,
858
+ int64_t ne1,
859
+ size_t nb1, // row stride in bytes
860
+ size_t offset);
861
+
862
+ GGML_API struct ggml_tensor * ggml_view_3d(
863
+ struct ggml_context * ctx,
864
+ struct ggml_tensor * a,
865
+ int64_t ne0,
866
+ int64_t ne1,
867
+ int64_t ne2,
868
+ size_t nb1, // row stride in bytes
869
+ size_t nb2, // slice stride in bytes
870
+ size_t offset);
871
+
872
+ GGML_API struct ggml_tensor * ggml_view_4d(
873
+ struct ggml_context * ctx,
874
+ struct ggml_tensor * a,
875
+ int64_t ne0,
876
+ int64_t ne1,
877
+ int64_t ne2,
878
+ int64_t ne3,
879
+ size_t nb1, // row stride in bytes
880
+ size_t nb2, // slice stride in bytes
881
+ size_t nb3,
882
+ size_t offset);
883
+
884
+ GGML_API struct ggml_tensor * ggml_permute(
885
+ struct ggml_context * ctx,
886
+ struct ggml_tensor * a,
887
+ int axis0,
888
+ int axis1,
889
+ int axis2,
890
+ int axis3);
891
+
892
+ // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
893
+ GGML_API struct ggml_tensor * ggml_transpose(
894
+ struct ggml_context * ctx,
895
+ struct ggml_tensor * a);
896
+
897
+ GGML_API struct ggml_tensor * ggml_get_rows(
898
+ struct ggml_context * ctx,
899
+ struct ggml_tensor * a,
900
+ struct ggml_tensor * b);
901
+
902
+ GGML_API struct ggml_tensor * ggml_get_rows_back(
903
+ struct ggml_context * ctx,
904
+ struct ggml_tensor * a,
905
+ struct ggml_tensor * b,
906
+ struct ggml_tensor * c);
907
+
908
+ GGML_API struct ggml_tensor * ggml_diag(
909
+ struct ggml_context * ctx,
910
+ struct ggml_tensor * a);
911
+
912
+ // set elements above the diagonal to -INF
913
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf(
914
+ struct ggml_context * ctx,
915
+ struct ggml_tensor * a,
916
+ int n_past);
917
+
918
+ // in-place, returns view(a)
919
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
920
+ struct ggml_context * ctx,
921
+ struct ggml_tensor * a,
922
+ int n_past);
923
+
924
+ // set elements above the diagonal to 0
925
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero(
926
+ struct ggml_context * ctx,
927
+ struct ggml_tensor * a,
928
+ int n_past);
929
+
930
+ // in-place, returns view(a)
931
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
932
+ struct ggml_context * ctx,
933
+ struct ggml_tensor * a,
934
+ int n_past);
935
+
936
+ GGML_API struct ggml_tensor * ggml_soft_max(
937
+ struct ggml_context * ctx,
938
+ struct ggml_tensor * a);
939
+
940
+ // in-place, returns view(a)
941
+ GGML_API struct ggml_tensor * ggml_soft_max_inplace(
942
+ struct ggml_context * ctx,
943
+ struct ggml_tensor * a);
944
+
945
+ GGML_API struct ggml_tensor * ggml_soft_max_back(
946
+ struct ggml_context * ctx,
947
+ struct ggml_tensor * a,
948
+ struct ggml_tensor * b);
949
+
950
+ // in-place, returns view(a)
951
+ GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
952
+ struct ggml_context * ctx,
953
+ struct ggml_tensor * a,
954
+ struct ggml_tensor * b);
955
+
956
+ // rotary position embedding
957
+ // if (mode & 1) != 0, skip n_past elements
958
+ // if (mode & 2) != 0, GPT-NeoX style
959
+ // TODO: avoid creating a new tensor every time
960
+ GGML_API struct ggml_tensor * ggml_rope(
961
+ struct ggml_context * ctx,
962
+ struct ggml_tensor * a,
963
+ int n_past,
964
+ int n_dims,
965
+ int mode);
966
+
967
+ // in-place, returns view(a)
968
+ GGML_API struct ggml_tensor * ggml_rope_inplace(
969
+ struct ggml_context * ctx,
970
+ struct ggml_tensor * a,
971
+ int n_past,
972
+ int n_dims,
973
+ int mode);
974
+
975
+ // rotary position embedding backward, i.e. compute dx from dy
976
+ // a - dy
977
+ GGML_API struct ggml_tensor * ggml_rope_back(
978
+ struct ggml_context * ctx,
979
+ struct ggml_tensor * a,
980
+ int n_past,
981
+ int n_dims,
982
+ int mode);
983
+
984
+ // alibi position embedding
985
+ // in-place, returns view(a)
986
+ GGML_API struct ggml_tensor * ggml_alibi( // GGML_API added so the symbol is exported/imported like the other public prototypes in this header
987
+ struct ggml_context * ctx,
988
+ struct ggml_tensor * a,
989
+ int n_past,
990
+ int n_head,
991
+ float bias_max);
992
+
993
+ // clamp
994
+ // in-place, returns view(a)
995
+ GGML_API struct ggml_tensor * ggml_clamp( // GGML_API added so the symbol is exported/imported like the other public prototypes in this header
996
+ struct ggml_context * ctx,
997
+ struct ggml_tensor * a,
998
+ float min,
999
+ float max);
1000
+
1001
+ // padding = 1
1002
+ // TODO: we don't support extra parameters for now
1003
+ // that's why we are hard-coding the stride, padding, and dilation
1004
+ // not great ..
1005
+ GGML_API struct ggml_tensor * ggml_conv_1d_1s(
1006
+ struct ggml_context * ctx,
1007
+ struct ggml_tensor * a,
1008
+ struct ggml_tensor * b);
1009
+
1010
+ GGML_API struct ggml_tensor * ggml_conv_1d_2s(
1011
+ struct ggml_context * ctx,
1012
+ struct ggml_tensor * a,
1013
+ struct ggml_tensor * b);
1014
+
1015
+ GGML_API struct ggml_tensor * ggml_flash_attn(
1016
+ struct ggml_context * ctx,
1017
+ struct ggml_tensor * q,
1018
+ struct ggml_tensor * k,
1019
+ struct ggml_tensor * v,
1020
+ bool masked);
1021
+
1022
+ GGML_API struct ggml_tensor * ggml_flash_attn_back(
1023
+ struct ggml_context * ctx,
1024
+ struct ggml_tensor * q,
1025
+ struct ggml_tensor * k,
1026
+ struct ggml_tensor * v,
1027
+ struct ggml_tensor * d,
1028
+ bool masked);
1029
+
1030
+ GGML_API struct ggml_tensor * ggml_flash_ff(
1031
+ struct ggml_context * ctx,
1032
+ struct ggml_tensor * a,
1033
+ struct ggml_tensor * b0,
1034
+ struct ggml_tensor * b1,
1035
+ struct ggml_tensor * c0,
1036
+ struct ggml_tensor * c1);
1037
+
1038
+ // Mapping operations
1039
+ typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
1040
+ typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
1041
+
1042
+ GGML_API struct ggml_tensor * ggml_map_unary_f32(
1043
+ struct ggml_context * ctx,
1044
+ struct ggml_tensor * a,
1045
+ ggml_unary_op_f32_t fun);
1046
+
1047
+ GGML_API struct ggml_tensor * ggml_map_binary_f32(
1048
+ struct ggml_context * ctx,
1049
+ struct ggml_tensor * a,
1050
+ struct ggml_tensor * b,
1051
+ ggml_binary_op_f32_t fun);
1052
+
1053
+ // loss function
1054
+
1055
+ GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
1056
+ struct ggml_context * ctx,
1057
+ struct ggml_tensor * a,
1058
+ struct ggml_tensor * b);
1059
+
1060
+ GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
1061
+ struct ggml_context * ctx,
1062
+ struct ggml_tensor * a,
1063
+ struct ggml_tensor * b,
1064
+ struct ggml_tensor * c);
1065
+
1066
+ //
1067
+ // automatic differentiation
1068
+ //
1069
+
1070
+ GGML_API void ggml_set_param(
1071
+ struct ggml_context * ctx,
1072
+ struct ggml_tensor * tensor);
1073
+
1074
+ GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
1075
+
1076
+ GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
1077
+ GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
1078
+
1079
+ GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
1080
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
1081
+
1082
+ GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
1083
+
1084
+ GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
1085
+ GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
1086
+
1087
+ // print info and performance information for the graph
1088
+ GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
1089
+
1090
+ // dump the graph into a file using the dot format
1091
+ GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
1092
+
1093
+ //
1094
+ // optimization
1095
+ //
1096
+
1097
+ // optimization methods
1098
+ enum ggml_opt_type {
1099
+ GGML_OPT_ADAM,
1100
+ GGML_OPT_LBFGS,
1101
+ };
1102
+
1103
+ // linesearch methods
1104
+ enum ggml_linesearch {
1105
+ GGML_LINESEARCH_DEFAULT = 1, // same value as GGML_LINESEARCH_BACKTRACKING_WOLFE below
1106
+
1107
+ GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
1108
+ GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
1109
+ GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
1110
+ };
1111
+
1112
+ // optimization return values
1113
+ enum ggml_opt_result {
1114
+ GGML_OPT_OK = 0,
1115
+ GGML_OPT_DID_NOT_CONVERGE, // = 1 (implicit); the following three continue 2, 3, 4
1116
+ GGML_OPT_NO_CONTEXT,
1117
+ GGML_OPT_INVALID_WOLFE,
1118
+ GGML_OPT_FAIL,
1119
+
1120
+ GGML_LINESEARCH_FAIL = -128, // line-search codes are negative; the four below are implicitly -127 .. -124
1121
+ GGML_LINESEARCH_MINIMUM_STEP,
1122
+ GGML_LINESEARCH_MAXIMUM_STEP,
1123
+ GGML_LINESEARCH_MAXIMUM_ITERATIONS,
1124
+ GGML_LINESEARCH_INVALID_PARAMETERS,
1125
+ };
1126
+
1127
+ // optimization parameters
1128
+ //
1129
+ // see ggml.c (ggml_opt_default_params) for default values
1130
+ //
1131
+ struct ggml_opt_params {
1132
+ enum ggml_opt_type type;
1133
+
1134
+ int n_threads;
1135
+
1136
+ // delta-based convergence test
1137
+ //
1138
+ // if past == 0 - disabled
1139
+ // if past > 0:
1140
+ // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
1141
+ //
1142
+ int past;
1143
+ float delta;
1144
+
1145
+ // maximum number of iterations without improvement
1146
+ //
1147
+ // if 0 - disabled
1148
+ // if > 0:
1149
+ // assume convergence if no cost improvement in this number of iterations
1150
+ //
1151
+ int max_no_improvement;
1152
+
1153
+ bool print_forward_graph;
1154
+ bool print_backward_graph;
1155
+
1156
+ // ADAM parameters
1157
+ struct {
1158
+ int n_iter;
1159
+
1160
+ float sched; // schedule multiplier (fixed, decay or warmup)
1161
+ float decay; // weight decay for AdamW, use 0.0f to disable
1162
+ float alpha; // learning rate
1163
+ float beta1;
1164
+ float beta2;
1165
+ float eps; // epsilon for numerical stability
1166
+ float eps_f; // epsilon for convergence test
1167
+ float eps_g; // epsilon for convergence test
1168
+ } adam;
1169
+
1170
+ // LBFGS parameters
1171
+ struct {
1172
+ int m; // number of corrections to approximate the inv. Hessian
1173
+ int n_iter;
1174
+ int max_linesearch;
1175
+
1176
+ float eps; // convergence tolerance
1177
+ float ftol; // line search tolerance
1178
+ float wolfe;
1179
+ float min_step;
1180
+ float max_step;
1181
+
1182
+ enum ggml_linesearch linesearch;
1183
+ } lbfgs;
1184
+ };
1185
+
1186
+ struct ggml_opt_context { // optimizer state taken by ggml_opt_init, ggml_opt_resume and ggml_opt_resume_g
1187
+ struct ggml_context * ctx;
1188
+ struct ggml_opt_params params;
1189
+
1190
+ int iter; // presumably the number of iterations performed so far - confirm in ggml.c
1191
+ int64_t nx; // number of parameter elements
1192
+
1193
+ bool just_initialized; // NOTE(review): presumably marks a context that has not been resumed yet - confirm in ggml.c
1194
+
1195
+ struct { // working state kept under the name adam
1196
+ struct ggml_tensor * x; // view of the parameters
1197
+ struct ggml_tensor * g1; // gradient
1198
+ struct ggml_tensor * g2; // gradient squared
1199
+ struct ggml_tensor * m; // first moment
1200
+ struct ggml_tensor * v; // second moment
1201
+ struct ggml_tensor * mh; // first moment hat
1202
+ struct ggml_tensor * vh; // second moment hat
1203
+ struct ggml_tensor * pf; // past function values
1204
+ float fx_best; // presumably the best function value seen so far - confirm in ggml.c
1205
+ float fx_prev; // presumably the function value of the previous iteration - confirm in ggml.c
1206
+ int n_no_improvement; // presumably counted against ggml_opt_params.max_no_improvement - confirm in ggml.c
1207
+ } adam;
1208
+
1209
+ struct { // working state kept under the name lbfgs
1210
+ struct ggml_tensor * x; // current parameters
1211
+ struct ggml_tensor * xp; // previous parameters
1212
+ struct ggml_tensor * g; // current gradient
1213
+ struct ggml_tensor * gp; // previous gradient
1214
+ struct ggml_tensor * d; // search direction
1215
+ struct ggml_tensor * pf; // past function values
1216
+ struct ggml_tensor * lmal; // the L-BFGS memory alpha
1217
+ struct ggml_tensor * lmys; // the L-BFGS memory ys
1218
+ struct ggml_tensor * lms; // the L-BFGS memory s
1219
+ struct ggml_tensor * lmy; // the L-BFGS memory y
1220
+ float fx_best; // presumably the best function value seen so far - confirm in ggml.c
1221
+ float step; // presumably the current line search step - confirm in ggml.c
1222
+ int j; // NOTE(review): j, k and end look like counters/indices into the L-BFGS memory - confirm in ggml.c
1223
+ int k;
1224
+ int end;
1225
+ int n_no_improvement; // presumably counted against ggml_opt_params.max_no_improvement - confirm in ggml.c
1226
+ } lbfgs;
1227
+ };
1228
+
1229
+ GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); // default parameter set for the given optimizer type; the values are defined in ggml.c
1230
+
1231
+ // optimize the function defined by the tensor f with the given params; the outcome is reported as a ggml_opt_result code
1232
+ GGML_API enum ggml_opt_result ggml_opt(
1233
+ struct ggml_context * ctx,
1234
+ struct ggml_opt_params params,
1235
+ struct ggml_tensor * f);
1236
+
1237
+ // initialize the optimizer context opt from params; nx is presumably the number of parameter elements (cf. ggml_opt_context.nx) - confirm in ggml.c
1238
+ GGML_API void ggml_opt_init(
1239
+ struct ggml_context * ctx,
1240
+ struct ggml_opt_context * opt,
1241
+ struct ggml_opt_params params,
1242
+ int64_t nx);
1243
+
1244
+ // continue optimizing the function defined by the tensor f, working on the caller's optimizer context opt; returns a ggml_opt_result code
1245
+ GGML_API enum ggml_opt_result ggml_opt_resume(
1246
+ struct ggml_context * ctx,
1247
+ struct ggml_opt_context * opt,
1248
+ struct ggml_tensor * f);
1249
+
1250
+ // continue optimizing the function defined by the tensor f; same arguments as ggml_opt_resume plus two caller-supplied graphs, gf and gb (presumably forward and backward - confirm in ggml.c)
1251
+ GGML_API enum ggml_opt_result ggml_opt_resume_g(
1252
+ struct ggml_context * ctx,
1253
+ struct ggml_opt_context * opt,
1254
+ struct ggml_tensor * f,
1255
+ struct ggml_cgraph * gf,
1256
+ struct ggml_cgraph * gb);
1257
+
1258
+ //
1259
+ // quantization
1260
+ // one entry point per format; each takes floats in src, an output buffer dst and an int64_t buffer hist, and returns a size_t (presumably the number of bytes written - confirm in ggml.c)
1261
+
1262
+ GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
1263
+ GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
1264
+ GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
1265
+ GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
1266
+ GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
1267
+
1268
+ GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); // takes the format as a ggml_type argument instead of encoding it in the function name
1269
+
1270
+ //
1271
+ // system info
1272
+ // each query takes no arguments and returns an int; presumably nonzero when the feature is available in this build - confirm in ggml.c
1273
+
1274
+ GGML_API int ggml_cpu_has_avx (void);
1275
+ GGML_API int ggml_cpu_has_avx2 (void);
1276
+ GGML_API int ggml_cpu_has_avx512 (void);
1277
+ GGML_API int ggml_cpu_has_avx512_vbmi(void);
1278
+ GGML_API int ggml_cpu_has_avx512_vnni(void);
1279
+ GGML_API int ggml_cpu_has_fma (void);
1280
+ GGML_API int ggml_cpu_has_neon (void);
1281
+ GGML_API int ggml_cpu_has_arm_fma (void);
1282
+ GGML_API int ggml_cpu_has_f16c (void);
1283
+ GGML_API int ggml_cpu_has_fp16_va (void);
1284
+ GGML_API int ggml_cpu_has_wasm_simd (void);
1285
+ GGML_API int ggml_cpu_has_blas (void);
1286
+ GGML_API int ggml_cpu_has_cublas (void);
1287
+ GGML_API int ggml_cpu_has_clblast (void);
1288
+ GGML_API int ggml_cpu_has_gpublas (void);
1289
+ GGML_API int ggml_cpu_has_sse3 (void);
1290
+ GGML_API int ggml_cpu_has_vsx (void);
1291
+
1292
+ //
1293
+ // Internal types and functions exposed for tests and benchmarks
1294
+ //
1295
+
1296
+ #ifdef __cplusplus
1297
+ // `restrict` is a C keyword that standard C++ does not provide, so the macro expands to nothing here
1298
+ #define GGML_RESTRICT
1299
+ #else
1300
+ #define GGML_RESTRICT restrict
1301
+ #endif
1302
+ typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); // presumably expands k quantized values at x into floats at y - confirm in ggml.c
1303
+ typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); // presumably packs k floats at x into quantized data at y - confirm in ggml.c
1304
+ typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); // presumably stores the dot product of the n-element rows x and y in *s - confirm in ggml.c
1305
+
1306
+ typedef struct { // bundle of row-level function pointers, returned by value from ggml_internal_get_quantize_fn
1307
+ dequantize_row_q_t dequantize_row_q;
1308
+ quantize_row_q_t quantize_row_q;
1309
+ quantize_row_q_t quantize_row_q_reference; // NOTE(review): presumably a plain reference implementation of quantize_row_q - confirm in ggml.c
1310
+ quantize_row_q_t quantize_row_q_dot; // NOTE(review): presumably quantizes the second operand for vec_dot_q - confirm in ggml.c
1311
+ vec_dot_q_t vec_dot_q;
1312
+ enum ggml_type vec_dot_type; // NOTE(review): presumably the type the second vec_dot_q operand must have - confirm in ggml.c
1313
+ } quantize_fns_t;
1314
+
1315
+ quantize_fns_t ggml_internal_get_quantize_fn(size_t i); // returns a quantize_fns_t by value; i is presumably a ggml_type value used as an index - confirm in ggml.c. NOTE(review): unlike the other prototypes in this header section this one is not marked GGML_API - confirm that is intentional
1316
+
1317
+ #ifdef __cplusplus
1318
+ }
1319
+ #endif