File size: 5,710 Bytes
b664585 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
#include "arg.h"
#include "common.h"
#include "log.h"
#include "ngram-cache.h"
#include "llama.h"
#include "ggml.h"
#include <cstdint>
#include <cstdio>
#include <cinttypes>
#include <fstream>
#include <string>
#include <vector>
int main(int argc, char ** argv){
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}
common_init();
const int n_draft = params.speculative.n_max;
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
// load the model
common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
// tokenize the prompt
std::vector<llama_token> inp;
inp = common_tokenize(ctx, params.prompt, true, true);
common_ngram_cache ngram_cache_context;
common_ngram_cache ngram_cache_dynamic;
common_ngram_cache ngram_cache_static;
int64_t t_draft_flat_us = 0;
int64_t t_draft_us = 0;
{
const int64_t t_start_draft_us = ggml_time_us();
if (!params.lookup_cache_static.empty()) {
try {
ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}
if (!params.lookup_cache_dynamic.empty()) {
try {
ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
}
t_draft_flat_us += ggml_time_us() - t_start_draft_us;
}
const int n_input = inp.size();
const int n_ctx = llama_n_ctx(ctx);
int n_drafted = 0;
int n_accept = 0;
const int64_t t_start_ms = ggml_time_ms();
// Iterate over input tokens in chunks of size n_ctx.
// Each chunk is treated as if a sequential generation but with pre-determined tokens to ensure reproducibility.
for (int i_start = 0; i_start + n_ctx < n_input; i_start += n_ctx) {
const std::vector<llama_token> inp_slice(inp.begin() + i_start, inp.begin() + i_start + n_ctx);
std::vector<llama_token> pseudo_output;
pseudo_output.push_back(inp_slice[0]);
while ((int) pseudo_output.size() < n_ctx) {
// Simulate drafting and decoding from draft:
std::vector<llama_token> draft;
draft.push_back(pseudo_output.back());
{
const int64_t t_start_draft_us = ggml_time_us();
common_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
n_drafted += draft.size() - 1;
for (size_t j = 1; j < draft.size() && (int) pseudo_output.size() < n_ctx; ++j) {
const llama_token ground_truth = inp_slice[pseudo_output.size()];
const llama_token drafted = draft[j];
if (ground_truth != drafted) {
break;
}
++n_accept;
pseudo_output.push_back(ground_truth);
{
const int64_t t_start_draft_us = ggml_time_us();
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
}
// After each simulated batch decoding simulate the sampling of a single token:
if ((int) pseudo_output.size() < n_ctx) {
pseudo_output.push_back(inp_slice[pseudo_output.size()]);
{
const int64_t t_start_draft_us = ggml_time_us();
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
}
draft.erase(draft.begin());
}
if (i_start > 0 && i_start / 100000 != (i_start - n_ctx) / 100000) {
const int64_t t_now_ms = ggml_time_ms();
const int64_t eta_ms = (n_input - i_start) * (t_now_ms - t_start_ms) / i_start;
const int64_t eta_min = eta_ms / (60*1000);
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
}
// After each chunk, update the dynamic ngram cache with the context ngram cache:
common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
ngram_cache_context.clear();
}
LOG("\n");
LOG_INF("\n");
LOG_INF("n_draft = %d\n", n_draft);
LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx);
LOG_INF("n_drafted = %d\n", n_drafted);
LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_INF("n_accept = %d\n", n_accept);
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
LOG("\n\n");
return 0;
}
|