In [1]:
import llama_cpp

In [2]:
# Initialise the llama.cpp backend. This runs before the model and context are
# created in the cells below and is paired with llama_backend_free() in the
# last cell. NOTE(review): numa=False presumably leaves NUMA-specific tuning
# off -- confirm against llama.h.
llama_cpp.llama_backend_init(numa=False)

ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 2060, compute capability 7.5


In [3]:
# Start from the library's default model parameters and override only the
# GPU offload count.
params = llama_cpp.llama_model_default_params()
params.n_gpu_layers = 35  # layers to offload to the GPU; lower this if VRAM is tight

model_path = b"../../models/mistral-7b-v0.1-GGUF/ggml-model-Q4_K.gguf" # Update this to whatever
model = llama_cpp.llama_load_model_from_file(model_path, params=params)

# A failed load (missing or corrupt file, out of memory) comes back as a null
# handle. Stop here: later cells pass `model` straight into the C API, where a
# null pointer would crash the kernel instead of raising a Python error.
if not model:
    raise RuntimeError(f"Failed to load model from {model_path.decode()}")

llama_model_loader: loaded meta data with 16 key-value pairs and 291 tensors from ../../models/mistral-7b-v0.1-GGUF/ggml-model-Q4_K.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:                    output.weight q6_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    5:              blk.0.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    6:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:            blk.0.ffn_gate.weight q4_K     [  4096, 143

In [4]:
n_ctx = 512       # capacity of the token buffer handed to the tokenizer
n_len = 32        # upper bound on sequence length (prompt + generated) per stream
n_parallel = 2    # number of streams generated from the same prompt
prompt = b"The quick brown fox"

# Fixed-size ctypes array that llama_tokenize fills in place.
tokens = (llama_cpp.llama_token * n_ctx)()
# The two trailing flags are add_bos and special in llama.h for this API
# generation -- TODO confirm against the installed version.
tokens_len = llama_cpp.llama_tokenize(model, prompt, len(prompt), tokens, len(tokens), True, True)

# llama_tokenize reports failure as a negative count (the buffer was too
# small). Using that value as a slice bound or in the KV-size formula below
# would silently produce nonsense, so fail loudly instead.
if tokens_len < 0:
    raise ValueError(
        f"Prompt needs {-tokens_len} tokens but the token buffer holds only {len(tokens)}"
    )
print(tokens[:tokens_len])

# KV cells required: the prompt is stored once and shared by every stream,
# while each stream adds up to (n_len - tokens_len) generated tokens of its own.
n_kv_req = tokens_len + (n_len - tokens_len) * n_parallel
print(n_kv_req)

[1, 1014, 2936, 9060, 285, 1142]
58


In [5]:

# Context configuration: library defaults plus the overrides for this demo.
ctx_params = llama_cpp.llama_context_default_params()
ctx_params.seed = 1234                        # fixed seed so sampling is repeatable
ctx_params.n_ctx = n_kv_req                   # exactly as many KV cells as computed above
ctx_params.n_batch = max(n_len, n_parallel)
ctx_params.n_threads = 1
ctx_params.n_threads_batch = 1
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

# Context creation can fail (e.g. not enough memory for the KV cache) and then
# yields a null handle. Every later cell dereferences `ctx` inside the C
# library, so stop here rather than crash the kernel.
if not ctx:
    raise RuntimeError("Failed to create llama context")

llama_new_context_with_model: n_ctx      = 58
llama_new_context_with_model: freq_base  = 10000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init: offloading v cache to GPU
llama_kv_cache_init: offloading k cache to GPU
llama_kv_cache_init: VRAM kv self = 7.25 MB
llama_new_context_with_model: kv self size  =    7.25 MB
llama_build_graph: non-view tensors processed: 740/740
llama_new_context_with_model: compute buffer total size = 10.63 MB
llama_new_context_with_model: VRAM scratch buffer: 4.51 MB
llama_new_context_with_model: total VRAM used: 4106.81 MB (model: 4095.05 MB, context: 11.76 MB)


In [6]:
# Read back the context size from the created context; this replaces the
# tokenizer-buffer size that n_ctx held in the earlier cell.
n_ctx = llama_cpp.llama_n_ctx(ctx)

# The batch must hold the whole prompt for the first decode and, after that,
# one token per parallel stream -- whichever is larger.
batch = llama_cpp.llama_batch_init(
    max(tokens_len, n_parallel),
    0,  # NOTE(review): embd=0 presumably selects token ids over embeddings -- confirm in llama.h
    1,  # NOTE(review): presumably the max sequence ids per token; the code below only writes seq_id[i][0]
)

In [7]:
import ctypes

# Load the whole prompt into the batch as sequence 0 at positions
# 0 .. tokens_len - 1.
batch.n_tokens = tokens_len
for i in range(tokens_len):
    batch.token[i] = tokens[i]
    batch.pos[i] = i
    batch.seq_id[i][0] = 0
    batch.n_seq_id[i] = 1
    batch.logits[i] = False

# Only the last prompt token needs logits; the generation cell samples every
# stream's first token from them.
batch.logits[batch.n_tokens - 1] = True

# A non-zero return means the prompt was not evaluated, so there are no valid
# logits to sample from. Raise so that "Run All" stops at this cell instead of
# carrying on with uninitialised state.
if llama_cpp.llama_decode(ctx, batch) != 0:
    raise RuntimeError("Error decoding")

In [8]:
# Make the prompt's KV cache entries (positions 0 .. batch.n_tokens - 1 of
# sequence 0) visible to every other stream so the prompt is evaluated only
# once. Sequence 0 already owns those entries, so the copy starts at stream 1.
for i in range(1, n_parallel):
    llama_cpp.llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens)

In [9]:
import codecs
import ctypes

# Sampling settings, vocabulary size and the EOS id never change between
# steps, so look them up once instead of on every iteration.
top_k = 40
top_p = 0.9
temp = 0.4
n_vocab = llama_cpp.llama_n_vocab(model)
# The token helpers take the *model* handle in this API generation (the same
# handle llama_n_vocab / llama_token_to_piece receive), not the context.
eos_token_id = llama_cpp.llama_token_eos(model)

# Candidate buffer. Every entry is rewritten before each sample, so a single
# allocation can be reused for all streams and steps.
candidates = (llama_cpp.llama_token_data * n_vocab)()

# Generated text per stream.
streams = [""] * n_parallel
# One incremental UTF-8 decoder per stream: a multi-byte character may be
# split across several tokens, and decoding each piece on its own would raise
# UnicodeDecodeError. The decoder buffers incomplete sequences until the
# remaining bytes arrive.
decoders = [codecs.getincrementaldecoder("utf-8")(errors="replace") for _ in range(n_parallel)]
# Index within the most recent batch whose logits belong to stream i; -1 marks
# a finished stream. Initially every stream samples from the last prompt token.
i_batch = [batch.n_tokens - 1] * n_parallel

n_cur = batch.n_tokens   # position of the next token to generate
n_decode = 0             # total number of tokens generated across streams

while n_cur <= n_len:
    # Rebuild the batch from scratch: one freshly sampled token per live stream.
    batch.n_tokens = 0
    for i in range(n_parallel):
        if i_batch[i] < 0:
            # This stream already finished.
            continue

        logits = llama_cpp.llama_get_logits_ith(ctx, i_batch[i])

        for token_id in range(n_vocab):
            candidates[token_id].id = token_id
            candidates[token_id].logit = logits[token_id]
            candidates[token_id].p = 0.0

        candidates_p = llama_cpp.llama_token_data_array(candidates, len(candidates), False)

        llama_cpp.llama_sample_top_k(ctx, ctypes.byref(candidates_p), top_k, 1)
        llama_cpp.llama_sample_top_p(ctx, ctypes.byref(candidates_p), top_p, 1)
        llama_cpp.llama_sample_temp(ctx, ctypes.byref(candidates_p), temp)

        new_token_id = llama_cpp.llama_sample_token(ctx, ctypes.byref(candidates_p))

        # A stream ends on end-of-sequence or once the length budget is used up.
        if new_token_id == eos_token_id or n_cur == n_len:
            i_batch[i] = -1
            continue

        buf = (ctypes.c_char * 32)()
        outlen = llama_cpp.llama_token_to_piece(model, new_token_id, buf, len(buf))
        if outlen < 0:
            # A negative result is the required size when the buffer is too
            # small; retry with a buffer of exactly that size.
            buf = (ctypes.c_char * -outlen)()
            outlen = llama_cpp.llama_token_to_piece(model, new_token_id, buf, len(buf))
        streams[i] += decoders[i].decode(bytes(buf[:outlen]))

        # Queue the new token for this stream at the current position.
        batch.token[batch.n_tokens] = new_token_id
        batch.pos[batch.n_tokens] = n_cur
        batch.seq_id[batch.n_tokens][0] = i
        batch.n_seq_id[batch.n_tokens] = 1
        batch.logits[batch.n_tokens] = True

        i_batch[i] = batch.n_tokens
        batch.n_tokens += 1
        n_decode += 1

    # Every stream has finished.
    if batch.n_tokens == 0:
        break

    n_cur += 1

    if llama_cpp.llama_decode(ctx, batch) != 0:
        print("Error decoding", flush=True)
        break
    print(n_cur)
    print(streams)

# Flush bytes still buffered in a decoder (an incomplete trailing character is
# emitted as U+FFFD instead of being dropped silently).
for i, decoder in enumerate(decoders):
    streams[i] += decoder.decode(b"", final=True)


7
[' j', ' jumped']
8
[' jumps', ' jumped over']
9
[' jumps over', ' jumped over the']
10
[' jumps over the', ' jumped over the lazy']
11
[' jumps over the lazy', ' jumped over the lazy dog']
12
[' jumps over the lazy dog', ' jumped over the lazy dog.']
13
[' jumps over the lazy dog.', ' jumped over the lazy dog.\n']
14
[' jumps over the lazy dog.\n', ' jumped over the lazy dog.\n\n']
15
[' jumps over the lazy dog.\n\n', ' jumped over the lazy dog.\n\nThe']
16
[' jumps over the lazy dog.\n\nI', ' jumped over the lazy dog.\n\nThe quick']
17
[' jumps over the lazy dog.\n\nI’', ' jumped over the lazy dog.\n\nThe quick brown']
18
[' jumps over the lazy dog.\n\nI’m', ' jumped over the lazy dog.\n\nThe quick brown f']
19
[' jumps over the lazy dog.\n\nI’m not', ' jumped over the lazy dog.\n\nThe quick brown fox']
20
[' jumps over the lazy dog.\n\nI’m not sure', ' jumped over the lazy dog.\n\nThe quick brown fox jumped']
21
[' jumps over the lazy dog.\n\nI’m not sure if', ' jumped over the la

In [10]:
# Show the final text accumulated for each parallel stream.
print(streams)

[' jumps over the lazy dog.\n\nI’m not sure if that’s the most famous sentence in the English language', ' jumped over the lazy dog.\n\nThe quick brown fox jumped over the lazy dog.\n\nThe quick brown fox']


In [11]:
# Teardown: release the batch allocated with llama_batch_init in the earlier cell.
llama_cpp.llama_batch_free(batch)

In [12]:
# Teardown: release the context created with llama_new_context_with_model.
# It is freed before the model it was created from.
llama_cpp.llama_free(ctx)

In [13]:
# Teardown: release the model loaded with llama_load_model_from_file.
llama_cpp.llama_free_model(model)

In [14]:
# Teardown: counterpart of the llama_backend_init call in the first code cell;
# run last, after the batch, context and model have been freed.
llama_cpp.llama_backend_free()