mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-23 16:37:33 +03:00
Compare commits
7 Commits
b1843
...
gg/add-phi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9998ecd191 | ||
|
|
15ebe59210 | ||
|
|
de473f5f8e | ||
|
|
f238461236 | ||
|
|
fa5c1fb44a | ||
|
|
52ee4540c0 | ||
|
|
3fe81781e3 |
@@ -23,6 +23,15 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
import gguf
|
||||
|
||||
|
||||
# check for any of the given keys in the dictionary and return the value of the first key found
|
||||
def get_key_opts(d, keys):
|
||||
for k in keys:
|
||||
if k in d:
|
||||
return d[k]
|
||||
print(f"Could not find any of {keys}")
|
||||
sys.exit()
|
||||
|
||||
|
||||
###### MODEL DEFINITIONS ######
|
||||
|
||||
class SentencePieceTokenTypes(IntEnum):
|
||||
@@ -257,10 +266,11 @@ class Model:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
if tokenizer.added_tokens_decoder[i].special:
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
if hasattr(tokenizer, "added_tokens_decoder"):
|
||||
if tokenizer.added_tokens_decoder[i].special:
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
@@ -1068,20 +1078,34 @@ class GPT2Model(Model):
|
||||
|
||||
class Phi2Model(Model):
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["n_layer"]
|
||||
block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"])
|
||||
|
||||
n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"])
|
||||
n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"])
|
||||
|
||||
if "partial_rotary_factor" in self.hparams:
|
||||
rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"])
|
||||
n_rot = int(rot_pct * n_embd) // n_head
|
||||
else:
|
||||
n_rot = get_key_opts(self.hparams, ["rotary_dim", "n_rot"])
|
||||
|
||||
self.gguf_writer.add_name("Phi2")
|
||||
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
||||
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
|
||||
self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"]))
|
||||
|
||||
self.gguf_writer.add_embedding_length(n_embd)
|
||||
self.gguf_writer.add_feed_forward_length(4 * n_embd)
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_head_count(self.hparams["n_head"])
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["n_head"])
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
||||
self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"])
|
||||
self.gguf_writer.add_head_count(n_head)
|
||||
self.gguf_writer.add_head_count_kv(n_head)
|
||||
self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"]))
|
||||
self.gguf_writer.add_rope_dimension_count(n_rot)
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
# phixtral
|
||||
self.gguf_writer.add_expert_count(self.hparams.get("num_local_experts", 0))
|
||||
self.gguf_writer.add_expert_used_count(self.hparams.get("num_experts_per_tok", 0))
|
||||
|
||||
|
||||
class PlamoModel(Model):
|
||||
def set_vocab(self):
|
||||
|
||||
136
examples/pydantic-models-to-grammar-examples.py
Normal file
136
examples/pydantic-models-to-grammar-examples.py
Normal file
@@ -0,0 +1,136 @@
|
||||
# Function calling example using pydantic models.
|
||||
|
||||
import json
|
||||
from enum import Enum
|
||||
from typing import Union, Optional
|
||||
|
||||
import requests
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
import importlib
|
||||
from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation
|
||||
|
||||
# Function to get completion on the llama.cpp server with grammar.
|
||||
def create_completion(prompt, grammar):
|
||||
headers = {"Content-Type": "application/json"}
|
||||
data = {"prompt": prompt, "grammar": grammar}
|
||||
|
||||
response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data)
|
||||
data = response.json()
|
||||
|
||||
print(data["content"])
|
||||
return data["content"]
|
||||
|
||||
|
||||
# A function for the agent to send a message to the user.
|
||||
class SendMessageToUser(BaseModel):
|
||||
"""
|
||||
Send a message to the User.
|
||||
"""
|
||||
chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.")
|
||||
message: str = Field(..., description="Message you want to send to the user.")
|
||||
|
||||
def run(self):
|
||||
print(self.message)
|
||||
|
||||
|
||||
# Enum for the calculator function.
|
||||
class MathOperation(Enum):
|
||||
ADD = "add"
|
||||
SUBTRACT = "subtract"
|
||||
MULTIPLY = "multiply"
|
||||
DIVIDE = "divide"
|
||||
|
||||
|
||||
# Very simple calculator tool for the agent.
|
||||
class Calculator(BaseModel):
|
||||
"""
|
||||
Perform a math operation on two numbers.
|
||||
"""
|
||||
number_one: Union[int, float] = Field(..., description="First number.")
|
||||
operation: MathOperation = Field(..., description="Math operation to perform.")
|
||||
number_two: Union[int, float] = Field(..., description="Second number.")
|
||||
|
||||
def run(self):
|
||||
if self.operation == MathOperation.ADD:
|
||||
return self.number_one + self.number_two
|
||||
elif self.operation == MathOperation.SUBTRACT:
|
||||
return self.number_one - self.number_two
|
||||
elif self.operation == MathOperation.MULTIPLY:
|
||||
return self.number_one * self.number_two
|
||||
elif self.operation == MathOperation.DIVIDE:
|
||||
return self.number_one / self.number_two
|
||||
else:
|
||||
raise ValueError("Unknown operation.")
|
||||
|
||||
|
||||
# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM.
|
||||
# pydantic_model_list is the list of pydanitc models
|
||||
# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated
|
||||
# outer_object_content is the name of outer object content.
|
||||
# model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
|
||||
# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
|
||||
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
|
||||
pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function",
|
||||
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
|
||||
|
||||
print(gbnf_grammar)
|
||||
print(documentation)
|
||||
|
||||
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
|
||||
|
||||
user_message = "What is 42 * 42?"
|
||||
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
|
||||
|
||||
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
|
||||
# This should output something like this:
|
||||
# {
|
||||
# "function": "calculator",
|
||||
# "function_parameters": {
|
||||
# "number_one": 42,
|
||||
# "operation": "multiply",
|
||||
# "number_two": 42
|
||||
# }
|
||||
# }
|
||||
function_dictionary = json.loads(text)
|
||||
if function_dictionary["function"] == "calculator":
|
||||
function_parameters = {**function_dictionary["function_parameters"]}
|
||||
|
||||
print(Calculator(**function_parameters).run())
|
||||
# This should output: 1764
|
||||
|
||||
|
||||
# A example structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text.
|
||||
class Category(Enum):
|
||||
"""
|
||||
The category of the book.
|
||||
"""
|
||||
Fiction = "Fiction"
|
||||
NonFiction = "Non-Fiction"
|
||||
|
||||
|
||||
class Book(BaseModel):
|
||||
"""
|
||||
Represents an entry about a book.
|
||||
"""
|
||||
title: str = Field(..., description="Title of the book.")
|
||||
author: str = Field(..., description="Author of the book.")
|
||||
published_year: Optional[int] = Field(..., description="Publishing year of the book.")
|
||||
keywords: list[str] = Field(..., description="A list of keywords.")
|
||||
category: Category = Field(..., description="Category of the book.")
|
||||
summary: str = Field(..., description="Summary of the book.")
|
||||
|
||||
|
||||
# We need no additional parameters other than our list of pydantic models.
|
||||
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book])
|
||||
|
||||
system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
|
||||
|
||||
text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
|
||||
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
|
||||
|
||||
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
|
||||
|
||||
json_data = json.loads(text)
|
||||
|
||||
print(Book(**json_data))
|
||||
1151
examples/pydantic_models_to_grammar.py
Normal file
1151
examples/pydantic_models_to_grammar.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1087,6 +1087,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// pass 2.4 expand rest down
|
||||
{
|
||||
ggml_tallocr_t cur_allocr = NULL;
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
if (ggml_is_view_op(node->op)) {
|
||||
continue;
|
||||
}
|
||||
ggml_tallocr_t node_allocr = node_allocr(node);
|
||||
if (node_allocr != NULL) {
|
||||
cur_allocr = node_allocr;
|
||||
} else {
|
||||
node_allocr(node) = cur_allocr;
|
||||
SET_CAUSE(node, "2.4");
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef DEBUG_PASS2
|
||||
fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
||||
#endif
|
||||
@@ -1146,6 +1164,8 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||
|
||||
ggml_tallocr_t node_allocr = node_allocr(node);
|
||||
|
||||
GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now
|
||||
|
||||
if (node_allocr != cur_allocr) {
|
||||
sched->splits[cur_split].i_end = i;
|
||||
cur_split++;
|
||||
|
||||
57
ggml-cuda.cu
57
ggml-cuda.cu
@@ -523,6 +523,8 @@ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16
|
||||
#define CUDA_ACC_BLOCK_SIZE 256
|
||||
#define CUDA_IM2COL_BLOCK_SIZE 256
|
||||
|
||||
#define CUDA_Q8_0_NE_ALIGN 2048
|
||||
|
||||
// dmmv = dequantize_mul_mat_vec
|
||||
#ifndef GGML_CUDA_DMMV_X
|
||||
#define GGML_CUDA_DMMV_X 32
|
||||
@@ -2327,6 +2329,45 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
|
||||
y[i] = x[i];
|
||||
}
|
||||
|
||||
template <bool need_check>
|
||||
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
|
||||
#if __CUDA_ARCH__ >= CC_PASCAL
|
||||
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
|
||||
|
||||
const int i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
|
||||
const int * x0 = ((int *) vx) + blockIdx.x * nint;
|
||||
half2 * y2 = (half2 *) (y + i0);
|
||||
|
||||
__shared__ int vals[nint];
|
||||
|
||||
#pragma unroll
|
||||
for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
|
||||
if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
|
||||
break;
|
||||
}
|
||||
|
||||
const int ix = ix0 + threadIdx.x;
|
||||
vals[ix] = x0[ix];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
|
||||
if (need_check && i0 + iy + 2*threadIdx.x >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
const half * b0 = ((const half *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
|
||||
const half d = *b0;
|
||||
const char2 qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
|
||||
|
||||
y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
|
||||
}
|
||||
#else
|
||||
(void) vx; (void) y; (void) k;
|
||||
bad_arch();
|
||||
#endif // __CUDA_ARCH__ >= CC_PASCAL
|
||||
}
|
||||
|
||||
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
|
||||
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
|
||||
|
||||
@@ -6181,6 +6222,17 @@ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restri
|
||||
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||
}
|
||||
|
||||
static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
|
||||
if (k % CUDA_Q8_0_NE_ALIGN == 0) {
|
||||
const bool need_check = false;
|
||||
dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
|
||||
} else {
|
||||
const bool need_check = true;
|
||||
dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
@@ -6246,6 +6298,7 @@ static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict_
|
||||
}
|
||||
|
||||
static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
int id;
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
@@ -6256,6 +6309,10 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
CUDA_CHECK(cudaGetDevice(&id));
|
||||
if (g_device_caps[id].cc >= CC_PASCAL) {
|
||||
return dequantize_block_q8_0_f16_cuda;
|
||||
}
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_Q2_K:
|
||||
return dequantize_row_q2_K_cuda;
|
||||
|
||||
@@ -272,10 +272,13 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
|
||||
|
||||
// vaddvq_s16
|
||||
// vpaddq_s16
|
||||
// vpaddq_s32
|
||||
// vaddvq_s32
|
||||
// vaddvq_f32
|
||||
// vmaxvq_f32
|
||||
// vcvtnq_s32_f32
|
||||
// vzip1_u8
|
||||
// vzip2_u8
|
||||
|
||||
inline static int32_t vaddvq_s16(int16x8_t v) {
|
||||
return
|
||||
@@ -291,6 +294,12 @@ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
||||
return vcombine_s16(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
|
||||
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
|
||||
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
|
||||
return vcombine_s32(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32_t vaddvq_s32(int32x4_t v) {
|
||||
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
||||
}
|
||||
@@ -316,6 +325,28 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[0]; res[1] = b[0];
|
||||
res[2] = a[1]; res[3] = b[1];
|
||||
res[4] = a[2]; res[5] = b[2];
|
||||
res[6] = a[3]; res[7] = b[3];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[4]; res[1] = b[4];
|
||||
res[2] = a[5]; res[3] = b[5];
|
||||
res[4] = a[6]; res[5] = b[6];
|
||||
res[6] = a[7]; res[7] = b[7];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// vld1q_s16_x2
|
||||
// vld1q_u8_x2
|
||||
// vld1q_u8_x4
|
||||
@@ -7554,9 +7585,9 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
||||
|
||||
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
||||
|
||||
int8x16x4_t q2u;
|
||||
int8x16x4_t q2s;
|
||||
int8x16x4_t q8b;
|
||||
ggml_int8x16x4_t q2u;
|
||||
ggml_int8x16x4_t q2s;
|
||||
ggml_int8x16x4_t q8b;
|
||||
|
||||
int32x4x4_t scales32;
|
||||
|
||||
@@ -7578,7 +7609,7 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
||||
scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2)));
|
||||
int32x4_t sumi = vdupq_n_s32(0);
|
||||
for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
|
||||
q8b = vld1q_s8_x4(q8); q8 += 64;
|
||||
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
||||
q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511))));
|
||||
q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511))));
|
||||
q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511))));
|
||||
|
||||
@@ -389,10 +389,16 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
]
|
||||
# TODO
|
||||
}
|
||||
|
||||
@@ -173,6 +173,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.FFN_GATE_INP: (
|
||||
"layers.{bid}.feed_forward.gate", # mixtral
|
||||
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
|
||||
"transformer.h.{bid}.moe.gate", # phixtral
|
||||
),
|
||||
|
||||
# Feed-forward up
|
||||
@@ -191,12 +192,14 @@ class TensorNameMap:
|
||||
"transformer.h.{bid}.mlp.w1", # qwen
|
||||
"h.{bid}.mlp.c_fc", # gpt2
|
||||
"transformer.h.{bid}.mlp.fc1", # phi2
|
||||
"model.layers.{bid}.mlp.fc1", # phi2
|
||||
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_EXP: (
|
||||
"layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral
|
||||
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
|
||||
"transformer.h.{bid}.moe.mlp.{xid}.fc1", # phixtral
|
||||
),
|
||||
|
||||
# AWQ-activation gate
|
||||
@@ -232,12 +235,14 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.mlp.dense_4h_to_h", # persimmon
|
||||
"h.{bid}.mlp.c_proj", # gpt2
|
||||
"transformer.h.{bid}.mlp.fc2", # phi2
|
||||
"model.layers.{bid}.mlp.fc2", # phi2
|
||||
"model.layers.layers.{bid}.mlp.down_proj", # plamo
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||
"layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral
|
||||
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
|
||||
"transformer.h.{bid}.moe.mlp.{xid}.fc2", # phixtral
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_Q_NORM: (
|
||||
|
||||
141
llama.cpp
141
llama.cpp
@@ -574,9 +574,15 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
|
||||
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -1422,16 +1428,20 @@ struct llama_layer {
|
||||
struct ggml_tensor * ffn_down; // w2
|
||||
struct ggml_tensor * ffn_up; // w3
|
||||
|
||||
// ff bias
|
||||
struct ggml_tensor * ffn_down_b; // b2
|
||||
struct ggml_tensor * ffn_up_b; // b3
|
||||
struct ggml_tensor * ffn_act;
|
||||
|
||||
// ff MoE
|
||||
struct ggml_tensor * ffn_gate_inp;
|
||||
struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
|
||||
struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
|
||||
struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
|
||||
|
||||
// ff bias
|
||||
struct ggml_tensor * ffn_down_b; // b2
|
||||
struct ggml_tensor * ffn_up_b; // b3
|
||||
struct ggml_tensor * ffn_act;
|
||||
// ff MoE bias
|
||||
struct ggml_tensor * ffn_down_b_exp[LLAMA_MAX_EXPERTS];
|
||||
struct ggml_tensor * ffn_up_b_exp [LLAMA_MAX_EXPERTS];
|
||||
};
|
||||
|
||||
struct llama_kv_cell {
|
||||
@@ -3676,17 +3686,46 @@ static bool llm_load_tensors(
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
||||
|
||||
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
||||
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
|
||||
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
|
||||
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
|
||||
|
||||
if (layer.wqkv == nullptr) {
|
||||
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
||||
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
||||
|
||||
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
||||
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
||||
|
||||
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
||||
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
||||
}
|
||||
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
||||
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
||||
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
||||
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
|
||||
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
||||
if (layer.ffn_gate_inp == nullptr) {
|
||||
GGML_ASSERT(hparams.n_expert == 0);
|
||||
GGML_ASSERT(hparams.n_expert_used == 0);
|
||||
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
||||
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
||||
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
||||
} else {
|
||||
GGML_ASSERT(hparams.n_expert > 0);
|
||||
GGML_ASSERT(hparams.n_expert_used > 0);
|
||||
|
||||
for (uint32_t x = 0; x < hparams.n_expert; ++x) {
|
||||
layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {n_ff, n_embd});
|
||||
layer.ffn_down_b_exp[x] = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN_EXP, "bias", i, x), {n_embd});
|
||||
|
||||
layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
|
||||
layer.ffn_up_b_exp[x] = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP_EXP, "bias", i, x), {n_ff});
|
||||
}
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_PLAMO:
|
||||
@@ -5637,15 +5676,25 @@ struct llm_build_context {
|
||||
|
||||
// self-attention
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
|
||||
cb(cur, "wqkv", il);
|
||||
struct ggml_tensor * Qcur = nullptr;
|
||||
struct ggml_tensor * Kcur = nullptr;
|
||||
struct ggml_tensor * Vcur = nullptr;
|
||||
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
if (model.layers[il].wqkv) {
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
|
||||
cb(cur, "wqkv", il);
|
||||
|
||||
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
||||
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
||||
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
|
||||
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
||||
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
||||
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
||||
} else {
|
||||
Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
||||
Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
||||
Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
||||
}
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
@@ -5680,7 +5729,7 @@ struct llm_build_context {
|
||||
}
|
||||
|
||||
// FF
|
||||
{
|
||||
if (model.layers[il].ffn_gate_inp == nullptr) {
|
||||
ffn_output = llm_build_ffn(ctx0, attn_norm_output,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||
NULL, NULL,
|
||||
@@ -5688,6 +5737,62 @@ struct llm_build_context {
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
cb(ffn_output, "ffn_out", il);
|
||||
} else {
|
||||
// MoE branch
|
||||
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
||||
cb(logits, "ffn_moe_logits", il);
|
||||
|
||||
ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
|
||||
cb(probs, "ffn_moe_probs", il);
|
||||
|
||||
// select experts
|
||||
ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
|
||||
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
||||
|
||||
ggml_tensor * weights = ggml_get_rows(ctx0,
|
||||
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
||||
cb(weights, "ffn_moe_weights", il);
|
||||
|
||||
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
||||
|
||||
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
||||
cb(weights_sum, "ffn_moe_weights_sum", il);
|
||||
|
||||
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
||||
cb(weights, "ffn_moe_weights_norm", il);
|
||||
|
||||
// compute expert outputs
|
||||
ggml_tensor * moe_out = nullptr;
|
||||
|
||||
for (int i = 0; i < n_expert_used; ++i) {
|
||||
ggml_tensor * cur_expert;
|
||||
|
||||
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
|
||||
#pragma message "TODO: implement ggml_add_id"
|
||||
//cur_up = ggml_add_id(ctx0, cur_up, model.layers[il].ffn_up_b_exp, n_expert, selected_experts, i);
|
||||
cb(cur_up, "ffn_moe_up", il);
|
||||
|
||||
cur_up = ggml_gelu(ctx0, cur_up);
|
||||
cb(cur_up, "ffn_moe_gelu", il);
|
||||
|
||||
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_up); // [n_tokens, n_embd]
|
||||
#pragma message "TODO: implement ggml_add_id"
|
||||
//cur_expert = ggml_add_id(ctx0, cur_expert, model.layers[il].ffn_down_b_exp, n_expert, selected_experts, i);
|
||||
cb(cur_expert, "ffn_moe_down", il);
|
||||
|
||||
cur_expert = ggml_mul(ctx0, cur_expert,
|
||||
ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
|
||||
cb(cur_expert, "ffn_moe_weighted", il);
|
||||
|
||||
if (i == 0) {
|
||||
moe_out = cur_expert;
|
||||
} else {
|
||||
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
}
|
||||
}
|
||||
|
||||
ffn_output = moe_out;
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_output);
|
||||
|
||||
@@ -1 +1 @@
|
||||
979cc23b345006504cfc1f67c0fdf627805e3319
|
||||
400c07f00508e6f60fb25405444b5669c365b0a9
|
||||
|
||||
Reference in New Issue
Block a user