Mirror of https://github.com/ggerganov/llama.cpp.git, synced 2026-02-05 13:53:23 +02:00
Compare commits
9 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 3303c19b16 | |
| | 4fdea540bd | |
| | a4569c41fd | |
| | 15e92fd337 | |
| | 2bf3fbf0b5 | |
| | 711d5e6fe6 | |
| | f738989dcb | |
| | 4cb208c93c | |
| | 3025b621d1 | |
.github/workflows/pre-tokenizer-hashes.yml (vendored, new file, 45 lines)
@@ -0,0 +1,45 @@
name: Check Pre-Tokenizer Hashes

on:
  push:
    paths:
      - 'convert_hf_to_gguf.py'
      - 'convert_hf_to_gguf_update.py'
  pull_request:
    paths:
      - 'convert_hf_to_gguf.py'
      - 'convert_hf_to_gguf_update.py'

jobs:
  pre-tokenizer-hashes:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install Python dependencies
        run: |
          python3 -m venv .venv
          .venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt

      - name: Update pre-tokenizer hashes
        run: |
          cp convert_hf_to_gguf.py /tmp
          .venv/bin/python convert_hf_to_gguf_update.py --check-missing

      - name: Check if committed pre-tokenizer hashes matches generated version
        run: |
          if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
            echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
            echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
            echo "Differences found:"
            diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
            exit 1
          fi
          echo "Model pre-tokenizer hashes are up to date."
@@ -1646,7 +1646,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
"|<function name=\"([^\"]+)\">" // match 5 (function name again)
);

if (auto res = builder.try_find_regex(open_regex)) {
while (auto res = builder.try_find_regex(open_regex)) {
const auto & block_start = res->groups[1];
std::string block_end = block_start.empty() ? "" : "```";

@@ -1668,7 +1668,6 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
builder.consume_literal(block_end);
builder.consume_spaces();
}
builder.add_content(builder.consume_rest());
} else {
throw common_chat_msg_partial_exception("failed to parse tool call");
}
@@ -1693,11 +1692,10 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
builder.consume_spaces();
}
}
builder.add_content(builder.consume_rest());
}
} else {
builder.add_content(builder.consume_rest());
}

builder.add_content(builder.consume_rest());
}

static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
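The net effect of the three hunks above is that tool-call parsing no longer stops after the first match: the opener regex is searched in a loop, the intermediate add_content calls are dropped, and only the text remaining after the last call is appended as regular content. A toy, self-contained sketch of that control flow (a plain string scanner, not the real common_chat_msg_parser API):

```cpp
// Minimal runnable sketch: keep consuming tool-call blocks in a loop instead of
// stopping after the first one. Toy scanner only, not the llama.cpp parser.
#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string> parse_tool_calls(const std::string & in, std::string & content) {
    std::vector<std::string> calls;
    size_t pos = 0;
    while (true) {                                    // was effectively an `if` before the change
        const size_t open = in.find("<tool_call>", pos);
        if (open == std::string::npos) break;
        const size_t close = in.find("</tool_call>", open);
        if (close == std::string::npos) break;        // partial call: stop, keep what we have
        calls.push_back(in.substr(open + 11, close - open - 11));
        pos = close + 12;
    }
    content = in.substr(pos);                         // whatever remains becomes plain content
    return calls;
}

int main() {
    std::string content;
    const auto calls = parse_tool_calls(
        "<tool_call>{\"name\":\"a\"}</tool_call><tool_call>{\"name\":\"b\"}</tool_call> done", content);
    std::printf("%zu calls, trailing content: '%s'\n", calls.size(), content.c_str());
}
```

With the earlier if-based flow, only the first tool-call block would have been parsed and anything after it swallowed into the message content.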
@@ -702,6 +702,9 @@ class TextModel(ModelBase):
if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
# ref: https://huggingface.co/moonshotai/Kimi-K2-Base
res = "kimi-k2"
if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
# ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
res = "qwen2"
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
res = "llama-bpe"
@@ -849,9 +852,6 @@ class TextModel(ModelBase):
if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
res = "exaone4"
if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
# ref: https://huggingface.co/Qwen/Qwen3-Embedding-8B
res = "qwen2"

if res is None:
logger.warning("\n")
@@ -59,6 +59,10 @@ parser.add_argument(
"--full", action="store_true",
help="download full list of models - make sure you have access to all of them",
)
parser.add_argument(
"--check-missing", action="store_true",
help="only check for missing pre-tokenizer hashes",
)
parser.add_argument(
"hf_token",
help="optional HF token",
@@ -70,6 +74,10 @@ hf_token = args.hf_token if args.hf_token is not None else hf_token
if hf_token is None:
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")

if args.check_missing and args.full:
logger.warning("Downloading full list of models requested, ignoring --check-missing!")
args.check_missing = False

# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
@@ -147,6 +155,7 @@ pre_computed_hashes = [
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
]


@@ -221,12 +230,13 @@ if not args.full:
all_models = models.copy()
models = [model for model in all_models if model["name"] not in existing_models]

logging.info(f"Downloading {len(models)} models...")
for model in models:
try:
download_model(model)
except Exception as e:
logger.error(f"Failed to download model {model['name']}. Error: {e}")
if not args.check_missing:
logging.info(f"Downloading {len(models)} models...")
for model in models:
try:
download_model(model)
except Exception as e:
logger.error(f"Failed to download model {model['name']}. Error: {e}")


# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
@@ -1852,6 +1852,9 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
ggml_cuda_pool_alloc<cuda_t> src0_alloc(ctx.pool());
ggml_cuda_pool_alloc<cuda_t> src1_alloc(ctx.pool());

bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
bool is_src1_cont_2 = ggml_is_contiguous_2(src1);

// Handle src0
src0_ptr = (const cuda_t *) src0->data;

@@ -1870,6 +1873,8 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
s11 = ne10;
s12 = ne11*s11;
s13 = ne12*s12;

is_src1_cont_2 = true;
}

// Setup destination buffer
@@ -1918,15 +1923,19 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;

if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
// with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
const int64_t smb = ne12 == 1 ? s13 : s12;

// there is no broadcast and src0, src1 are contiguous across dims 2, 3
// use cublasGemmStridedBatchedEx
CUBLAS_CHECK(
cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, src0_ptr, cu_data_type_a, nb01/nb00, nb02/nb00, // strideA
src1_ptr, cu_data_type_b, s11, s12, // strideB
beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
alpha, src0_ptr, cu_data_type_a, nb01/nb00, sma, // strideA
src1_ptr, cu_data_type_b, s11, smb, // strideB
beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
ne12*ne13,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
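The cuBLAS hunks compute is_src0_cont_2/is_src1_cont_2 up front (so that repacking src1 into contiguous data can simply set the flag) and pick the per-matrix strides sma/smb from dim 3 when the corresponding dim-2 extent is 1, which covers the [0, 2, 1, 3]-permuted case mentioned in the comment. A small illustrative sketch of that stride selection, using ggml-style ne/nb arrays but none of the actual ggml types:

```cpp
// Illustrative sketch only, not the ggml implementation.
#include <cstdint>
#include <cstdio>

// ggml-style shape description: ne[i] = extent of dim i, nb[i] = byte stride of dim i
struct dims4 {
    int64_t ne[4];
    int64_t nb[4];
};

// stride in elements between consecutive src0 matrices (mirrors `sma` in the hunk):
// with a [0, 2, 1, 3] permutation and ne[2] == 1, matrices advance along dim 3
static int64_t batch_stride_a(const dims4 & src0) {
    return src0.ne[2] == 1 ? src0.nb[3] / src0.nb[0]
                           : src0.nb[2] / src0.nb[0];
}

// stride in elements between consecutive src1 matrices (mirrors `smb` in the hunk),
// given the element strides s12/s13 of dims 2 and 3
static int64_t batch_stride_b(int64_t ne12, int64_t s12, int64_t s13) {
    return ne12 == 1 ? s13 : s12;
}

int main() {
    // contiguous FP32 tensor with ne = {64, 32, 1, 8}: dim 2 is singleton,
    // so the per-matrix stride comes from dim 3 and equals 64*32 = 2048 elements
    const dims4 src0 = { {64, 32, 1, 8}, {4, 4*64, 4*64*32, 4*64*32*1} };
    std::printf("%lld\n", (long long) batch_stride_a(src0));
    std::printf("%lld\n", (long long) batch_stride_b(/*ne12=*/1, /*s12=*/2048, /*s13=*/2048*32));
}
```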
@@ -1,65 +1,75 @@
#include "im2col.cuh"

#define MIN(a, b) (a) < (b) ? (a) : (b)

#define MAX_GRIDDIM_Z 65535

template <typename T>
static __global__ void im2col_kernel(
const float * x, T * dst, int64_t batch_offset,
int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
const float * x, T * dst,
int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH,
int64_t IC_IH_IW, int64_t IH_IW, int64_t N_OH, int64_t KH_KW, int64_t IC_KH_KW,
int s0, int s1, int p0, int p1, int d0, int d1) {
const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= pelements) {
if (i >= IC_KH_KW) {
return;
}

const int64_t ksize = OW * KH;
const int64_t kx = i / ksize;
const int64_t kd = kx * ksize;
const int64_t ky = (i - kd) / OW;
const int64_t ix = i % OW;
const int64_t iic = i / (KH_KW);
const int64_t rem = i - iic * KH_KW;
const int64_t ikh = rem / KW;
const int64_t ikw = rem - ikh * KW;

const int64_t oh = blockIdx.y;
const int64_t batch = blockIdx.z / IC;
const int64_t ic = blockIdx.z % IC;
const int64_t iow = blockIdx.y;
for (int64_t iz = blockIdx.z; iz < N_OH; iz+=MAX_GRIDDIM_Z) {
const int64_t in = iz / OH;
const int64_t ioh = iz - in * OH;

const int64_t iiw = ix * s0 + kx * d0 - p0;
const int64_t iih = oh * s1 + ky * d1 - p1;
const int64_t iiw = iow * s0 + ikw * d0 - p0;
const int64_t iih = ioh * s1 + ikh * d1 - p1;

const int64_t offset_dst =
((batch * OH + oh) * OW + ix) * CHW +
(ic * (KW * KH) + ky * KW + kx);
const int64_t offset_dst =
((in * OH + ioh) * OW + iow) * IC_KH_KW + iic * KH_KW + ikh * KW + ikw;

if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
dst[offset_dst] = 0.0f;
} else {
const int64_t offset_src = ic * offset_delta + batch * batch_offset;
dst[offset_dst] = x[offset_src + iih * IW + iiw];
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
dst[offset_dst] = 0.0f;
} else {
const int64_t offset_src = iic * IC_IH_IW + in * IH_IW;
dst[offset_dst] = x[offset_src + iih * IW + iiw];
}
}
}

// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
template <typename T>
static void im2col_cuda(const float * x, T* dst,
int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
int64_t batch, int64_t batch_offset, int64_t offset_delta,
int64_t N, int64_t IC_IH_IW, int64_t IH_IW,
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
const int parallel_elements = OW * KW * KH;
const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
dim3 block_nums(num_blocks, OH, batch * IC);
im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
const int64_t IC_KH_KW = IC * KH * KW;
const int64_t num_blocks = (IC_KH_KW + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
const int64_t N_OH = N * OH;
const int64_t KH_KW = KW*KH;
dim3 block_nums(num_blocks, OW, MIN(N_OH, MAX_GRIDDIM_Z));
im2col_kernel<<<block_nums, MIN(IC_KH_KW, CUDA_IM2COL_BLOCK_SIZE) , 0, stream>>>(x, dst, IC, IW, IH, OH, OW, KW, KH,
IC_IH_IW, IH_IW, N_OH, KH_KW, IC_KH_KW,
s0, s1, p0, p1, d0, d1);
}

static void im2col_cuda_f16(const float * x, half * dst,
int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
int64_t batch, int64_t batch_offset, int64_t offset_delta,
int64_t N, int64_t IC_IH_IW, int64_t IH_IW,
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {

im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
}

static void im2col_cuda_f32(const float * x, float * dst,
int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
int64_t batch, int64_t batch_offset, int64_t offset_delta,
int64_t N, int64_t IC_IH_IW, int64_t IH_IW,
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {

im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
}

void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -91,13 +101,13 @@ void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int64_t OH = is_2D ? dst->ne[2] : 1;
const int64_t OW = dst->ne[1];

const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
const int64_t batch = src1->ne[is_2D ? 3 : 2];
const size_t batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
const int64_t IC_IH_IW = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
const int64_t N = src1->ne[is_2D ? 3 : 2];
const int64_t IH_IW = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32

if(dst->type == GGML_TYPE_F16) {
im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
} else {
im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
}
}
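In the rewritten kernel each thread owns one (channel, kernel-row, kernel-column) triple decoded from the flat index i, blockIdx.y supplies the output column, and blockIdx.z loops over (batch, output-row) pairs in steps of MAX_GRIDDIM_Z, with the destination laid out as [N, OH, OW, IC*KH*KW]. A CPU reference sketch of the same index arithmetic, assuming a contiguous [N, IC, IH, IW] float input (illustrative only, not the CUDA kernel):

```cpp
#include <cstdint>

// CPU reference of the index arithmetic in the rewritten kernel (illustrative only).
// Input x is assumed contiguous [N, IC, IH, IW]; output layout is [N, OH, OW, IC*KH*KW].
static void im2col_reference(const float * x, float * dst,
                             int64_t N, int64_t IC, int64_t IH, int64_t IW,
                             int64_t OH, int64_t OW, int64_t KH, int64_t KW,
                             int s0, int s1, int p0, int p1, int d0, int d1) {
    const int64_t KH_KW    = KH * KW;
    const int64_t IC_KH_KW = IC * KH_KW;

    for (int64_t i = 0; i < IC_KH_KW; i++) {       // one GPU thread per flat index i
        const int64_t iic = i / KH_KW;             // input channel
        const int64_t rem = i - iic * KH_KW;
        const int64_t ikh = rem / KW;              // kernel row
        const int64_t ikw = rem - ikh * KW;        // kernel column

        for (int64_t in = 0; in < N; in++) {       // (in, ioh): the blockIdx.z loop in the kernel
            for (int64_t ioh = 0; ioh < OH; ioh++) {
                for (int64_t iow = 0; iow < OW; iow++) {   // iow: blockIdx.y in the kernel
                    const int64_t iiw = iow * s0 + ikw * d0 - p0;
                    const int64_t iih = ioh * s1 + ikh * d1 - p1;

                    const int64_t offset_dst = ((in * OH + ioh) * OW + iow) * IC_KH_KW
                                             + iic * KH_KW + ikh * KW + ikw;

                    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
                        dst[offset_dst] = 0.0f;    // padded positions are zero-filled
                    } else {
                        dst[offset_dst] = x[(in * IC + iic) * IH * IW + iih * IW + iiw];
                    }
                }
            }
        }
    }
}
```

One observation, hedged: judging from the host code in the second hunk, the values passed as IC_IH_IW and IH_IW are nb[2]/4 and nb[3]/4 of the source, i.e. the per-channel and per-image element strides respectively, so the parameter names look swapped relative to the values they carry, but the kernel's offset_src arithmetic uses them consistently.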
@@ -2688,6 +2688,9 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
const size_t type_size_src0 = ggml_type_size(src0->type);
const size_t type_size_src1 = ggml_type_size(src1->type);

bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
bool is_src1_cont_2 = ggml_is_contiguous_2(src1);

// SRC1 strides
int64_t s11 = nb11 / type_size_src1;
int64_t s12 = nb12 / type_size_src1;
@@ -2737,6 +2740,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
s11 = ne10;
s12 = ne11 * s11;
s13 = ne12 * s12;

is_src1_cont_2 = true;
}

ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
@@ -2852,12 +2857,16 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
else
#endif
{
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
// with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
const int64_t smb = ne12 == 1 ? s13 : s12;

// there is no broadcast and src0, src1 are contiguous across dims 2, 3
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_ddf,
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, sma,
src1_f16, dpct::library_data_t::real_half, s11, smb, beta, dst_ddf,
mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
} else {
const int ne23 = ne12 * ne13;
@@ -2106,12 +2106,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
s_mmq_wg_denoms = { 32, 64, 1 };

// spec constants and tile sizes for quant matmul (Qi_K)
l_warptile_mmq_k = { 256, 64, 128, 64, 1 };
m_warptile_mmq_k = { 256, 32, 64, 64, 0 };
s_warptile_mmq_k = { 256, 32, 32, 128, 0 };
l_mmq_wg_denoms_k = { 64, 128, 1 };
m_mmq_wg_denoms_k = { 32, 64, 1 };
s_mmq_wg_denoms_k = { 32, 32, 1 };
l_warptile_mmq_k = { 256, 128, 256, 64, 1 };
m_warptile_mmq_k = { 256, 128, 128, 64, 1 };
s_warptile_mmq_k = { 256, 32, 64, 128, 0 };
l_mmq_wg_denoms_k = { 128, 256, 1 };
m_mmq_wg_denoms_k = { 128, 128, 1 };
s_mmq_wg_denoms_k = { 32, 64, 1 };

// spec constants and tile sizes for quant matmul_id
l_warptile_mmqid = { 256, 128, 128, 16, 0 };
@@ -5022,26 +5022,37 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
ggml_vk_queue_command_pools_cleanup(dst->device);
}

static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) {
static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, uint32_t m, uint32_t n, uint32_t k, const vk_pipeline& pipeline) {
VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");

uint32_t split_k = 1;
if (ctx->device->shader_core_count != 0 && m >= (int)pipeline->wg_denoms[0] && n >= (int)pipeline->wg_denoms[1]) {
if (ctx->device->shader_core_count != 0 && m >= pipeline->wg_denoms[0] && n >= pipeline->wg_denoms[1]) {
// If k is 'large' and the SMs will fill less than halfway, use split_k.
uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]);
uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]);
if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count / 2) {
split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
// Clamp to 2 or 4
split_k = std::min(split_k, 4u);
if (split_k == 3) {
split_k = 2;

if (k >= 2048) {
if (m_tiles * n_tiles <= ctx->device->shader_core_count / 2) {
split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
} else if (m_tiles * n_tiles <= ctx->device->shader_core_count * 2 / 3) {
split_k = 3;
}
if (ctx->device->coopmat2) {
// coopmat2 shader expects splits to be aligned to 256
while (split_k > 1 && ((k / split_k) % 256) != 0) {
split_k /= 2;
// Cap the split at 8x. Unless k is huge this is a lot of overhead.
split_k = std::min(split_k, 8u);

// ggml_vk_matmul will align the splits to be a multiple of 256.
// If this rounded up size would cause the last split to be empty,
// then reduce the split count.
while (true) {
if (split_k == 1) {
break;
}
uint32_t k_split = CEIL_DIV(k, split_k);
k_split = ROUNDUP_POW2(k_split, 256);
if (k_split * (split_k - 1) < k) {
break;
}
split_k--;
}
}
}
@@ -5053,9 +5064,22 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")");

if (ctx->device->coopmat2) {
const uint32_t shader_core_count = ctx->device->shader_core_count;
const uint32_t tiles_l = CEIL_DIV(m, mmp->a_l->wg_denoms[0]) * CEIL_DIV(n, mmp->a_l->wg_denoms[1]);
const uint32_t tiles_m = CEIL_DIV(m, mmp->a_m->wg_denoms[0]) * CEIL_DIV(n, mmp->a_m->wg_denoms[1]);

// Use large shader when the N dimension is greater than the medium shader's tile size
uint32_t crossover_large = mmp->m->wg_denoms[1];
if ((ctx->device->mul_mat_l[src0_type] && (n > crossover_large)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {

// Prefer large over medium if either:
// - medium or large tiles would overfill the GPU
// - large tiles with a split_k==3 fits in the GPU and medium tiles with split_k==2 does not
// (medium with split_k==2 is probably better if it fits - more workgroups running and less split_k overhead)
bool prefer_large = tiles_m > shader_core_count || tiles_l > shader_core_count ||
// split_k==3 with large tiles likely better than medium tiles with no split_k.
(tiles_l <= shader_core_count / 3 && tiles_m > shader_core_count / 2);

if ((ctx->device->mul_mat_l[src0_type] && (n > crossover_large && prefer_large)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {
return aligned ? mmp->a_l : mmp->l;
}
// Use medium shader when the N dimension is greater than the small shader's tile size
@@ -5099,7 +5123,11 @@ static void ggml_vk_matmul(

GGML_ASSERT(batch_stride_d == m * n);

const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, padded_n };
// Round the split size up to a multiple of 256 (k-quant alignment)
uint32_t k_split = CEIL_DIV(k, split_k);
k_split = ROUNDUP_POW2(k_split, 256);

const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k_split, ne02, ne12, broadcast2, broadcast3, padded_n };
// Make sure enough workgroups get assigned for split k to work
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
ggml_vk_sync_buffers(subctx);
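The reworked ggml_vk_guess_split_k drops the old clamp-to-2-or-4 logic: it also allows a 3-way split when the tile count is at most two thirds of the shader core count, caps the split at 8, and, because ggml_vk_matmul now rounds each split up to a multiple of 256 for k-quant alignment, it decrements split_k while that rounding would leave the last split empty. A standalone sketch of the capping and trimming step (CEIL_DIV/ROUNDUP_POW2 reimplemented locally, so this is not the ggml macro set):

```cpp
// Illustrative sketch of the split_k cap-and-trim logic from the diff.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static uint32_t ceil_div(uint32_t a, uint32_t b) { return (a + b - 1) / b; }
static uint32_t round_up(uint32_t a, uint32_t m)  { return ceil_div(a, m) * m; }

static uint32_t trim_split_k(uint32_t k, uint32_t split_k) {
    split_k = std::min(split_k, 8u); // cap the split at 8x
    // each split gets rounded up to a multiple of 256 (k-quant alignment);
    // shrink split_k while that rounding would leave the last split empty
    while (split_k > 1) {
        const uint32_t k_split = round_up(ceil_div(k, split_k), 256);
        if (k_split * (split_k - 1) < k) {
            break;
        }
        split_k--;
    }
    return split_k;
}

int main() {
    // k = 4096 with a requested 16-way split: cap to 8, then
    // ceil_div(4096, 8) = 512 (already 256-aligned) and 512*7 = 3584 < 4096, so 8 stays
    std::printf("%u\n", trim_split_k(4096, 16)); // prints 8
}
```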
@@ -1,7 +1 @@
-r ./requirements-convert_legacy_llama.txt
--extra-index-url https://download.pytorch.org/whl/cpu
torch~=2.2.1; platform_machine != "s390x"

# torch s390x packages can only be found from nightly builds
--extra-index-url https://download.pytorch.org/whl/nightly
torch>=0.0.0.dev0; platform_machine == "s390x"
@@ -326,7 +326,7 @@ class LlamaBenchDataSQLite3(LlamaBenchData):

# Set table name and schema based on tool
if self.tool == "llama-bench":
self.table_name = "test"
self.table_name = "llama_bench"
db_fields = LLAMA_BENCH_DB_FIELDS
db_types = LLAMA_BENCH_DB_TYPES
elif self.tool == "test-backend-ops":
@@ -409,8 +409,8 @@ class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3):

# Tool selection logic
if tool is None:
if "test" in table_names:
self.table_name = "test"
if "llama_bench" in table_names:
self.table_name = "llama_bench"
self.tool = "llama-bench"
elif "test_backend_ops" in table_names:
self.table_name = "test_backend_ops"
@@ -418,8 +418,8 @@ class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3):
else:
raise RuntimeError(f"No suitable table found in database. Available tables: {table_names}")
elif tool == "llama-bench":
if "test" in table_names:
self.table_name = "test"
if "llama_bench" in table_names:
self.table_name = "llama_bench"
self.tool = "llama-bench"
else:
raise RuntimeError(f"Table 'test' not found for tool 'llama-bench'. Available tables: {table_names}")
@@ -105,7 +105,7 @@ llama_context::llama_context(

{
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;

if (!supports_set_rows && !cparams.kv_unified) {
LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
@@ -289,7 +289,7 @@ private:

// env: LLAMA_SET_ROWS (temporary)
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
bool supports_set_rows = false;
bool supports_set_rows = true;

// env: LLAMA_GRAPH_REUSE_DISABLE
bool graph_reuse_disable = false;
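Both here and in the KV-cache hunks below, the default for supports_set_rows flips from false to true, and LLAMA_SET_ROWS changes from forcing the value to merely overriding the default when the variable is set. A minimal sketch of that pattern (not the llama.cpp sources):

```cpp
// Sketch of the env-override pattern used above (illustrative only):
// the compile-time default is now `true`; LLAMA_SET_ROWS only overrides it when present.
#include <cstdlib>

struct ctx_flags {
    bool supports_set_rows = true; // was `false` before this change

    void apply_env() {
        if (const char * v = std::getenv("LLAMA_SET_ROWS")) {
            supports_set_rows = std::atoi(v) != 0; // LLAMA_SET_ROWS=0 restores the old path
        }
    }
};

int main() {
    ctx_flags f;
    f.apply_env();
    return f.supports_set_rows ? 0 : 1;
}
```

So an unset environment now takes the new ggml_set_rows path, while setting LLAMA_SET_ROWS=0 falls back to the previous behaviour.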
@@ -183,7 +183,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
const size_t memory_size_k = size_k_bytes();
const size_t memory_size_v = size_v_bytes();

LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, n_stream,
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
@@ -193,7 +193,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;

const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : 0;
supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;

if (!supports_set_rows) {
// ref: https://github.com/ggml-org/llama.cpp/pull/14363
@@ -230,7 +230,7 @@ private:

// env: LLAMA_SET_ROWS (temporary)
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
bool supports_set_rows = false;
bool supports_set_rows = true;

const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
@@ -953,6 +953,33 @@ static void test_template_output_parsers() {
/* is_partial= */ false,
{COMMON_CHAT_FORMAT_HERMES_2_PRO}));

// Test multiple tool calls
common_chat_msg message_assist_multiple_calls;
message_assist_multiple_calls.role = "assistant";
message_assist_multiple_calls.content = "";
message_assist_multiple_calls.tool_calls.push_back({"special_function", "{\"arg1\": 1}", ""});
message_assist_multiple_calls.tool_calls.push_back({"python", "{\"code\":\"print('hello')\"}", ""});

assert_msg_equals(
message_assist_multiple_calls,
common_chat_parse(
"<tool_call>\n"
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
"</tool_call>\n"
"<tool_call>\n"
"{\"name\": \"python\", \"arguments\": {\"code\":\"print('hello')\"}}\n"
"</tool_call>",
/* is_partial= */ false,
{COMMON_CHAT_FORMAT_HERMES_2_PRO}));

assert_msg_equals(
message_assist_multiple_calls,
common_chat_parse(
"<function=special_function>{\"arg1\": 1}</function>\n"
"<function=python>{\"code\":\"print('hello')\"}</function>",
/* is_partial= */ false,
{COMMON_CHAT_FORMAT_HERMES_2_PRO}));

assert_msg_equals(
simple_assist_msg(
"This is not a tool call:",
@@ -1039,6 +1066,22 @@ static void test_template_output_parsers() {
"<tool_call>\n"
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
"</tool_call>");

// Test multiple tool calls with template
common_chat_msg message_assist_multiple_calls_template;
message_assist_multiple_calls_template.role = "assistant";
message_assist_multiple_calls_template.content = "";
message_assist_multiple_calls_template.tool_calls.push_back({"special_function", "{\"arg1\": 1}", ""});
message_assist_multiple_calls_template.tool_calls.push_back({"python", "{\"code\":\"print('test')\"}", ""});

test_templates(tmpls.get(), end_tokens, message_assist_multiple_calls_template, tools,
"<tool_call>\n"
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
"</tool_call>\n"
"<tool_call>\n"
"{\"name\": \"python\", \"arguments\": {\"code\":\"print('test')\"}}\n"
"</tool_call>");

test_templates(tmpls.get(), end_tokens, message_assist_call_python_lines, tools,
"<tool_call>\n"
"{\"name\": \"python\", \"arguments\": {\"code\":\"# This is a program:\\nprint('hey')\"}}\n"
@@ -1738,7 +1738,7 @@ struct sql_printer : public printer {

void print_header(const cmd_params & params) override {
std::vector<std::string> fields = test::get_fields();
fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
fprintf(fout, "CREATE TABLE IF NOT EXISTS llama_bench (\n");
for (size_t i = 0; i < fields.size(); i++) {
fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),
i < fields.size() - 1 ? "," : "");
@@ -1749,7 +1749,7 @@ struct sql_printer : public printer {
}

void print_test(const test & t) override {
fprintf(fout, "INSERT INTO test (%s) ", join(test::get_fields(), ", ").c_str());
fprintf(fout, "INSERT INTO llama_bench (%s) ", join(test::get_fields(), ", ").c_str());
fprintf(fout, "VALUES (");
std::vector<std::string> values = t.get_values();
for (size_t i = 0; i < values.size(); i++) {