mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-02 16:13:48 +03:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
278521c33a | ||
|
|
e2eb39e81c | ||
|
|
abf9a62161 | ||
|
|
7c203670f8 | ||
|
|
ec16a072f0 | ||
|
|
f5d1c4179f | ||
|
|
2405d59cb6 | ||
|
|
afe65aa282 |
@@ -33,6 +33,23 @@ RUN mkdir -p /app/full \
|
||||
|
||||
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
|
||||
|
||||
ARG IGC_VERSION=v2.30.1
|
||||
ARG IGC_VERSION_FULL=2_2.30.1+20950
|
||||
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
|
||||
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
|
||||
ARG IGDGMM_VERSION=22.9.0
|
||||
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
|
||||
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
|
||||
&& dpkg --install *.deb
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl\
|
||||
&& apt autoremove -y \
|
||||
|
||||
2
.github/workflows/python-type-check.yml
vendored
2
.github/workflows/python-type-check.yml
vendored
@@ -31,7 +31,7 @@ jobs:
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: "3.11"
|
||||
pip-install: -r requirements/requirements-all.txt ty==0.0.24
|
||||
pip-install: -r requirements/requirements-all.txt ty==0.0.26
|
||||
# - name: Type-check with Pyright
|
||||
# uses: jakebailey/pyright-action@v2
|
||||
# with:
|
||||
|
||||
@@ -31,10 +31,10 @@ import gguf
|
||||
from gguf.vocab import MistralTokenizerType, MistralVocab
|
||||
|
||||
try:
|
||||
from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found]
|
||||
from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found]
|
||||
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found]
|
||||
from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found]
|
||||
from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found, ty:unresolved-import]
|
||||
from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found, ty:unresolved-import]
|
||||
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found, ty:unresolved-import]
|
||||
from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found, ty:unresolved-import]
|
||||
SentencePieceTokenizer,
|
||||
)
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ import os
|
||||
|
||||
# Add utils directory to path for direct script execution
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
|
||||
from common import get_model_name_from_env_path, compare_tokens, exit_with_warning # type: ignore[import-not-found]
|
||||
from common import get_model_name_from_env_path, compare_tokens, exit_with_warning # type: ignore[import-not-found, ty:unresolved-import]
|
||||
|
||||
def quick_logits_check(pytorch_file, llamacpp_file):
|
||||
"""Lightweight sanity check before NMSE"""
|
||||
|
||||
@@ -5,7 +5,7 @@ import sys
|
||||
import os
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from common import get_model_name_from_env_path # type: ignore[import-not-found]
|
||||
from common import get_model_name_from_env_path # type: ignore[import-not-found, ty:unresolved-import]
|
||||
|
||||
def calculate_nmse(reference, test):
|
||||
mse = np.mean((test - reference) ** 2)
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from common import compare_tokens # type: ignore[import-not-found]
|
||||
from common import compare_tokens # type: ignore[import-not-found, ty:unresolved-import]
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
|
||||
@@ -7,7 +7,7 @@ import importlib
|
||||
from pathlib import Path
|
||||
|
||||
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
|
||||
from common import compare_tokens, exit_with_warning # type: ignore[import-not-found]
|
||||
from common import compare_tokens, exit_with_warning # type: ignore[import-not-found, ty:unresolved-import]
|
||||
|
||||
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
|
||||
|
||||
|
||||
@@ -20,4 +20,4 @@ cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA
|
||||
#cmake --build . --config Release --target llama-bench
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -j -v
|
||||
cmake --build . --config Release -j$((($(nproc)+1)/2)) -v
|
||||
|
||||
@@ -23,9 +23,9 @@ if [ $# -gt 0 ]; then
|
||||
GGML_SYCL_DEVICE=$1
|
||||
echo "use $GGML_SYCL_DEVICE as main GPU"
|
||||
#use signle GPU only
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ${LOAD_MODE}
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ${LOAD_MODE}
|
||||
|
||||
else
|
||||
#use multiple GPUs with same max compute units
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ${LOAD_MODE}
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ${LOAD_MODE}
|
||||
fi
|
||||
|
||||
@@ -2343,7 +2343,8 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
static_assert(MMVQ_MAX_BATCH_SIZE == MMVF_MAX_BATCH_SIZE);
|
||||
if (ne2 <= MMVQ_MAX_BATCH_SIZE) {
|
||||
if (ggml_is_quantized(src0->type)) {
|
||||
if (ne2 <= MMVQ_MMID_MAX_BATCH_SIZE) {
|
||||
const int mmvq_mmid_max = get_mmvq_mmid_max_batch(src0->type, cc);
|
||||
if (ne2 <= mmvq_mmid_max) {
|
||||
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
|
||||
return;
|
||||
}
|
||||
@@ -2946,14 +2947,18 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
|
||||
}
|
||||
|
||||
// [TAG_MUL_MAT_ID_CUDA_GRAPHS]
|
||||
if (node->op == GGML_OP_MUL_MAT_ID && (!ggml_is_quantized(node->src[0]->type) || node->ne[2] > MMVQ_MMID_MAX_BATCH_SIZE)) {
|
||||
// under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs
|
||||
// TODO: figure out a way to enable for larger batch sizes, without hurting performance
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/18958
|
||||
use_cuda_graph = false;
|
||||
if (node->op == GGML_OP_MUL_MAT_ID) {
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
const int mmvq_mmid_max = get_mmvq_mmid_max_batch(node->src[0]->type, cc);
|
||||
if (!ggml_is_quantized(node->src[0]->type) || node->ne[2] > mmvq_mmid_max) {
|
||||
// under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs
|
||||
// TODO: figure out a way to enable for larger batch sizes, without hurting performance
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/18958
|
||||
use_cuda_graph = false;
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
if (!use_cuda_graph) {
|
||||
|
||||
@@ -97,6 +97,194 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
|
||||
return MMVQ_PARAMETERS_GENERIC;
|
||||
}
|
||||
|
||||
// Per-architecture maximum batch size for which MMVQ should be used for MUL_MAT_ID.
|
||||
// Returns a value <= MMVQ_MAX_BATCH_SIZE. Default is MMVQ_MAX_BATCH_SIZE.
|
||||
// Check https://github.com/ggml-org/llama.cpp/pull/20905#issuecomment-4145835627 for details
|
||||
|
||||
static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_pascal_older(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_IQ1_S: return 6;
|
||||
case GGML_TYPE_IQ1_M: return 6;
|
||||
case GGML_TYPE_IQ2_S: return 4;
|
||||
case GGML_TYPE_IQ2_XS: return 5;
|
||||
case GGML_TYPE_IQ2_XXS: return 5;
|
||||
case GGML_TYPE_IQ3_S: return 4;
|
||||
case GGML_TYPE_IQ3_XXS: return 4;
|
||||
case GGML_TYPE_IQ4_NL: return 6;
|
||||
case GGML_TYPE_IQ4_XS: return 5;
|
||||
case GGML_TYPE_MXFP4: return 4;
|
||||
case GGML_TYPE_Q2_K: return 4;
|
||||
case GGML_TYPE_Q3_K: return 4;
|
||||
case GGML_TYPE_Q4_0: return 6;
|
||||
case GGML_TYPE_Q4_1: return 6;
|
||||
case GGML_TYPE_Q4_K: return 5;
|
||||
case GGML_TYPE_Q5_0: return 6;
|
||||
case GGML_TYPE_Q5_1: return 6;
|
||||
case GGML_TYPE_Q5_K: return 5;
|
||||
case GGML_TYPE_Q6_K: return 4;
|
||||
case GGML_TYPE_Q8_0: return 4;
|
||||
default: return MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_turing_plus(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_IQ2_S: return 7;
|
||||
case GGML_TYPE_IQ3_S: return 6;
|
||||
case GGML_TYPE_IQ3_XXS: return 7;
|
||||
case GGML_TYPE_MXFP4: return 7;
|
||||
case GGML_TYPE_Q2_K: return 7;
|
||||
case GGML_TYPE_Q3_K: return 5;
|
||||
default: return MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_gcn(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_IQ1_S: return 5;
|
||||
case GGML_TYPE_IQ1_M: return 5;
|
||||
case GGML_TYPE_IQ2_S: return 4;
|
||||
case GGML_TYPE_IQ2_XS: return 4;
|
||||
case GGML_TYPE_IQ2_XXS: return 4;
|
||||
case GGML_TYPE_IQ3_S: return 4;
|
||||
case GGML_TYPE_IQ3_XXS: return 4;
|
||||
case GGML_TYPE_IQ4_NL: return 6;
|
||||
case GGML_TYPE_IQ4_XS: return 4;
|
||||
case GGML_TYPE_Q2_K: return 4;
|
||||
case GGML_TYPE_Q3_K: return 4;
|
||||
case GGML_TYPE_Q4_0: return 5;
|
||||
case GGML_TYPE_Q4_1: return 5;
|
||||
case GGML_TYPE_Q4_K: return 4;
|
||||
case GGML_TYPE_Q5_K: return 4;
|
||||
case GGML_TYPE_Q6_K: return 4;
|
||||
case GGML_TYPE_Q8_0: return 4;
|
||||
default: return MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_cdna(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_IQ2_S: return 5;
|
||||
case GGML_TYPE_IQ2_XS: return 5;
|
||||
case GGML_TYPE_IQ2_XXS: return 5;
|
||||
case GGML_TYPE_IQ3_S: return 4;
|
||||
case GGML_TYPE_IQ3_XXS: return 5;
|
||||
default: return MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_rdna1_rdna2(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_IQ2_S: return 4;
|
||||
case GGML_TYPE_IQ2_XS: return 4;
|
||||
case GGML_TYPE_IQ2_XXS: return 4;
|
||||
case GGML_TYPE_IQ3_S: return 4;
|
||||
case GGML_TYPE_IQ3_XXS: return 4;
|
||||
case GGML_TYPE_Q2_K: return 7;
|
||||
case GGML_TYPE_Q3_K: return 4;
|
||||
case GGML_TYPE_Q4_K: return 5;
|
||||
case GGML_TYPE_Q5_K: return 6;
|
||||
case GGML_TYPE_Q6_K: return 5;
|
||||
default: return MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_rdna3(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_IQ1_S: return 6;
|
||||
case GGML_TYPE_IQ1_M: return 6;
|
||||
case GGML_TYPE_IQ2_S: return 4;
|
||||
case GGML_TYPE_IQ2_XS: return 4;
|
||||
case GGML_TYPE_IQ2_XXS: return 4;
|
||||
case GGML_TYPE_IQ3_S: return 4;
|
||||
case GGML_TYPE_IQ3_XXS: return 4;
|
||||
case GGML_TYPE_IQ4_NL: return 6;
|
||||
case GGML_TYPE_IQ4_XS: return 6;
|
||||
case GGML_TYPE_Q4_K: return 4;
|
||||
case GGML_TYPE_Q5_K: return 4;
|
||||
case GGML_TYPE_Q6_K: return 4;
|
||||
default: return MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_rdna4(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_IQ1_S: return 7;
|
||||
case GGML_TYPE_IQ1_M: return 7;
|
||||
case GGML_TYPE_IQ2_S: return 4;
|
||||
case GGML_TYPE_IQ2_XS: return 4;
|
||||
case GGML_TYPE_IQ2_XXS: return 4;
|
||||
case GGML_TYPE_IQ3_S: return 4;
|
||||
case GGML_TYPE_IQ3_XXS: return 4;
|
||||
case GGML_TYPE_IQ4_NL: return 7;
|
||||
case GGML_TYPE_IQ4_XS: return 5;
|
||||
case GGML_TYPE_MXFP4: return 5;
|
||||
case GGML_TYPE_Q3_K: return 4;
|
||||
case GGML_TYPE_Q4_0: return 7;
|
||||
case GGML_TYPE_Q4_1: return 7;
|
||||
case GGML_TYPE_Q4_K: return 4;
|
||||
case GGML_TYPE_Q5_0: return 7;
|
||||
case GGML_TYPE_Q5_1: return 7;
|
||||
case GGML_TYPE_Q5_K: return 5;
|
||||
case GGML_TYPE_Q6_K: return 5;
|
||||
case GGML_TYPE_Q8_0: return 7;
|
||||
default: return MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
// Host function: returns the max batch size for the current arch+type at runtime.
|
||||
int get_mmvq_mmid_max_batch(ggml_type type, int cc) {
|
||||
// NVIDIA: Volta, Ada Lovelace, and Blackwell always use MMVQ for MUL_MAT_ID.
|
||||
if (cc == GGML_CUDA_CC_VOLTA || cc >= GGML_CUDA_CC_ADA_LOVELACE) {
|
||||
return MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
if (cc >= GGML_CUDA_CC_TURING) {
|
||||
return get_mmvq_mmid_max_batch_turing_plus(type);
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
|
||||
return get_mmvq_mmid_max_batch_pascal_older(type);
|
||||
}
|
||||
// AMD
|
||||
if (GGML_CUDA_CC_IS_RDNA4(cc)) {
|
||||
return get_mmvq_mmid_max_batch_rdna4(type);
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_RDNA3(cc)) {
|
||||
return get_mmvq_mmid_max_batch_rdna3(type);
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_RDNA1(cc) || GGML_CUDA_CC_IS_RDNA2(cc)) {
|
||||
return get_mmvq_mmid_max_batch_rdna1_rdna2(type);
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_CDNA(cc)) {
|
||||
return get_mmvq_mmid_max_batch_cdna(type);
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_GCN(cc)) {
|
||||
return get_mmvq_mmid_max_batch_gcn(type);
|
||||
}
|
||||
return MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
|
||||
// Device constexpr: returns the max batch size for the current arch+type at compile time.
|
||||
template <ggml_type type>
|
||||
static constexpr __device__ int get_mmvq_mmid_max_batch_for_device() {
|
||||
#if defined(RDNA4)
|
||||
return get_mmvq_mmid_max_batch_rdna4(type);
|
||||
#elif defined(RDNA3)
|
||||
return get_mmvq_mmid_max_batch_rdna3(type);
|
||||
#elif defined(RDNA2) || defined(RDNA1)
|
||||
return get_mmvq_mmid_max_batch_rdna1_rdna2(type);
|
||||
#elif defined(CDNA)
|
||||
return get_mmvq_mmid_max_batch_cdna(type);
|
||||
#elif defined(GCN)
|
||||
return get_mmvq_mmid_max_batch_gcn(type);
|
||||
#elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || __CUDA_ARCH__ >= GGML_CUDA_CC_ADA_LOVELACE)
|
||||
return MMVQ_MAX_BATCH_SIZE;
|
||||
#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
return get_mmvq_mmid_max_batch_turing_plus(type);
|
||||
#else
|
||||
return get_mmvq_mmid_max_batch_pascal_older(type);
|
||||
#endif
|
||||
}
|
||||
|
||||
static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_dst, mmvq_parameter_table_id table_id) {
|
||||
if (table_id == MMVQ_PARAMETERS_GENERIC) {
|
||||
switch (ncols_dst) {
|
||||
@@ -195,7 +383,7 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int
|
||||
return 1;
|
||||
}
|
||||
|
||||
template <ggml_type type, int ncols_dst, bool has_fusion, bool is_multi_token_id = false, bool small_k = false>
|
||||
template <ggml_type type, int ncols_dst, bool has_fusion, bool small_k = false>
|
||||
__launch_bounds__(calc_nwarps(type, ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
|
||||
static __global__ void mul_mat_vec_q(
|
||||
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
|
||||
@@ -222,22 +410,13 @@ static __global__ void mul_mat_vec_q(
|
||||
|
||||
const uint32_t channel_dst = blockIdx.y;
|
||||
|
||||
uint32_t token_idx = 0;
|
||||
uint32_t channel_x;
|
||||
uint32_t channel_y;
|
||||
uint32_t sample_dst;
|
||||
|
||||
if constexpr (is_multi_token_id) {
|
||||
// Multi-token MUL_MAT_ID path, adding these in the normal path causes a perf regression for n_tokens=1 case
|
||||
token_idx = blockIdx.z;
|
||||
channel_x = ids[channel_dst + token_idx * ids_stride];
|
||||
channel_y = fastmodulo(channel_dst, nchannels_y);
|
||||
sample_dst = 0;
|
||||
} else {
|
||||
channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : fastdiv(channel_dst, channel_ratio);
|
||||
channel_y = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst;
|
||||
sample_dst = blockIdx.z;
|
||||
}
|
||||
channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : fastdiv(channel_dst, channel_ratio);
|
||||
channel_y = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst;
|
||||
sample_dst = blockIdx.z;
|
||||
|
||||
const uint32_t sample_x = fastdiv(sample_dst, sample_ratio);
|
||||
const uint32_t sample_y = sample_dst;
|
||||
@@ -294,9 +473,6 @@ static __global__ void mul_mat_vec_q(
|
||||
float tmp_gate[ncols_dst][rows_per_cuda_block] = {{0.0f}};
|
||||
|
||||
const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y;
|
||||
if constexpr (is_multi_token_id) {
|
||||
y += token_idx*stride_col_y;
|
||||
}
|
||||
const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x;
|
||||
|
||||
for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
|
||||
@@ -350,10 +526,6 @@ static __global__ void mul_mat_vec_q(
|
||||
|
||||
dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst + row0;
|
||||
|
||||
if constexpr (is_multi_token_id) {
|
||||
dst += token_idx*stride_col_dst;
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols_dst; ++j) {
|
||||
@@ -413,6 +585,69 @@ static __global__ void mul_mat_vec_q(
|
||||
}
|
||||
}
|
||||
|
||||
// Dedicated MoE multi-token kernel.
|
||||
// Grid: (ceil(nrows_x / c_rows_per_block), nchannels_dst)
|
||||
// Block: (warp_size, ncols_dst) - each warp handles one token independently.
|
||||
// No shared memory reduction needed since each warp works alone.
|
||||
template <ggml_type type, int c_rows_per_block>
|
||||
__launch_bounds__(get_mmvq_mmid_max_batch_for_device<type>()*ggml_cuda_get_physical_warp_size(), 1)
|
||||
static __global__ void mul_mat_vec_q_moe(
|
||||
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids,
|
||||
float * __restrict__ dst,
|
||||
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t nrows_x,
|
||||
const uint32_t stride_row_x, const uint32_t stride_col_y, const uint32_t stride_col_dst,
|
||||
const uint32_t stride_channel_x, const uint32_t stride_channel_y, const uint32_t stride_channel_dst,
|
||||
const uint32_t ncols_dst, const uint32_t ids_stride) {
|
||||
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
||||
constexpr int vdr = get_vdr_mmvq(type);
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
|
||||
|
||||
const uint32_t token_idx = threadIdx.y;
|
||||
const int row0 = c_rows_per_block*blockIdx.x;
|
||||
const int blocks_per_row_x = ncols_x / qk;
|
||||
constexpr int blocks_per_iter = vdr * warp_size / qi;
|
||||
|
||||
const uint32_t channel_dst = blockIdx.y;
|
||||
|
||||
if (token_idx >= ncols_dst) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint32_t channel_x = ids[channel_dst + token_idx * ids_stride];
|
||||
const uint32_t channel_y = fastmodulo(channel_dst, nchannels_y);
|
||||
|
||||
const block_q8_1 * y = ((const block_q8_1 *) vy) + channel_y*stride_channel_y + token_idx*stride_col_y;
|
||||
const int kbx_offset = channel_x*stride_channel_x + row0*stride_row_x;
|
||||
|
||||
// partial sum for each thread
|
||||
float tmp[c_rows_per_block] = {0.0f};
|
||||
|
||||
for (int kbx = threadIdx.x / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
|
||||
const int kby = kbx * (qk/QK8_1);
|
||||
const int kqs = vdr * (threadIdx.x % (qi/vdr));
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < c_rows_per_block; ++i) {
|
||||
tmp[i] += vec_dot_q_cuda(vx, &y[kby], kbx_offset + i*stride_row_x + kbx, kqs);
|
||||
}
|
||||
}
|
||||
|
||||
// Warp-level reduction only - no shared memory needed
|
||||
#pragma unroll
|
||||
for (int i = 0; i < c_rows_per_block; ++i) {
|
||||
tmp[i] = warp_reduce_sum<warp_size>(tmp[i]);
|
||||
}
|
||||
|
||||
// Write results
|
||||
if (threadIdx.x < c_rows_per_block && (c_rows_per_block == 1 || uint32_t(row0 + threadIdx.x) < nrows_x)) {
|
||||
dst[channel_dst*stride_channel_dst + token_idx*stride_col_dst + row0 + threadIdx.x] = tmp[threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
template<ggml_type type>
|
||||
static std::pair<dim3, dim3> calc_launch_params(
|
||||
const int ncols_dst, const int nrows_x, const int nchannels_dst, const int nsamples_or_ntokens,
|
||||
@@ -425,7 +660,7 @@ static std::pair<dim3, dim3> calc_launch_params(
|
||||
return {block_nums, block_dims};
|
||||
}
|
||||
|
||||
template<ggml_type type, int c_ncols_dst, bool is_multi_token_id = false, bool small_k = false>
|
||||
template<ggml_type type, int c_ncols_dst, bool small_k = false>
|
||||
static void mul_mat_vec_q_switch_fusion(
|
||||
const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
|
||||
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
|
||||
@@ -438,7 +673,7 @@ static void mul_mat_vec_q_switch_fusion(
|
||||
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
|
||||
if constexpr (c_ncols_dst == 1) {
|
||||
if (has_fusion) {
|
||||
mul_mat_vec_q<type, c_ncols_dst, true, is_multi_token_id, small_k><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
mul_mat_vec_q<type, c_ncols_dst, true, small_k><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
|
||||
@@ -448,12 +683,33 @@ static void mul_mat_vec_q_switch_fusion(
|
||||
|
||||
GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1");
|
||||
|
||||
mul_mat_vec_q<type, c_ncols_dst, false, is_multi_token_id, small_k><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
mul_mat_vec_q<type, c_ncols_dst, false, small_k><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
|
||||
}
|
||||
|
||||
template <ggml_type type>
|
||||
static void mul_mat_vec_q_moe_launch(
|
||||
const void * vx, const void * vy, const int32_t * ids, float * dst,
|
||||
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t nrows_x,
|
||||
const uint32_t stride_row_x, const uint32_t stride_col_y, const uint32_t stride_col_dst,
|
||||
const uint32_t stride_channel_x, const uint32_t stride_channel_y, const uint32_t stride_channel_dst,
|
||||
const uint32_t ncols_dst, const uint32_t ids_stride,
|
||||
const int warp_size, const int nchannels_dst, cudaStream_t stream) {
|
||||
|
||||
constexpr int rows_per_block = 2; // 2 gives best perf based on tuning
|
||||
const int64_t nblocks_rows = (nrows_x + rows_per_block - 1) / rows_per_block;
|
||||
const dim3 block_nums(nblocks_rows, nchannels_dst);
|
||||
const dim3 block_dims(warp_size, ncols_dst);
|
||||
|
||||
mul_mat_vec_q_moe<type, rows_per_block><<<block_nums, block_dims, 0, stream>>>(
|
||||
vx, vy, ids, dst, ncols_x, nchannels_y, nrows_x,
|
||||
stride_row_x, stride_col_y, stride_col_dst,
|
||||
stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
ncols_dst, ids_stride);
|
||||
}
|
||||
|
||||
template <ggml_type type>
|
||||
static void mul_mat_vec_q_switch_ncols_dst(
|
||||
const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
|
||||
@@ -472,20 +728,62 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
const uint3 sample_ratio_fd = init_fastdiv_values(nsamples_dst / nsamples_x);
|
||||
|
||||
const int device = ggml_cuda_get_device();
|
||||
const int cc = ggml_cuda_info().devices[device].cc;
|
||||
const int warp_size = ggml_cuda_info().devices[device].warp_size;
|
||||
const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc);
|
||||
const mmvq_parameter_table_id table_id = get_device_table_id(cc);
|
||||
|
||||
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
|
||||
const bool has_ids = ids != nullptr;
|
||||
|
||||
const auto should_use_small_k = [&](int c_ncols_dst) {
|
||||
// When K is small, increase rows_per_block to match nwarps so each warp has more work to do
|
||||
// Trigger when the full thread block covers all K blocks in a single loop iteration and few threads remain idle.
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
||||
constexpr int vdr = get_vdr_mmvq(type);
|
||||
const int blocks_per_row_x = ncols_x / qk;
|
||||
const int blocks_per_iter_1warp = vdr * warp_size / qi;
|
||||
const int nwarps = calc_nwarps(type, c_ncols_dst, table_id);
|
||||
bool use = nwarps > 1 && blocks_per_row_x < nwarps * blocks_per_iter_1warp;
|
||||
|
||||
constexpr std::array<ggml_type, 2> iq_slow_turing = {
|
||||
GGML_TYPE_IQ3_XXS,
|
||||
GGML_TYPE_IQ3_S,
|
||||
};
|
||||
constexpr std::array<ggml_type, 8> iq_slow_other = {
|
||||
GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
|
||||
GGML_TYPE_IQ2_S, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
|
||||
};
|
||||
constexpr std::array<ggml_type, 3> slow_pascal = {
|
||||
GGML_TYPE_IQ3_S,
|
||||
GGML_TYPE_Q2_K,
|
||||
GGML_TYPE_Q3_K,
|
||||
};
|
||||
|
||||
const bool is_nvidia_turing_plus = GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_TURING;
|
||||
const bool is_nvidia_pascal_older = GGML_CUDA_CC_IS_NVIDIA(cc) && cc < GGML_CUDA_CC_VOLTA;
|
||||
|
||||
if (is_nvidia_turing_plus) {
|
||||
if (ncols_dst == 1 &&
|
||||
std::find(iq_slow_turing.begin(), iq_slow_turing.end(), type) != iq_slow_turing.end()) {
|
||||
use = false;
|
||||
}
|
||||
} else if ((ncols_dst == 1 && std::find(iq_slow_other.begin(), iq_slow_other.end(), type) != iq_slow_other.end()) ||
|
||||
(is_nvidia_pascal_older && std::find(slow_pascal.begin(), slow_pascal.end(), type) != slow_pascal.end()) ||
|
||||
GGML_CUDA_CC_IS_RDNA(cc)) {
|
||||
use = false;
|
||||
}
|
||||
|
||||
return use;
|
||||
};
|
||||
|
||||
if (has_ids && ncols_dst > 1) {
|
||||
// Multi-token MUL_MAT_ID path only - single-token goes through regular path below
|
||||
constexpr int c_ncols_dst = 1;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, ncols_dst, warp_size, table_id);
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst, true>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
dims.first, dims.second, 0, ids_stride, stream);
|
||||
// Multi-token MUL_MAT_ID path - dedicated MoE kernel
|
||||
mul_mat_vec_q_moe_launch<type>(
|
||||
vx, vy, ids, dst, ncols_x, nchannels_y_fd, nrows_x,
|
||||
stride_row_x, stride_col_y, stride_col_dst,
|
||||
stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
ncols_dst, ids_stride, warp_size, nchannels_dst, stream);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -493,31 +791,24 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
case 1: {
|
||||
constexpr int c_ncols_dst = 1;
|
||||
|
||||
// When K is small, increase rows_per_block to match nwarps so each warp has more work to do
|
||||
// Trigger when the full thread block covers all K blocks in a single loop iteration and few threads remain idle.
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
||||
constexpr int vdr = get_vdr_mmvq(type);
|
||||
const int blocks_per_row_x = ncols_x / qk;
|
||||
const int blocks_per_iter_1warp = vdr * warp_size / qi;
|
||||
const int nwarps = calc_nwarps(type, c_ncols_dst, table_id);
|
||||
const bool use_small_k = nwarps > 1 && blocks_per_row_x < nwarps * blocks_per_iter_1warp;
|
||||
bool use_small_k = should_use_small_k(c_ncols_dst);
|
||||
|
||||
if (use_small_k) {
|
||||
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst,
|
||||
warp_size, table_id, true);
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst, false, true>(
|
||||
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst,
|
||||
nsamples_dst, warp_size, table_id, true);
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst, true>(
|
||||
vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
dims.first, dims.second, 0, ids_stride, stream);
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd,
|
||||
stride_sample_x, stride_sample_y, stride_sample_dst, dims.first, dims.second, 0, ids_stride,
|
||||
stream);
|
||||
} else {
|
||||
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst,
|
||||
warp_size, table_id);
|
||||
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst,
|
||||
nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(
|
||||
vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
dims.first, dims.second, 0, ids_stride, stream);
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd,
|
||||
stride_sample_x, stride_sample_y, stride_sample_dst, dims.first, dims.second, 0, ids_stride,
|
||||
stream);
|
||||
}
|
||||
} break;
|
||||
case 2: {
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
|
||||
#define MMVQ_MMID_MAX_BATCH_SIZE 4 // Max. batch size for which to use MMVQ kernels for MUL_MAT_ID
|
||||
|
||||
// Returns the maximum batch size for which MMVQ should be used for MUL_MAT_ID,
|
||||
// based on the quantization type and GPU architecture (compute capability).
|
||||
int get_mmvq_mmid_max_batch(ggml_type type, int cc);
|
||||
|
||||
void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, const ggml_cuda_mm_fusion_args_host * fusion = nullptr);
|
||||
|
||||
@@ -346,6 +346,9 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
|
||||
|
||||
const HVX_Vector logit_cap = hvx_vec_splat_f32(factx->logit_softcap);
|
||||
|
||||
dma_cache m_cache;
|
||||
dma_cache_init(&m_cache, spad_m, factx->size_m_block, DMA_CACHE_MAX_SIZE);
|
||||
|
||||
for (uint32_t ir = ir0; ir < ir1; ++ir) {
|
||||
const uint32_t iq3 = fastdiv(ir, &factx->src0_div21);
|
||||
const uint32_t iq2 = fastdiv(ir - iq3*neq2*neq1, &factx->src0_div1);
|
||||
@@ -389,9 +392,8 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
|
||||
// Mask
|
||||
if (mask) {
|
||||
const uint8_t * m_src = (const uint8_t *) (mp_base + ic_start);
|
||||
uint8_t * m_dst = spad_m + (ib % 2) * factx->size_m_block;
|
||||
// Mask is 1D contiguous for this row
|
||||
dma_queue_push(dma, dma_make_ptr(m_dst, m_src), current_block_size * 2, current_block_size * 2, current_block_size * 2, 1);
|
||||
dma_cache_push(dma, &m_cache, m_src, current_block_size * 2, current_block_size * 2, current_block_size * 2, 1);
|
||||
}
|
||||
|
||||
// FARF(HIGH, "fa %u: prefetch KVM: ir %u ib %u iq1 %u iq2 %u iq3 %u : size_k_row %u size_v_row %u bs %u: usec %u",
|
||||
@@ -554,7 +556,7 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
|
||||
// Mask
|
||||
if (mask) {
|
||||
const uint8_t * m_src = (const uint8_t *) (mp_base + next_ic_start);
|
||||
dma_queue_push(dma, dma_make_ptr(m_base, m_src), next_block_size * 2, next_block_size * 2, next_block_size * 2, 1);
|
||||
dma_cache_push(dma, &m_cache, m_src, next_block_size * 2, next_block_size * 2, next_block_size * 2, 1);
|
||||
}
|
||||
|
||||
// FARF(HIGH, "fa %u: prefetch KVM: ir %u ib %u : iq1 %u iq2 %u iq3 %u : size_k_row %u size_v_row %u bs %u: usec %u",
|
||||
@@ -684,7 +686,7 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
octx->src0_spad.size_per_thread = size_q_block * 1;
|
||||
octx->src1_spad.size_per_thread = factx.size_k_block * 2;
|
||||
octx->src2_spad.size_per_thread = factx.size_v_block * 2;
|
||||
octx->src3_spad.size_per_thread = mask ? factx.size_m_block * 2 : 0;
|
||||
octx->src3_spad.size_per_thread = mask ? factx.size_m_block * DMA_CACHE_MAX_SIZE : 0;
|
||||
octx->dst_spad.size_per_thread = size_vkq_acc;
|
||||
|
||||
octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
|
||||
@@ -705,6 +707,8 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
octx->src3_spad.data = octx->src2_spad.data + octx->src2_spad.size;
|
||||
octx->dst_spad.data = octx->src3_spad.data + octx->src3_spad.size;
|
||||
|
||||
// FARF(ERROR, "fa: qrows-per-thread %u", factx.qrows_per_thread);
|
||||
|
||||
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
|
||||
worker_pool_run_func(octx->ctx->worker_pool, flash_attn_ext_f16_thread, &factx, octx->n_threads);
|
||||
}
|
||||
|
||||
@@ -143,7 +143,7 @@ static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t
|
||||
desc->desc_size = 0; // 1D mode
|
||||
desc->src_bypass = dma_src_l2_bypass_on;
|
||||
desc->dst_bypass = dma_dst_l2_bypass_on;
|
||||
desc->order = 1;
|
||||
desc->order = 0;
|
||||
desc->done = 0;
|
||||
desc->src = (void *) dptr.src;
|
||||
desc->dst = (void *) dptr.dst;
|
||||
@@ -151,8 +151,12 @@ static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t
|
||||
|
||||
q->dptr[q->push_idx] = dptr;
|
||||
|
||||
dmlink(q->tail, desc);
|
||||
q->tail = (dma_descriptor_2d *) desc;
|
||||
if (size) {
|
||||
dmlink(q->tail, desc);
|
||||
q->tail = (dma_descriptor_2d *) desc;
|
||||
} else {
|
||||
desc->done = 1;
|
||||
}
|
||||
|
||||
// FARF(ERROR, "dma-push: i %u row-size %u nrows %d dst %p src %p\n", q->push_idx, row_size, nrows, dptr.dst, dptr.src);
|
||||
q->push_idx = (q->push_idx + 1) & q->idx_mask;
|
||||
@@ -175,7 +179,7 @@ static inline bool dma_queue_push_single_2d(dma_queue * q, dma_ptr dptr, size_t
|
||||
desc->dst_bypass = dma_dst_l2_bypass_on;
|
||||
desc->src_comp = 0;
|
||||
desc->dst_comp = 0;
|
||||
desc->order = 1;
|
||||
desc->order = 0;
|
||||
desc->done = 0;
|
||||
desc->src_stride = src_stride;
|
||||
desc->dst_stride = dst_stride;
|
||||
@@ -197,8 +201,12 @@ static inline bool dma_queue_push_single_2d(dma_queue * q, dma_ptr dptr, size_t
|
||||
|
||||
q->dptr[q->push_idx] = dptr;
|
||||
|
||||
dmlink(q->tail, desc);
|
||||
q->tail = desc;
|
||||
if (nrows) {
|
||||
dmlink(q->tail, desc);
|
||||
q->tail = desc;
|
||||
} else {
|
||||
desc->done = 1;
|
||||
}
|
||||
|
||||
// FARF(ERROR, "dma-push: i %u row-size %u nrows %d dst %p src %p\n", q->push_idx, row_size, nrows, dptr.dst, dptr.src);
|
||||
q->push_idx = (q->push_idx + 1) & q->idx_mask;
|
||||
@@ -215,12 +223,9 @@ static inline dma_ptr dma_queue_pop(dma_queue * q) {
|
||||
dma_descriptor_2d * desc = &q->desc[q->pop_idx];
|
||||
|
||||
// Wait for desc to complete
|
||||
while (1) {
|
||||
dmpoll();
|
||||
if (desc->done) {
|
||||
break;
|
||||
}
|
||||
while (!desc->done) {
|
||||
// FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx);
|
||||
dmpoll();
|
||||
}
|
||||
|
||||
dptr = q->dptr[q->pop_idx];
|
||||
@@ -312,6 +317,54 @@ static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q, dma_ptr dptr, size_
|
||||
return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows);
|
||||
}
|
||||
|
||||
#define DMA_CACHE_MAX_SIZE 64U
|
||||
|
||||
typedef struct {
|
||||
uint8_t *base;
|
||||
uint32_t line_size;
|
||||
uint32_t capacity;
|
||||
uint32_t src[DMA_CACHE_MAX_SIZE];
|
||||
uint16_t age[DMA_CACHE_MAX_SIZE];
|
||||
} dma_cache;
|
||||
|
||||
static inline void dma_cache_init(dma_cache *c, uint8_t *base, uint32_t line_size, uint32_t capacity)
|
||||
{
|
||||
c->capacity = (capacity > DMA_CACHE_MAX_SIZE) ? DMA_CACHE_MAX_SIZE : capacity;
|
||||
c->base = base;
|
||||
c->line_size = line_size;
|
||||
|
||||
for (unsigned i=0; i < c->capacity; i++) {
|
||||
c->src[i] = 0;
|
||||
c->age[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool dma_cache_push(dma_queue *q, dma_cache *c, const uint8_t * src, uint32_t dst_stride, uint32_t src_stride, uint32_t row_size, uint32_t nrows)
|
||||
{
|
||||
uint32_t o_idx = 0;
|
||||
uint16_t o_age = 0;
|
||||
uint8_t * dst = 0;
|
||||
|
||||
for (unsigned i=0; i < c->capacity; i++) {
|
||||
if (c->src[i] == (uint32_t) src) {
|
||||
c->age[i] = 0;
|
||||
dst = c->base + (i * c->line_size); nrows = 0; // dummy dma
|
||||
// FARF(ERROR, "dma-cache: found %p", src);
|
||||
} else {
|
||||
c->age[i]++;
|
||||
if (c->age[i] > o_age) { o_age = c->age[i]; o_idx = i; }
|
||||
}
|
||||
}
|
||||
if (!dst) {
|
||||
// FARF(ERROR, "dma-cache: replacing #%u : age %u %p -> %p", o_idx, c->age[o_idx], (void *) c->src[o_idx], src);
|
||||
c->age[o_idx] = 0;
|
||||
c->src[o_idx] = (uint32_t) src;
|
||||
dst = c->base + o_idx * c->line_size; // normal nrows dma
|
||||
}
|
||||
|
||||
return dma_queue_push(q, dma_make_ptr(dst, src), dst_stride, src_stride, row_size, nrows);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
@@ -333,8 +333,8 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) {
|
||||
// (unsigned) HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - rctx->t_start));
|
||||
}
|
||||
|
||||
// Skip DMA transactions from prev block (if any)
|
||||
// No need to wait for these since the DMA is setup for in-order processing
|
||||
// Skip output DMA transactions from prev block (if any)
|
||||
// No need to wait for those here since we're explicitly waiting for the latest prefecthes below.
|
||||
for (uint32_t d=0; d < dma_depth; d++) { dma_queue_pop_nowait(dma_queue); }
|
||||
|
||||
// Compute loop
|
||||
|
||||
@@ -14,12 +14,12 @@ except ImportError:
|
||||
SentencePieceProcessor: Any = None
|
||||
|
||||
try:
|
||||
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # type: ignore[import-not-found]
|
||||
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found]
|
||||
from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found]
|
||||
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # type: ignore[import-not-found, ty:unresolved-import]
|
||||
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found, ty:unresolved-import]
|
||||
from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found, ty:unresolved-import]
|
||||
_filter_valid_tokenizer_files,
|
||||
)
|
||||
from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found]
|
||||
from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found, ty:unresolved-import]
|
||||
SentencePieceTokenizer,
|
||||
)
|
||||
except ImportError:
|
||||
@@ -32,7 +32,7 @@ else:
|
||||
_mistral_common_installed = True
|
||||
|
||||
try:
|
||||
from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found]
|
||||
from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found, ty:unresolved-import]
|
||||
get_one_valid_tokenizer_file,
|
||||
)
|
||||
except ImportError:
|
||||
|
||||
@@ -147,7 +147,7 @@ ranges_nfd: list[tuple[int, int, int]] = [(0, 0, 0)] # start, last, nfd
|
||||
for codepoint, norm in table_nfd:
|
||||
start = ranges_nfd[-1][0]
|
||||
if ranges_nfd[-1] != (start, codepoint - 1, norm):
|
||||
ranges_nfd.append(None) # type: ignore[arg-type] # dummy, will be replaced below
|
||||
ranges_nfd.append((0, 0, 0)) # dummy, will be replaced below
|
||||
start = codepoint
|
||||
ranges_nfd[-1] = (start, codepoint, norm)
|
||||
|
||||
|
||||
@@ -557,6 +557,8 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
||||
LLM_TENSOR_OUTPUT_NORM,
|
||||
LLM_TENSOR_OUTPUT,
|
||||
LLM_TENSOR_ROPE_FREQS,
|
||||
LLM_TENSOR_ROPE_FACTORS_LONG,
|
||||
LLM_TENSOR_ROPE_FACTORS_SHORT,
|
||||
LLM_TENSOR_ATTN_NORM,
|
||||
LLM_TENSOR_ATTN_Q,
|
||||
LLM_TENSOR_ATTN_K,
|
||||
|
||||
@@ -1158,6 +1158,12 @@ struct ggml_tensor * llama_model_loader::create_tensor(
|
||||
if (overrides->buft == ggml_backend_cpu_buffer_type()) {
|
||||
// when overriding to a CPU buffer, consider the extra buffer types
|
||||
buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu);
|
||||
if (use_mmap) {
|
||||
static std::once_flag once;
|
||||
std::call_once(once, [] {
|
||||
LLAMA_LOG_WARN("llama_model_loader: tensor overrides to CPU are used with mmap enabled - consider using --no-mmap for better performance\n");
|
||||
});
|
||||
}
|
||||
} else {
|
||||
buft = overrides->buft;
|
||||
}
|
||||
|
||||
Binary file not shown.
@@ -32,13 +32,22 @@ static server_http_res_ptr proxy_request(const server_http_req & req, std::strin
|
||||
|
||||
SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), parsed_url.host.c_str(), parsed_url.port, parsed_url.path.c_str());
|
||||
|
||||
std::map<std::string, std::string> headers;
|
||||
for (auto [key, value] : req.headers) {
|
||||
auto new_key = key;
|
||||
if (string_starts_with(new_key, "X-Proxy-Header-")) {
|
||||
string_replace_all(new_key, "X-Proxy-Header-", "");
|
||||
}
|
||||
headers[new_key] = value;
|
||||
}
|
||||
|
||||
auto proxy = std::make_unique<server_http_proxy>(
|
||||
method,
|
||||
parsed_url.scheme,
|
||||
parsed_url.host,
|
||||
parsed_url.port,
|
||||
parsed_url.path,
|
||||
req.headers,
|
||||
headers,
|
||||
req.body,
|
||||
req.should_stop,
|
||||
600, // timeout_read (default to 10 minutes)
|
||||
|
||||
@@ -35,7 +35,7 @@ using server_http_res_ptr = std::unique_ptr<server_http_res>;
|
||||
|
||||
struct server_http_req {
|
||||
std::map<std::string, std::string> params; // path_params + query_params
|
||||
std::map<std::string, std::string> headers; // reserved for future use
|
||||
std::map<std::string, std::string> headers; // used by MCP proxy
|
||||
std::string path;
|
||||
std::string query_string; // query parameters string (e.g. "action=save")
|
||||
std::string body;
|
||||
|
||||
@@ -116,7 +116,7 @@ class ServerProcess:
|
||||
self.server_port = int(os.environ["PORT"])
|
||||
self.external_server = "DEBUG_EXTERNAL" in os.environ
|
||||
|
||||
def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
|
||||
def start(self, timeout_seconds: int = DEFAULT_HTTP_TIMEOUT) -> None:
|
||||
if self.external_server:
|
||||
print(f"[external_server]: Assuming external server running on {self.server_host}:{self.server_port}")
|
||||
return
|
||||
|
||||
@@ -39,7 +39,13 @@ import type {
|
||||
MCPResourceContent,
|
||||
MCPReadResourceResult
|
||||
} from '$lib/types';
|
||||
import { buildProxiedUrl, throwIfAborted, isAbortError, createBase64DataUrl } from '$lib/utils';
|
||||
import {
|
||||
buildProxiedUrl,
|
||||
buildProxiedHeaders,
|
||||
throwIfAborted,
|
||||
isAbortError,
|
||||
createBase64DataUrl
|
||||
} from '$lib/utils';
|
||||
|
||||
interface ToolResultContentItem {
|
||||
type: string;
|
||||
@@ -118,7 +124,7 @@ export class MCPService {
|
||||
const requestInit: RequestInit = {};
|
||||
|
||||
if (config.headers) {
|
||||
requestInit.headers = config.headers;
|
||||
requestInit.headers = buildProxiedHeaders(config.headers);
|
||||
}
|
||||
|
||||
if (config.credentials) {
|
||||
|
||||
@@ -19,6 +19,21 @@ export function buildProxiedUrl(targetUrl: string): URL {
|
||||
return proxyUrl;
|
||||
}
|
||||
|
||||
/**
|
||||
* Wrap original headers for proxying through the CORS proxy. This avoids issues with duplicated llama.cpp-specific and target headers when using the CORS proxy.
|
||||
* @param headers - The original headers to be proxied to target
|
||||
* @returns List of "wrapped" headers to be sent to the CORS proxy
|
||||
*/
|
||||
export function buildProxiedHeaders(headers: Record<string, string>): Record<string, string> {
|
||||
const proxiedHeaders: Record<string, string> = {};
|
||||
|
||||
for (const [key, value] of Object.entries(headers)) {
|
||||
proxiedHeaders[`X-Proxy-Header-${key}`] = value;
|
||||
}
|
||||
|
||||
return proxiedHeaders;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a proxied URL string for use in fetch requests.
|
||||
* @param targetUrl - The original URL to proxy
|
||||
|
||||
@@ -38,7 +38,7 @@ export { highlightCode, detectIncompleteCodeBlock, type IncompleteCodeBlock } fr
|
||||
export { setConfigValue, getConfigValue, configToParameterRecord } from './config-helpers';
|
||||
|
||||
// CORS Proxy
|
||||
export { buildProxiedUrl, getProxiedUrlString } from './cors-proxy';
|
||||
export { buildProxiedUrl, getProxiedUrlString, buildProxiedHeaders } from './cors-proxy';
|
||||
|
||||
// Conversation utilities
|
||||
export { createMessageCountMap, getMessageCount } from './conversation-utils';
|
||||
|
||||
Reference in New Issue
Block a user