Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-02-12 14:03:20 +02:00)

Compare commits: b7310...gg/cast-re (21 commits)
| Author | SHA1 | Date | |
|---|---|---|---|
| | 2a615b27e4 | | |
| | 0cdce38a97 | | |
| | e39502e74b | | |
| | 1d2a1ab73d | | |
| | c8554b66e0 | | |
| | 2fa51c19b0 | | |
| | 951520ddb0 | | |
| | 68522c678d | | |
| | f896d2c34f | | |
| | e4e9c4329c | | |
| | 636fc17a37 | | |
| | 51e0c2d917 | | |
| | 37a4f63244 | | |
| | 2bc96931d2 | | |
| | 5814b4dce1 | | |
| | 79d61896d3 | | |
| | 4d3726278b | | |
| | 08f9d3cc1d | | |
| | 0a540f9abd | | |
| | 22577583a3 | | |
| | d9e03db1e7 | | |
31 .github/actions/windows-setup-cuda/action.yml (vendored)
@@ -65,3 +65,34 @@ runs:
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
- name: Install Cuda Toolkit 13.1
|
||||
if: ${{ inputs.cuda_version == '13.1' }}
|
||||
shell: pwsh
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_crt/windows-x86_64/cuda_crt-windows-x86_64-13.1.80-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-13.1.80-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-13.1.80-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-13.1.80-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-13.2.0.9-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libnvvm/windows-x86_64/libnvvm-windows-x86_64-13.1.80-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-13.1.68-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-13.1.80-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-13.1.68-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-13.1.78-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_crt-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cudart-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvcc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvrtc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\libcublas-windows-x86_64-13.2.0.9-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\libnvvm-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvtx-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_profiler_api-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\visual_studio_integration-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cccl-windows-x86_64-13.1.78-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V13_1=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
1 .github/workflows/build-linux-cross.yml (vendored)
@@ -291,6 +291,7 @@ jobs:
-DGGML_RVV=ON \
-DGGML_RV_ZFH=ON \
-DGGML_RV_ZICBOP=ON \
-DGGML_RV_ZIHINTPAUSE=ON \
-DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
-DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake

8 .github/workflows/release.yml (vendored)
@@ -421,7 +421,7 @@ jobs:

strategy:
matrix:
cuda: ['12.4']
cuda: ['12.4', '13.1']

steps:
- name: Clone
@@ -476,6 +476,7 @@ jobs:
$dst='.\build\bin\cudart\'
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
robocopy "${{env.CUDA_PATH}}\bin\x64" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*

- name: Upload Cuda runtime
@@ -545,6 +546,8 @@ jobs:
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin

cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
@@ -835,7 +838,8 @@ jobs:
**Windows:**
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
- [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
- [Windows x64 (CUDA)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip)
- [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip)
- [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip)
- [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
- [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
- [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)

@@ -61,7 +61,7 @@ range of hardware - locally and in the cloud.
- Plain C/C++ implementation without any dependencies
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2, AVX512 and AMX support for x86 architectures
- RVV, ZVFH, ZFH and ZICBOP support for RISC-V architectures
- RVV, ZVFH, ZFH, ZICBOP and ZIHINTPAUSE support for RISC-V architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
- Vulkan and SYCL backend support

@@ -708,6 +708,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.use_jinja = true;
|
||||
}
|
||||
|
||||
params.use_color = tty_can_use_colors();
|
||||
|
||||
// load dynamic backends
|
||||
ggml_backend_load_all();
|
||||
|
||||
@@ -790,10 +792,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
add_opt(common_arg(
|
||||
{"-co", "--color"},
|
||||
string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.use_color = true;
|
||||
{"-co", "--color"}, "[on|off|auto]",
|
||||
"Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
|
||||
"'auto' enables colors when output is to a terminal",
|
||||
[](common_params & params, const std::string & value) {
|
||||
if (is_truthy(value)) {
|
||||
params.use_color = true;
|
||||
} else if (is_falsey(value)) {
|
||||
params.use_color = false;
|
||||
} else if (is_autoy(value)) {
|
||||
params.use_color = tty_can_use_colors();
|
||||
} else {
|
||||
throw std::invalid_argument(
|
||||
string_format("error: unknown value for --color: '%s'\n", value.c_str()));
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
||||
add_opt(common_arg(
|
||||
@@ -1022,7 +1034,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
|
||||
} else {
|
||||
throw std::runtime_error(
|
||||
string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
|
||||
string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
|
||||
}
|
||||
}).set_env("LLAMA_ARG_FLASH_ATTN"));
|
||||
add_opt(common_arg(
|
||||
@@ -2696,7 +2708,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
|
||||
} else {
|
||||
throw std::invalid_argument(
|
||||
string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
|
||||
string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
|
||||
}
|
||||
}
|
||||
).set_env("LLAMA_LOG_COLORS"));
|
||||
|
||||
@@ -724,16 +724,10 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
|
||||
if (reasoning_unclosed) {
|
||||
if (auto pos = content.find(end_think); pos == std::string::npos && builder.pos() != builder.input().size()) {
|
||||
unclosed_reasoning_content += content;
|
||||
if (form.allow_toolcall_in_think) {
|
||||
builder.move_to(tc->groups[0].begin);
|
||||
if (!builder.try_consume_xml_tool_calls(form)) {
|
||||
unclosed_reasoning_content += tool_call_start;
|
||||
builder.move_to(tc->groups[0].end);
|
||||
}
|
||||
} else {
|
||||
if (!(form.allow_toolcall_in_think && tc)) {
|
||||
unclosed_reasoning_content += tool_call_start;
|
||||
continue;
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
reasoning_unclosed = false;
|
||||
std::string reasoning_content;
|
||||
@@ -781,8 +775,12 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
|
||||
}
|
||||
} else {
|
||||
// This <tool_call> start is in thinking block, skip this tool call
|
||||
auto pos = think_start + start_think.size();
|
||||
unclosed_reasoning_content = content.substr(pos) + tool_call_start;
|
||||
// This <tool_call> start is in thinking block
|
||||
if (form.allow_toolcall_in_think) {
|
||||
unclosed_reasoning_content = content.substr(think_start + start_think.size());
|
||||
} else {
|
||||
unclosed_reasoning_content = content.substr(think_start + start_think.size()) + tool_call_start;
|
||||
}
|
||||
reasoning_unclosed = true;
|
||||
content.resize(think_start);
|
||||
toolcall_in_think = true;
|
||||
@@ -805,14 +803,35 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
|
||||
}
|
||||
|
||||
// remove potential partial suffix
|
||||
if (content.size() > 0 && builder.pos() == builder.input().size() && unclosed_reasoning_content.empty()) {
|
||||
rstrip(content);
|
||||
trim_potential_partial_word(content);
|
||||
rstrip(content);
|
||||
if (builder.pos() == builder.input().size()) {
|
||||
if (unclosed_reasoning_content.empty()) {
|
||||
rstrip(content);
|
||||
trim_potential_partial_word(content);
|
||||
rstrip(content);
|
||||
} else {
|
||||
rstrip(unclosed_reasoning_content);
|
||||
trim_potential_partial_word(unclosed_reasoning_content);
|
||||
rstrip(unclosed_reasoning_content);
|
||||
}
|
||||
}
|
||||
|
||||
// consume unclosed_reasoning_content if allow_toolcall_in_think is set
|
||||
if (form.allow_toolcall_in_think && !unclosed_reasoning_content.empty()) {
|
||||
if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
|
||||
builder.add_reasoning_content(unclosed_reasoning_content);
|
||||
} else {
|
||||
if (content.empty()) {
|
||||
content = start_think + unclosed_reasoning_content;
|
||||
} else {
|
||||
content += "\n\n" + start_think;
|
||||
content += unclosed_reasoning_content;
|
||||
}
|
||||
}
|
||||
unclosed_reasoning_content.clear();
|
||||
}
|
||||
|
||||
// Add content
|
||||
if (content.size() != 0) {
|
||||
if (!content.empty()) {
|
||||
// If there are multiple content blocks
|
||||
if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content && builder.result().content.size() != 0) {
|
||||
builder.add_content("\n\n");
|
||||
@@ -820,7 +839,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
|
||||
builder.add_content(content);
|
||||
}
|
||||
|
||||
// This <tool_call> start is in thinking block, skip this tool call
|
||||
// This <tool_call> start is in thinking block and toolcall_in_think not set, skip this tool call
|
||||
if (toolcall_in_think && !form.allow_toolcall_in_think) {
|
||||
continue;
|
||||
}
|
||||
@@ -829,7 +848,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
|
||||
if (!tc) {
|
||||
GGML_ASSERT(builder.pos() == builder.input().size());
|
||||
GGML_ASSERT(unclosed_reasoning_content.empty());
|
||||
GGML_ASSERT(!reasoning_unclosed);
|
||||
if (!form.allow_toolcall_in_think) GGML_ASSERT(!reasoning_unclosed);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -854,7 +873,6 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
|
||||
|
||||
/**
|
||||
* Parse content uses reasoning and XML-Style tool call
|
||||
* TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
|
||||
*/
|
||||
void common_chat_msg_parser::consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think, const std::string & end_think) {
|
||||
parse_msg_with_xml_tool_calls(*this, form, start_think, end_think);
|
||||
|
||||
@@ -31,7 +31,7 @@ struct xml_tool_call_format {
|
||||
std::optional<std::string> last_val_end = std::nullopt;
|
||||
std::optional<std::string> last_tool_end = std::nullopt;
|
||||
bool trim_raw_argval = false;
|
||||
bool allow_toolcall_in_think = false; // TODO: UNTESTED!!!
|
||||
bool allow_toolcall_in_think = false;
|
||||
};
|
||||
|
||||
// make a GBNF that accept any strings except those containing any of the forbidden strings.
|
||||
|
||||
@@ -917,12 +917,13 @@ static void common_chat_parse_kimi_k2(common_chat_msg_parser & builder) {
form.tool_start = "<|tool_call_begin|>";
form.tool_sep = "<|tool_call_argument_begin|>{";
form.key_start = "\"";
form.key_val_sep = "\": ";
form.val_end = ", ";
form.key_val_sep = "\":";
form.val_end = ",";
form.tool_end = "}<|tool_call_end|>";
form.scope_end = "<|tool_calls_section_end|>";
form.raw_argval = false;
form.last_val_end = "";
form.allow_toolcall_in_think = true;
return form;
})();
builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");

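For orientation, the form above corresponds to a tool-call section of roughly the following shape; the delimiters come from the form fields and from the bundled Kimi-K2 chat template later in this diff, while the function name, id counter, and arguments are invented for illustration:

```
<think> ... reasoning (a tool call may also appear here, since allow_toolcall_in_think is set) ... </think>
<|tool_calls_section_begin|>
<|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"city":"Tokyo"}<|tool_call_end|>
<|tool_calls_section_end|>
```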
@@ -982,6 +982,32 @@ std::vector<common_file_info> fs_list(const std::string & path, bool include_dir
return files;
}

//
// TTY utils
//

bool tty_can_use_colors() {
// Check NO_COLOR environment variable (https://no-color.org/)
if (const char * no_color = std::getenv("NO_COLOR")) {
if (no_color[0] != '\0') {
return false;
}
}

// Check TERM environment variable
if (const char * term = std::getenv("TERM")) {
if (std::strcmp(term, "dumb") == 0) {
return false;
}
}

// Check if stdout and stderr are connected to a terminal
// We check both because log messages can go to either
bool stdout_is_tty = isatty(fileno(stdout));
bool stderr_is_tty = isatty(fileno(stderr));

return stdout_is_tty || stderr_is_tty;
}

//
// Model utils

@@ -655,6 +655,13 @@ struct common_file_info {
};
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);

//
// TTY utils
//

// Auto-detect if colors can be enabled based on terminal and environment
bool tty_can_use_colors();

//
// Model utils
//

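A minimal sketch of how a caller might use the new helper, assuming only the `common.h` declaration above; the program itself is illustrative and not part of the patch:

```
#include <cstdio>

#include "common.h"

int main() {
    // false when NO_COLOR is set to a non-empty value, TERM is "dumb",
    // or neither stdout nor stderr is attached to a terminal
    const bool use_color = tty_can_use_colors();
    std::printf("colors %s\n", use_color ? "enabled" : "disabled");
    // an explicit --color on/off flag would simply overwrite this auto-detected default
    return 0;
}
```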
@@ -1,3 +1,4 @@
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <chrono>
|
||||
@@ -26,30 +27,6 @@ void common_log_set_verbosity_thold(int verbosity) {
|
||||
common_log_verbosity_thold = verbosity;
|
||||
}
|
||||
|
||||
// Auto-detect if colors should be enabled based on terminal and environment
|
||||
static bool common_log_should_use_colors_auto() {
|
||||
// Check NO_COLOR environment variable (https://no-color.org/)
|
||||
if (const char * no_color = std::getenv("NO_COLOR")) {
|
||||
if (no_color[0] != '\0') {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check TERM environment variable
|
||||
if (const char * term = std::getenv("TERM")) {
|
||||
if (std::strcmp(term, "dumb") == 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if stdout and stderr are connected to a terminal
|
||||
// We check both because log messages can go to either
|
||||
bool stdout_is_tty = isatty(fileno(stdout));
|
||||
bool stderr_is_tty = isatty(fileno(stderr));
|
||||
|
||||
return stdout_is_tty || stderr_is_tty;
|
||||
}
|
||||
|
||||
static int64_t t_us() {
|
||||
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
|
||||
}
|
||||
@@ -391,7 +368,7 @@ struct common_log * common_log_main() {
|
||||
static std::once_flag init_flag;
|
||||
std::call_once(init_flag, [&]() {
|
||||
// Set default to auto-detect colors
|
||||
log.set_colors(common_log_should_use_colors_auto());
|
||||
log.set_colors(tty_can_use_colors());
|
||||
});
|
||||
|
||||
return &log;
|
||||
@@ -422,7 +399,7 @@ void common_log_set_file(struct common_log * log, const char * file) {
|
||||
|
||||
void common_log_set_colors(struct common_log * log, log_colors colors) {
|
||||
if (colors == LOG_COLORS_AUTO) {
|
||||
log->set_colors(common_log_should_use_colors_auto());
|
||||
log->set_colors(tty_can_use_colors());
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -5825,9 +5825,11 @@ class Gemma3Model(TextModel):
|
||||
norm_shift = 1.0 # Gemma3RMSNorm adds 1.0 to the norm value
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_sentencepiece()
|
||||
|
||||
self.gguf_writer.add_add_space_prefix(False)
|
||||
if (self.dir_model / "tokenizer.model").is_file():
|
||||
self._set_vocab_sentencepiece()
|
||||
self.gguf_writer.add_add_space_prefix(False)
|
||||
else:
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
hparams = self.hparams
|
||||
@@ -5845,13 +5847,24 @@ class Gemma3Model(TextModel):
|
||||
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
|
||||
# attn_logit_softcapping is removed in Gemma3
|
||||
assert hparams.get("attn_logit_softcapping") is None
|
||||
self.gguf_writer.add_sliding_window(hparams["sliding_window"])
|
||||
if (final_logit_softcap := hparams.get("final_logit_softcapping")):
|
||||
self.gguf_writer.add_final_logit_softcapping(final_logit_softcap)
|
||||
if hparams.get("sliding_window_pattern") != 1:
|
||||
self.gguf_writer.add_sliding_window(hparams["sliding_window"])
|
||||
self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
|
||||
if hparams.get("rope_scaling") is not None:
|
||||
assert hparams["rope_scaling"]["rope_type"] == "linear"
|
||||
# important: this rope_scaling is only applied for global layers, and not used by 1B model
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
|
||||
rope_scaling = hparams["rope_scaling"]
|
||||
if rope_scaling["rope_type"] == "linear":
|
||||
# important: this rope_scaling is only applied for global layers, and not used by 1B model
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
elif rope_scaling["rope_type"] == "yarn":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
self.gguf_writer.add_rope_scaling_yarn_ext_factor(rope_scaling["extrapolation_factor"])
|
||||
self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_scaling["beta_fast"])
|
||||
self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_scaling["beta_slow"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
@@ -5865,8 +5878,10 @@ class Gemma3Model(TextModel):
|
||||
|
||||
# remove OOV (out-of-vocabulary) rows in token_embd
|
||||
if "embed_tokens.weight" in name:
|
||||
vocab = self._create_vocab_sentencepiece()
|
||||
tokens = vocab[0]
|
||||
if (self.dir_model / "tokenizer.model").is_file():
|
||||
tokens = self._create_vocab_sentencepiece()[0]
|
||||
else:
|
||||
tokens = self.get_vocab_base()[0]
|
||||
data_torch = data_torch[:len(tokens)]
|
||||
|
||||
# ref code in Gemma3RMSNorm
|
||||
|
||||
@@ -19,6 +19,7 @@ cmake -B build \
-DGGML_RVV=ON \
-DGGML_RV_ZFH=ON \
-DGGML_RV_ZICBOP=ON \
-DGGML_RV_ZIHINTPAUSE=ON \
-DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
-DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake \
-DCMAKE_INSTALL_PREFIX=build/installed

@@ -144,7 +144,7 @@ int main(int argc, char ** argv) {
return 1;
}
std::string s(buf, n);
printf("%s", s.c_str());
printf("%s (%d)", s.c_str(), id);
}
printf("\n");

@@ -168,6 +168,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
option(GGML_RV_ZIHINTPAUSE "ggml: enable riscv zihintpause " ON)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE "ggml: enable vxe" ${GGML_NATIVE})

@@ -469,6 +469,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (GGML_RV_ZICBOP)
string(APPEND MARCH_STR "_zicbop")
endif()
if (GGML_RV_ZIHINTPAUSE)
string(APPEND MARCH_STR "_zihintpause")
endif()
list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
else()
# Begin with the lowest baseline

@@ -490,6 +490,15 @@ static inline void ggml_thread_cpu_relax(void) {
static inline void ggml_thread_cpu_relax(void) {
_mm_pause();
}
#elif defined(__riscv)
static inline void ggml_thread_cpu_relax(void) {
#ifdef __riscv_zihintpause
__asm__ __volatile__ ("pause");
#else
/* Encoding of the pause instruction */
__asm__ __volatile__ (".4byte 0x100000F");
#endif
}
#else
static inline void ggml_thread_cpu_relax(void) {;}
#endif

@@ -564,6 +564,12 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < nbatch_fa; i_KQ_0 += np*warp_size) {
|
||||
const int i_KQ = i_KQ_0 + (threadIdx.y % np)*warp_size + threadIdx.x;
|
||||
|
||||
#if defined(FAST_FP16_AVAILABLE) && !defined(V_DOT2_F32_F16_AVAILABLE)
|
||||
// Without the v_dot2_f32_f16 instruction there is a higher risk of numerical overflow in the KQ calculation.
|
||||
// Therefore, scale down Q values and apply the inverse scale the FP32 KQ values afterwards again.
|
||||
KQ_acc[i_KQ_0/(np*warp_size)*cpw + jc0] *= 4.0f;
|
||||
#endif // defined(FAST_FP16_AVAILABLE) && !defined(V_DOT2_F32_F16_AVAILABLE)
|
||||
|
||||
if (use_logit_softcap) {
|
||||
KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0] = logit_softcap * tanhf(KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0]);
|
||||
}
|
||||
@@ -858,6 +864,11 @@ static __global__ void flash_attn_tile(
|
||||
#pragma unroll
|
||||
for (int i1 = 0; i1 < cpy_ne_D; i1 += 2) {
|
||||
tmp_h2[i1/2] = make_half2(tmp_f[i1 + 0], tmp_f[i1 + 1]);
|
||||
#if defined(FAST_FP16_AVAILABLE) && !defined(V_DOT2_F32_F16_AVAILABLE)
|
||||
// Without the v_dot2_f32_f16 instruction there is a higher risk of numerical overflow in the KQ calculation.
|
||||
// Therefore, scale down Q values and apply the inverse scale the FP32 KQ values afterwards again.
|
||||
tmp_h2[i1/2] *= make_half2(0.25f, 0.25f);
|
||||
#endif // defined(FAST_FP16_AVAILABLE) && !defined(V_DOT2_F32_F16_AVAILABLE)
|
||||
}
|
||||
ggml_cuda_memcpy_1<sizeof(tmp_h2)>(
|
||||
&Q_tmp[jc*(DKQ/2) + i0/2 + (threadIdx.y % np)*(warp_size*cpy_ne_D/2) + threadIdx.x*(cpy_ne_D/2)],
|
||||
|
||||
37 ggml/src/ggml-cuda/fill.cu (new file)
@@ -0,0 +1,37 @@
#include "fill.cuh"
#include "convert.cuh"

#define CUDA_FILL_BLOCK_SIZE 256

template <typename T>
static __global__ void fill_kernel(T * __restrict__ dst, const int64_t k, const T value) {
const int64_t i = (int64_t)blockDim.x * blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = value;
}

void ggml_cuda_op_fill(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
void * dst_d = dst->data;
cudaStream_t stream = ctx.stream();

GGML_ASSERT(ggml_is_contiguous(dst));

float value;
memcpy(&value, dst->op_params, sizeof(float));

const int64_t k = ggml_nelements(dst);
const int64_t num_blocks = (k + CUDA_FILL_BLOCK_SIZE - 1) / CUDA_FILL_BLOCK_SIZE;

switch (dst->type) {
case GGML_TYPE_F32:
fill_kernel<<<num_blocks, CUDA_FILL_BLOCK_SIZE, 0, stream>>>((float *)dst_d, k, value);
break;
case GGML_TYPE_F16:
fill_kernel<<<num_blocks, CUDA_FILL_BLOCK_SIZE, 0, stream>>>((half *)dst_d, k, ggml_cuda_cast<half>(value));
break;
default:
GGML_ABORT("unsupported type");
}
}
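The launch configuration above is a plain round-up division of the element count by the block size; a quick worked example with a made-up tensor size (not part of the patch):

```
#include <cstdint>

constexpr int64_t CUDA_FILL_BLOCK_SIZE = 256;
constexpr int64_t k          = 1000;  // hypothetical element count
constexpr int64_t num_blocks = (k + CUDA_FILL_BLOCK_SIZE - 1) / CUDA_FILL_BLOCK_SIZE;
// 4 blocks * 256 threads cover indices 0..1023; the `i >= k` guard in the kernel
// makes the 24 surplus threads return without writing.
static_assert(num_blocks == 4, "ceil(1000 / 256) == 4");
```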
3 ggml/src/ggml-cuda/fill.cuh (new file)
@@ -0,0 +1,3 @@
#include "common.cuh"

void ggml_cuda_op_fill(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -56,6 +56,7 @@
#include "ggml-cuda/solve_tri.cuh"
#include "ggml-cuda/tri.cuh"
#include "ggml-cuda/cumsum.cuh"
#include "ggml-cuda/fill.cuh"
#include "ggml.h"

#include <algorithm>
@@ -2730,6 +2731,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_SOLVE_TRI:
ggml_cuda_op_solve_tri(ctx, dst);
break;
case GGML_OP_FILL:
ggml_cuda_op_fill(ctx, dst);
break;
default:
return false;
}
@@ -4617,6 +4621,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
case GGML_OP_OPT_STEP_ADAMW:
case GGML_OP_OPT_STEP_SGD:
case GGML_OP_FILL:
case GGML_OP_CUMSUM:
case GGML_OP_TRI:
return true;

@@ -3,7 +3,6 @@
|
||||
#include "solve_tri.cuh"
|
||||
|
||||
#define MAX_N_FAST 64
|
||||
#define MAX_K_FAST 32
|
||||
|
||||
// ======================
|
||||
// Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction
|
||||
@@ -48,65 +47,58 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
|
||||
float * X_batch = (float *) (X + i02 * nb2 + i03 * nb3);
|
||||
|
||||
__shared__ float sA[MAX_N_FAST * MAX_N_FAST];
|
||||
__shared__ float sXt[MAX_N_FAST * (MAX_K_FAST + 1)];
|
||||
|
||||
const int offset = threadIdx.x + threadIdx.y * blockDim.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < n * n; i += k * WARP_SIZE) {
|
||||
int i0 = i + offset;
|
||||
const int i0 = i + offset;
|
||||
if (i0 < n * n) {
|
||||
sA[i0] = A_batch[i0];
|
||||
}
|
||||
}
|
||||
|
||||
const int rows_per_warp = (n + WARP_SIZE - 1) / WARP_SIZE;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < rows_per_warp; i++) {
|
||||
const int i0 = lane + i * WARP_SIZE;
|
||||
if (i0 < n) {
|
||||
sXt[col_idx * n + i0] = B_batch[i0 * k + col_idx];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
float x_low = (lane < n) ? B_batch[lane * k + col_idx] : 0.0f;
|
||||
float x_high = (WARP_SIZE + lane < n) ? B_batch[(WARP_SIZE + lane) * k + col_idx] : 0.0f;
|
||||
|
||||
const int half = WARP_SIZE;
|
||||
const int nrows_low = (n < half) ? n : half;
|
||||
|
||||
#pragma unroll
|
||||
for (int row = 0; row < n; ++row) {
|
||||
for (int row = 0; row < nrows_low; ++row) {
|
||||
float sum = 0.0f;
|
||||
|
||||
{
|
||||
int j = lane;
|
||||
if (j < row) {
|
||||
sum += sA[row * n + j] * sXt[col_idx * n + j];
|
||||
}
|
||||
if (lane < row) {
|
||||
sum += sA[row * n + lane] * x_low;
|
||||
}
|
||||
if (row >= WARP_SIZE) {
|
||||
int j = WARP_SIZE + lane;
|
||||
if (j < row) {
|
||||
sum += sA[row * n + j] * sXt[col_idx * n + j];
|
||||
}
|
||||
}
|
||||
|
||||
sum = warp_reduce_sum(sum);
|
||||
|
||||
if (lane == 0) {
|
||||
const float b_val = sXt[col_idx * n + row];
|
||||
const float a_diag = sA[row * n + row];
|
||||
// no safeguards for division by zero because that indicates corrupt
|
||||
// data anyway
|
||||
sXt[col_idx * n + row] = (b_val - sum) / a_diag;
|
||||
if (lane == row) {
|
||||
x_low = (x_low - sum) / sA[row * n + row];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
#pragma unroll
|
||||
for (int row = half; row < n; ++row) {
|
||||
float sum = sA[row * n + lane] * x_low;
|
||||
const int j = half + lane;
|
||||
if (j < row) {
|
||||
sum += sA[row * n + j] * x_high;
|
||||
}
|
||||
sum = warp_reduce_sum(sum);
|
||||
|
||||
if (lane == row - half) {
|
||||
x_high = (x_high - sum) / sA[row * n + row];
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < rows_per_warp; i++) {
|
||||
const int i0 = lane + i * WARP_SIZE;
|
||||
if (i0 < n) {
|
||||
X_batch[i0 * k + col_idx] = sXt[col_idx * n + i0];
|
||||
for (int rr = 0; rr < 2; ++rr) {
|
||||
const int row = rr * WARP_SIZE + lane;
|
||||
if (row < n) {
|
||||
const float val = (row < half) ? x_low : x_high;
|
||||
X_batch[row * k + col_idx] = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,6 +2,13 @@
|
||||
#include "dequantize.hpp"
|
||||
#include "presets.hpp"
|
||||
|
||||
#if defined(__INTEL_LLVM_COMPILER)
|
||||
#if __has_include(<sycl/ext/oneapi/bfloat16.hpp>)
|
||||
#include <sycl/ext/oneapi/bfloat16.hpp>
|
||||
#define GGML_SYCL_HAS_BF16
|
||||
#endif
|
||||
#endif
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
@@ -566,6 +573,10 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
|
||||
return dequantize_row_iq4_nl_sycl;
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_sycl<float>;
|
||||
#ifdef GGML_SYCL_HAS_BF16
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_sycl<sycl::ext::oneapi::bfloat16>;
|
||||
#endif
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
@@ -627,6 +638,10 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
|
||||
return dequantize_row_iq4_nl_sycl;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_sycl<sycl::half>;
|
||||
#ifdef GGML_SYCL_HAS_BF16
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_sycl<sycl::ext::oneapi::bfloat16>;
|
||||
#endif
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
@@ -636,6 +651,10 @@ to_fp16_nc_sycl_t get_to_fp16_nc_sycl(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_nc_sycl<float>;
|
||||
#ifdef GGML_SYCL_HAS_BF16
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_nc_sycl<sycl::ext::oneapi::bfloat16>;
|
||||
#endif
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -7,35 +7,85 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
|
||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i,
|
||||
const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
|
||||
// Compute starting index in matrix B for this superblock
|
||||
const uint y_idx = i * QUANT_K + 32 * ib32;
|
||||
|
||||
uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
|
||||
|
||||
// Precompute indices for quantization lookup tables
|
||||
const uint qh_base = 2 * ib32;
|
||||
const uint qs_base = 4 * ib32;
|
||||
const uint sc_index = ib32 / 2;
|
||||
const uint sc_shift = 6 * (ib32 & 1);
|
||||
|
||||
// Loop over rows in the superblock
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
// Load per-block scales and shift for quantization
|
||||
const uint16_t[4] scales = data_a[ibi].scales;
|
||||
const u16vec4 s = u16vec4(scales[0], scales[1], scales[2], scales[3]) >> 12;
|
||||
const float d = float(unpackHalf2x16(s.x | (s.y << 4) | (s.z << 8) | (s.w << 12)).x);
|
||||
const uint sc = data_a[ibi].scales[sc_index] >> sc_shift;
|
||||
|
||||
const uint sc = data_a[ibi].scales[ib32 / 2] >> (6 * (ib32 & 1));
|
||||
// Temporary caches for decoding
|
||||
FLOAT_TYPE dl_cache[4];
|
||||
uint16_t gvf_cache[4];
|
||||
float delta_cache[4];
|
||||
|
||||
// Precompute the multiplier and lookup values for 4 sub-blocks
|
||||
[[unroll]] for (uint l = 0; l < 4; ++l) {
|
||||
const uint qh = data_a[ibi].qh[2 * ib32 + l / 2] >> (4 * (l&1));
|
||||
const uint qs = data_a[ibi].qs[4 * ib32 + l];
|
||||
const float delta = ((qh & 8) != 0) ? -IQ1M_DELTA : IQ1M_DELTA;
|
||||
const float dl = d * (2 * bitfieldExtract(sc, 3 * int(l / 2), 3) + 1);
|
||||
dl_cache[l] = FLOAT_TYPE(d * (2 * bitfieldExtract(sc, 3 * int(l / 2), 3) + 1));
|
||||
const uint qh = data_a[ibi].qh[qh_base + l / 2] >> (4 * (l & 1));
|
||||
const uint qs = data_a[ibi].qs[qs_base + l];
|
||||
gvf_cache[l] = iq1s_grid[qs | ((qh & 7) << 8)];
|
||||
delta_cache[l] = ((qh & 8) != 0) ? -IQ1M_DELTA : IQ1M_DELTA;
|
||||
}
|
||||
|
||||
const int16_t grid = int16_t(iq1s_grid[qs | ((qh & 7) << 8)]);
|
||||
// Loop over columns of the output
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
// Compute base index for matrix B
|
||||
const uint base_b_idx = (j * p.batch_stride_b + b_offset + y_idx) / 4;
|
||||
vec4 b_vals[8];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
|
||||
vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
|
||||
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||
[[unroll]] for (int k = 0; k < 4; ++k) {
|
||||
sum = fma(FLOAT_TYPE(b0[k]), bitfieldExtract(grid, 2 * k, 2) + delta,
|
||||
fma(FLOAT_TYPE(b4[k]), bitfieldExtract(grid, 8 + 2 * k, 2) + delta, sum));
|
||||
}
|
||||
temp[j][n] = fma(dl, sum, temp[j][n]);
|
||||
// Load 8 vec4 values from matrix B
|
||||
[[unroll]] for (int idx = 0; idx < 8; ++idx) {
|
||||
b_vals[idx] = vec4(data_b_v4[base_b_idx + idx]);
|
||||
}
|
||||
|
||||
FLOAT_TYPE col_sum = FLOAT_TYPE(0.0);
|
||||
|
||||
// Loop over sub-blocks
|
||||
[[unroll]] for (uint l = 0; l < 4; ++l) {
|
||||
const uint16_t grid = gvf_cache[l];
|
||||
const float dl = dl_cache[l];
|
||||
|
||||
// Decode 8 2-bit fbits from gvf_cache
|
||||
float f0 = float(bitfieldExtract(grid, 0, 2));
|
||||
float f1 = float(bitfieldExtract(grid, 2, 2));
|
||||
float f2 = float(bitfieldExtract(grid, 4, 2));
|
||||
float f3 = float(bitfieldExtract(grid, 6, 2));
|
||||
float f4 = float(bitfieldExtract(grid, 8, 2));
|
||||
float f5 = float(bitfieldExtract(grid, 10, 2));
|
||||
float f6 = float(bitfieldExtract(grid, 12, 2));
|
||||
float f7 = float(bitfieldExtract(grid, 14, 2));
|
||||
|
||||
// Pack into vec4 for vectorized FMA
|
||||
const vec4 fbits_v0 = vec4(f0, f1, f2, f3);
|
||||
const vec4 fbits_v1 = vec4(f4, f5, f6, f7);
|
||||
const vec4 delta_v = vec4(delta_cache[l]);
|
||||
|
||||
// Vectorized fused multiply-add
|
||||
vec4 sum_v = fma(b_vals[2*l + 0], fbits_v0 + delta_v, vec4(0.0));
|
||||
sum_v = fma(b_vals[2*l + 1], fbits_v1 + delta_v, sum_v);
|
||||
|
||||
// Horizontal add to get scalar sum
|
||||
FLOAT_TYPE sum = sum_v.x + sum_v.y + sum_v.z + sum_v.w;
|
||||
|
||||
// Accumulate to column sum
|
||||
col_sum = fma(dl, sum, col_sum);
|
||||
}
|
||||
// Write result to temporary buffer
|
||||
temp[j][n] += col_sum;
|
||||
}
|
||||
ibi += num_blocks_per_row;
|
||||
}
|
||||
|
||||
@@ -3418,7 +3418,6 @@ struct ggml_tensor * ggml_cast(

result->op = GGML_OP_CPY;
result->src[0] = a;
result->src[1] = result;

return result;
}

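For context, `ggml_cast` is the public builder behind this hunk; a minimal usage sketch (context size and tensor shape are arbitrary, error handling omitted):

```
#include "ggml.h"

int main() {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    // lowers to GGML_OP_CPY with src[0] == a; after this change the result no longer
    // sets src[1] to itself
    struct ggml_tensor * a16 = ggml_cast(ctx, a, GGML_TYPE_F16);
    (void) a16;

    ggml_free(ctx);
    return 0;
}
```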
@@ -67,6 +67,30 @@ Parentheses `()` can be used to group sequences, which allows for embedding alte
- `{m,n}` repeats the precedent symbol or sequence at between `m` and `n` times (included)
- `{0,n}` repeats the precedent symbol or sequence at most `n` times (included)

## Tokens

Tokens allow grammars to match specific tokenizer tokens rather than character sequences. This is useful for constraining outputs based on special tokens (like `<think>` or `</think>`).

Tokens can be specified in two ways:

1. **Token ID**: Use angle brackets with the token ID in square brackets: `<[token-id]>`. For example, `<[1000]>` matches the token with ID 1000.

2. **Token string**: Use angle brackets with the token text directly: `<token>`. For example, `<think>` will match the token whose text is exactly `<think>`. This only works if the string tokenizes to exactly one token in the vocabulary, otherwise the grammar will fail to parse.

You can negate token matches using the `!` prefix: `!<[1000]>` or `!<think>` matches any token *except* the specified one.

```
# Match a thinking block: <think>...</think>
# Using token strings (requires these to be single tokens in the vocab)
root ::= <think> thinking </think> .*
thinking ::= !</think>*

# Equivalent grammar using explicit token IDs
# Assumes token 1000 = <think>, token 1001 = </think>
root ::= <[1000]> thinking <[1001]> .*
thinking ::= !<[1001]>*
```

## Comments and newlines

Comments can be specified with `#`:

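As a practical companion to the single-token requirement above, a caller can check a literal before relying on the `<token>` form. The sketch below is illustrative only; it assumes the public `llama_tokenize` API from `llama.h` and mirrors the `n_tokens != 1` check in the grammar parser shown later in this diff (the helper name is made up):

```
#include <string>

#include "llama.h"

// true if `text` maps to exactly one token in `vocab`, i.e. it is safe to write
// it as a <token> literal in a GBNF grammar
static bool is_single_token(const llama_vocab * vocab, const std::string & text) {
    llama_token toks[2];
    // parse_special = true so special tokens like <think> resolve to their ids;
    // requesting at most 2 tokens is enough to detect the multi-token case
    const int32_t n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                                     toks, 2, /*add_special=*/false, /*parse_special=*/true);
    return n == 1;
}
```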
@@ -14,7 +14,7 @@
|
||||
{%- endmacro %}
|
||||
|
||||
{%- set tool_response_queue = namespace(ids=[]) -%}
|
||||
{%- set tool_call_counter = namespace(value=1) -%}
|
||||
{%- set tool_call_counter = namespace(value=0) -%}
|
||||
|
||||
{%- if tools -%}
|
||||
<|im_system|>tool_declare<|im_middle|>{{ tools | tojson }}<|im_end|>
|
||||
@@ -36,12 +36,8 @@
|
||||
{%- if message['role'] == 'assistant' and message.get('tool_calls') -%}
|
||||
{{render_content(message)}}<|tool_calls_section_begin|>
|
||||
{%- for tool_call in message['tool_calls'] -%}
|
||||
{%- if tool_call['id'] is defined -%}
|
||||
{%- set formatted_id = tool_call['id'] -%}
|
||||
{%- else -%}
|
||||
{%- set formatted_id = 'functions.' + tool_call['function']['name'] + ':' + (tool_call_counter.value | string) -%}
|
||||
{%- set tool_call_counter.value = tool_call_counter.value + 1 -%}
|
||||
{%- endif -%}
|
||||
{%- set formatted_id = 'functions.' + tool_call['function']['name'] + ':' + (tool_call_counter.value | string) -%}
|
||||
{%- set tool_call_counter.value = tool_call_counter.value + 1 -%}
|
||||
{%- set _ = tool_response_queue.ids.append(formatted_id) -%}
|
||||
<|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
|
||||
{%- endfor -%}
|
||||
|
||||
@@ -25,17 +25,13 @@
|
||||
{%- endmacro -%}
|
||||
|
||||
{%- set tool_response_queue = namespace(ids=[]) -%}
|
||||
{%- set tool_call_counter = namespace(value=1) -%}
|
||||
{%- set tool_call_counter = namespace(value=0) -%}
|
||||
|
||||
{%- macro render_toolcalls(message) -%}
|
||||
<|tool_calls_section_begin|>
|
||||
{%- for tool_call in message['tool_calls'] -%}
|
||||
{%- if tool_call['id'] is defined -%}
|
||||
{%- set formatted_id = tool_call['id'] -%}
|
||||
{%- else -%}
|
||||
{%- set formatted_id = 'functions.' + tool_call['function']['name'] + ':' + (tool_call_counter.value | string) -%}
|
||||
{%- set tool_call_counter.value = tool_call_counter.value + 1 -%}
|
||||
{%- endif -%}
|
||||
{%- set formatted_id = 'functions.' + tool_call['function']['name'] + ':' + (tool_call_counter.value | string) -%}
|
||||
{%- set tool_call_counter.value = tool_call_counter.value + 1 -%}
|
||||
{%- set _ = tool_response_queue.ids.append(formatted_id) -%}
|
||||
<|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
|
||||
{%- endfor -%}
|
||||
|
||||
@@ -67,7 +67,7 @@ add_library(llama
models/gemma-embedding.cpp
models/gemma.cpp
models/gemma2-iswa.cpp
models/gemma3-iswa.cpp
models/gemma3.cpp
models/gemma3n-iswa.cpp
models/glm4-moe.cpp
models/glm4.cpp

@@ -248,7 +248,10 @@ llama_context::llama_context(
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
|
||||
|
||||
const size_t max_nodes = this->graph_max_nodes();
|
||||
const uint32_t n_seqs = cparams.n_seq_max;
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
const size_t max_nodes = this->graph_max_nodes(n_tokens);
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
|
||||
|
||||
@@ -300,9 +303,6 @@ llama_context::llama_context(
|
||||
|
||||
cross.v_embd.clear();
|
||||
|
||||
const uint32_t n_seqs = cparams.n_seq_max;
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
// avoid reserving graphs with zero outputs - assume one output per sequence
|
||||
n_outputs = n_seqs;
|
||||
|
||||
@@ -1386,9 +1386,9 @@ void llama_context::output_reorder() {
|
||||
// graph
|
||||
//
|
||||
|
||||
uint32_t llama_context::graph_max_nodes() const {
|
||||
uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
|
||||
if (model.arch == LLM_ARCH_QWEN3NEXT) {
|
||||
return std::max<uint32_t>(8192u, 32u*model.n_tensors());
|
||||
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
|
||||
}
|
||||
return std::max<uint32_t>(1024u, 8u*model.n_tensors());
|
||||
}
|
||||
|
||||
@@ -197,7 +197,7 @@ private:
|
||||
//
|
||||
|
||||
public:
|
||||
uint32_t graph_max_nodes() const;
|
||||
uint32_t graph_max_nodes(uint32_t n_tokens) const;
|
||||
|
||||
// can reuse the llm_graph_result instance of the context (for example to update a memory module)
|
||||
llm_graph_result * get_gf_res_reserve() const;
|
||||
|
||||
@@ -181,6 +181,52 @@ static std::pair<uint32_t, const char *> parse_char(const char * src) {
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
|
||||
static std::pair<uint32_t, const char *> parse_token(const llama_vocab * vocab, const char * src) {
|
||||
const char * pos = src;
|
||||
if (*pos != '<') {
|
||||
throw std::runtime_error(std::string("expecting '<' at ") + pos);
|
||||
}
|
||||
pos++;
|
||||
|
||||
// Parse <[id]>
|
||||
if (*pos == '[') {
|
||||
pos++;
|
||||
const char * int_end = parse_int(pos);
|
||||
uint32_t token_id = std::stoul(std::string(pos, int_end - pos));
|
||||
pos = int_end;
|
||||
if (*pos != ']') {
|
||||
throw std::runtime_error(std::string("expecting ']' at ") + pos);
|
||||
}
|
||||
pos++;
|
||||
if (*pos != '>') {
|
||||
throw std::runtime_error(std::string("expecting '>' at ") + pos);
|
||||
}
|
||||
pos++;
|
||||
return std::make_pair(token_id, pos);
|
||||
}
|
||||
|
||||
if (vocab == nullptr) {
|
||||
throw std::runtime_error(std::string("no vocab to parse token at ") + src);
|
||||
}
|
||||
|
||||
// Parse <token> and tokenize to obtain the token id
|
||||
while (*pos != 0 && *pos != '>') {
|
||||
pos++;
|
||||
}
|
||||
if (*pos != '>') {
|
||||
throw std::runtime_error(std::string("expecting '>' at ") + pos);
|
||||
}
|
||||
pos++;
|
||||
|
||||
llama_token tokens[2];
|
||||
int32_t n_tokens = vocab->tokenize(src, static_cast<int32_t>(pos - src), tokens, 2, false, true);
|
||||
if (n_tokens != 1) {
|
||||
// must tokenize to exactly 1 token
|
||||
throw std::runtime_error("invalid token '" + std::string(src, pos - src) + "'");
|
||||
}
|
||||
return std::make_pair(tokens[0], pos);
|
||||
}
|
||||
|
||||
static void print_grammar_char(FILE * file, uint32_t c) {
|
||||
if (0x20 <= c && c <= 0x7f) {
|
||||
fprintf(file, "%c", static_cast<char>(c));
|
||||
@@ -212,6 +258,8 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
|
||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
|
||||
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
|
||||
case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
|
||||
case LLAMA_GRETYPE_TOKEN: fprintf(file, "TOKEN"); break;
|
||||
case LLAMA_GRETYPE_TOKEN_NOT: fprintf(file, "TOKEN_NOT"); break;
|
||||
}
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_END:
|
||||
@@ -228,6 +276,17 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
|
||||
print_grammar_char(file, elem.value);
|
||||
fprintf(file, "\") ");
|
||||
break;
|
||||
case LLAMA_GRETYPE_TOKEN:
|
||||
fprintf(file, "<[");
|
||||
fprintf(file, "%u", elem.value);
|
||||
fprintf(file, "]> ");
|
||||
break;
|
||||
case LLAMA_GRETYPE_TOKEN_NOT:
|
||||
fprintf(file, "!");
|
||||
fprintf(file, "<[");
|
||||
fprintf(file, "%u", elem.value);
|
||||
fprintf(file, "]> ");
|
||||
break;
|
||||
}
|
||||
}
|
||||
fprintf(file, "\n");
|
||||
@@ -284,6 +343,17 @@ static void print_rule(
|
||||
case LLAMA_GRETYPE_CHAR_ANY:
|
||||
fprintf(file, ".");
|
||||
break;
|
||||
case LLAMA_GRETYPE_TOKEN:
|
||||
fprintf(file, "<[");
|
||||
fprintf(file, "%u", elem.value);
|
||||
fprintf(file, "]> ");
|
||||
break;
|
||||
case LLAMA_GRETYPE_TOKEN_NOT:
|
||||
fprintf(file, "!");
|
||||
fprintf(file, "<[");
|
||||
fprintf(file, "%u", elem.value);
|
||||
fprintf(file, "]> ");
|
||||
break;
|
||||
}
|
||||
if (is_char_element(elem)) {
|
||||
switch (rule[i + 1].type) {
|
||||
@@ -444,6 +514,17 @@ const char * llama_grammar_parser::parse_sequence(
|
||||
}
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '<' || *pos == '!') { // token
|
||||
auto type = LLAMA_GRETYPE_TOKEN;
|
||||
if (*pos == '!') { // token inverse
|
||||
type = LLAMA_GRETYPE_TOKEN_NOT;
|
||||
pos++;
|
||||
}
|
||||
auto token_pair = parse_token(vocab, pos);
|
||||
const char * token_end = token_pair.second;
|
||||
last_sym_start = rule.size();
|
||||
rule.push_back({type, token_pair.first});
|
||||
pos = parse_space(token_end, is_nested);
|
||||
} else if (is_word_char(*pos)) { // rule reference
|
||||
const char * name_end = parse_name(pos);
|
||||
uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
|
||||
@@ -691,6 +772,21 @@ static bool llama_grammar_match_partial_char(
|
||||
return !is_positive_char;
|
||||
}
|
||||
|
||||
// returns true iff token matches the rule at pos (regular or inverse)
|
||||
// asserts that pos is pointing to a token element
|
||||
static bool llama_grammar_match_token(
|
||||
const llama_grammar_element * pos,
|
||||
const llama_token token) {
|
||||
GGML_ASSERT(pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT);
|
||||
if (pos->type == LLAMA_GRETYPE_TOKEN) {
|
||||
return pos->value == static_cast<uint32_t>(token);
|
||||
}
|
||||
if (pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
|
||||
return pos->value != static_cast<uint32_t>(token);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// transforms a grammar pushdown stack into N possible stacks, all ending
|
||||
// at a character range (terminal element)
|
||||
static void llama_grammar_advance_stack(
|
||||
@@ -738,6 +834,8 @@ static void llama_grammar_advance_stack(
|
||||
case LLAMA_GRETYPE_CHAR:
|
||||
case LLAMA_GRETYPE_CHAR_NOT:
|
||||
case LLAMA_GRETYPE_CHAR_ANY:
|
||||
case LLAMA_GRETYPE_TOKEN:
|
||||
case LLAMA_GRETYPE_TOKEN_NOT:
|
||||
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
|
||||
// only add the stack if it's not a duplicate of one we already have
|
||||
new_stacks.emplace_back(stack);
|
||||
@@ -831,26 +929,38 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
|
||||
return grammar->stacks;
|
||||
}
|
||||
|
||||
static void llama_grammar_accept_chr(
|
||||
struct llama_grammar & grammar,
|
||||
const llama_grammar_stack & stack,
|
||||
uint32_t chr,
|
||||
llama_grammar_stacks & new_stacks) {
|
||||
if (stack.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const llama_grammar_element * pos = stack.back();
|
||||
|
||||
// ignore if this turns into a token
|
||||
if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto match = llama_grammar_match_char(pos, chr);
|
||||
if (match.first) {
|
||||
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
|
||||
if (!llama_grammar_is_end_of_sequence(match.second)) {
|
||||
new_stack.push_back(match.second);
|
||||
}
|
||||
llama_grammar_advance_stack(grammar.rules, new_stack, new_stacks);
|
||||
}
|
||||
}
|
||||
|
||||
void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
|
||||
llama_grammar_stacks stacks_new;
|
||||
stacks_new.reserve(grammar->stacks.size());
|
||||
|
||||
for (const auto & stack : grammar->stacks) {
|
||||
if (stack.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto match = llama_grammar_match_char(stack.back(), chr);
|
||||
if (match.first) {
|
||||
const llama_grammar_element * pos = match.second;
|
||||
|
||||
// update top of stack to next element, if any
|
||||
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
|
||||
if (!llama_grammar_is_end_of_sequence(pos)) {
|
||||
new_stack.push_back(pos);
|
||||
}
|
||||
llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
|
||||
}
|
||||
llama_grammar_accept_chr(*grammar, stack, chr, stacks_new);
|
||||
}
|
||||
|
||||
grammar->stacks = std::move(stacks_new);
|
||||
@@ -875,6 +985,22 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
||||
|
||||
const llama_grammar_element * stack_pos = stack.back();
|
||||
|
||||
// if the top of the stack is a token rule, then we only need to check the token id
|
||||
if (stack_pos->type == LLAMA_GRETYPE_TOKEN || stack_pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
|
||||
for (const auto & tok : candidates) {
|
||||
if (*tok.code_points == 0) {
|
||||
// reached the end of a token consumed by char rules, reject iff it ended
|
||||
// in a partial response
|
||||
if (tok.partial_utf8.n_remain != 0) {
|
||||
rejects.push_back(tok);
|
||||
}
|
||||
} else if (!llama_grammar_match_token(stack_pos, tok.id)) {
|
||||
rejects.push_back(tok);
|
||||
}
|
||||
}
|
||||
return rejects;
|
||||
}
|
||||
|
||||
llama_grammar_candidates next_candidates;
|
||||
next_candidates.reserve(candidates.size());
|
||||
|
||||
@@ -887,7 +1013,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
||||
rejects.push_back(tok);
|
||||
}
|
||||
} else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
|
||||
next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
|
||||
next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8, tok.id });
|
||||
} else {
|
||||
rejects.push_back(tok);
|
||||
}
|
||||
@@ -905,7 +1031,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
||||
|
||||
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
|
||||
for (const auto & tok : next_rejects) {
|
||||
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
|
||||
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8, tok.id });
|
||||
}
|
||||
|
||||
return rejects;
|
||||
@@ -972,12 +1098,13 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||
vocab,
|
||||
std::move(vec_rules),
|
||||
std::move(stacks),
|
||||
/* .partial_utf8 = */ {},
|
||||
/* .lazy =*/ false,
|
||||
/* .awaiting_trigger = */ false,
|
||||
/* .trigger_buffer = */ "",
|
||||
/* .trigger_tokens = */ {},
|
||||
/* .trigger_patterns = */ {},
|
||||
/* .partial_utf8 = */ {},
|
||||
/* .lazy = */ false,
|
||||
/* .awaiting_trigger = */ false,
|
||||
/* .trigger_buffer = */ "",
|
||||
/* .trigger_buffer_positions = */ {},
|
||||
/* .trigger_tokens = */ {},
|
||||
/* .trigger_patterns = */ {},
|
||||
};
|
||||
}
|
||||
|
||||
@@ -990,7 +1117,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||
size_t num_trigger_patterns,
|
||||
const llama_token * trigger_tokens,
|
||||
size_t num_trigger_tokens) {
|
||||
llama_grammar_parser parser;
|
||||
llama_grammar_parser parser(vocab);
|
||||
|
||||
// if there is a grammar, parse it
|
||||
// rules will be empty (default) if there are parse errors
|
||||
@@ -1077,10 +1204,11 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||
vocab,
|
||||
std::move(vec_rules),
|
||||
std::move(stacks),
|
||||
/* .partial_utf8 = */ {},
|
||||
/* .lazy = */ lazy,
|
||||
/* .awaiting_trigger = */ lazy,
|
||||
/* .trigger_buffer = */ "",
|
||||
/* .partial_utf8 = */ {},
|
||||
/* .lazy = */ lazy,
|
||||
/* .awaiting_trigger = */ lazy,
|
||||
/* .trigger_buffer = */ "",
|
||||
/* .trigger_buffer_positions = */ {},
|
||||
std::move(vec_trigger_tokens),
|
||||
std::move(vec_trigger_patterns),
|
||||
};
|
||||
@@ -1103,6 +1231,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
|
||||
grammar.lazy,
|
||||
grammar.awaiting_trigger,
|
||||
grammar.trigger_buffer,
|
||||
grammar.trigger_buffer_positions,
|
||||
grammar.trigger_tokens,
|
||||
grammar.trigger_patterns,
|
||||
};
|
||||
@@ -1156,7 +1285,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
} else {
|
||||
candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
|
||||
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
|
||||
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second, id });
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1175,10 +1304,12 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
||||
if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
|
||||
grammar.awaiting_trigger = false;
|
||||
grammar.trigger_buffer.clear();
|
||||
llama_grammar_accept_str(grammar, piece);
|
||||
llama_grammar_accept_token(grammar, token, piece);
|
||||
LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
|
||||
return;
|
||||
} else {
|
||||
auto position = std::make_pair(grammar.trigger_buffer.size(), grammar.trigger_buffer.size() + piece.size());
|
||||
grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
|
||||
grammar.trigger_buffer += piece;
|
||||
|
||||
std::smatch match;
|
||||
@@ -1196,10 +1327,23 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
||||
if (start == std::string::npos) {
|
||||
start = match.position(0);
|
||||
}
|
||||
|
||||
// replay tokens that overlap with [start, end)
|
||||
for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
|
||||
auto [tok_start, tok_end] = tok_pos;
|
||||
if (tok_end <= start) {
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t piece_start = (tok_start < start) ? start : tok_start; // allow for partial token pieces
|
||||
size_t piece_len = tok_end - piece_start;
|
||||
auto tok_piece = grammar.trigger_buffer.substr(piece_start, piece_len);
|
||||
llama_grammar_accept_token(grammar, tok, tok_piece);
|
||||
}
|
||||
|
||||
auto constrained_str = grammar.trigger_buffer.substr(start);
|
||||
// std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
|
||||
grammar.trigger_buffer.clear();
|
||||
llama_grammar_accept_str(grammar, constrained_str);
|
||||
grammar.trigger_buffer_positions.clear();
|
||||
LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str());
|
||||
return;
|
||||
}
|
||||
@@ -1218,7 +1362,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
llama_grammar_accept_str(grammar, piece);
|
||||
llama_grammar_accept_token(grammar, token, piece);
|
||||
}
|
||||
|
||||
void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
|
||||
@@ -1235,3 +1379,59 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
|
||||
throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
|
||||
}
|
||||
}
|
||||
|
||||
void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token token, const std::string & piece) {
|
||||
// Note terminating 0 in decoded string
|
||||
const auto decoded = decode_utf8(piece, grammar.partial_utf8);
|
||||
const auto & code_points = decoded.first;
|
||||
|
||||
llama_grammar_stacks stacks_new;
|
||||
stacks_new.reserve(grammar.stacks.size());
|
||||
|
||||
for (const auto & stack : grammar.stacks) {
|
||||
if (stack.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const llama_grammar_element * pos = stack.back();
|
||||
|
||||
if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
|
||||
if (llama_grammar_match_token(pos, token)) {
|
||||
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
|
||||
if (!llama_grammar_is_end_of_sequence(pos + 1)) {
|
||||
new_stack.push_back(pos + 1);
|
||||
}
|
||||
llama_grammar_advance_stack(grammar.rules, new_stack, stacks_new);
|
||||
}
|
||||
} else {
|
||||
llama_grammar_stacks current_stacks = {stack};
|
||||
|
||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||
llama_grammar_stacks next_stacks;
|
||||
|
||||
for (const auto & cur_stack : current_stacks) {
|
||||
llama_grammar_accept_chr(grammar, cur_stack, *it, next_stacks);
|
||||
}
|
||||
|
||||
current_stacks = std::move(next_stacks);
|
||||
if (current_stacks.empty()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (auto & surviving_stack : current_stacks) {
|
||||
if (std::find(stacks_new.begin(), stacks_new.end(), surviving_stack) == stacks_new.end()) {
|
||||
stacks_new.emplace_back(surviving_stack);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
grammar.stacks = std::move(stacks_new);
|
||||
grammar.partial_utf8 = decoded.second;
|
||||
|
||||
if (grammar.stacks.empty()) {
|
||||
throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece + " (" + std::to_string(token) + ")");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -36,11 +36,17 @@ enum llama_gretype {
|
||||
|
||||
// any character (.)
|
||||
LLAMA_GRETYPE_CHAR_ANY = 7,
|
||||
|
||||
// terminal element: token (<[token-id]>)
|
||||
LLAMA_GRETYPE_TOKEN = 8,
|
||||
|
||||
// inverse token (!<[token-id]>)
|
||||
LLAMA_GRETYPE_TOKEN_NOT = 9,
|
||||
};
|
||||
|
||||
typedef struct llama_grammar_element {
|
||||
enum llama_gretype type;
|
||||
uint32_t value; // Unicode code point or rule ID
|
||||
uint32_t value; // Unicode code point, rule ID, or token ID
|
||||
} llama_grammar_element;
|
||||
|
||||
struct llama_partial_utf8 {
|
||||
@@ -52,6 +58,7 @@ struct llama_grammar_candidate {
|
||||
size_t index;
|
||||
const uint32_t * code_points;
|
||||
llama_partial_utf8 partial_utf8;
|
||||
llama_token id;
|
||||
};
|
||||
|
||||
using llama_grammar_rule = std::vector< llama_grammar_element>;
|
||||
@@ -77,10 +84,13 @@ std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
|
||||
const llama_grammar_candidates & candidates);
|
||||
|
||||
struct llama_grammar_parser {
|
||||
const llama_vocab * vocab;
|
||||
std::map<std::string, uint32_t> symbol_ids;
|
||||
|
||||
llama_grammar_rules rules;
|
||||
|
||||
llama_grammar_parser(const struct llama_vocab * vocab = nullptr) : vocab(vocab) {}
|
||||
|
||||
llama_grammar_stack c_rules() const;
|
||||
|
||||
uint32_t get_symbol_id(const char * src, size_t len);
|
||||
@@ -112,6 +122,9 @@ struct llama_grammar_trigger_pattern {
|
||||
};
|
||||
|
||||
struct llama_grammar {
|
||||
// maintain a list of llama_tokens and their positions in the trigger_buffer
|
||||
using token_pos = std::pair<llama_token, std::pair<size_t, size_t>>;
|
||||
|
||||
// note: allow null vocab for testing (not great)
|
||||
const llama_vocab * vocab;
|
||||
|
||||
@@ -127,6 +140,7 @@ struct llama_grammar {
|
||||
bool lazy = false;
|
||||
bool awaiting_trigger = false; // Initialized to true for lazy grammars only
|
||||
std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found.
|
||||
std::vector<token_pos> trigger_buffer_positions; // Tokens buffered by lazy grammar. Used to replay when a trigger is found.
|
||||
std::vector<llama_token> trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
|
||||
std::vector<llama_grammar_trigger_pattern>
|
||||
trigger_patterns; // Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated
|
||||
@@ -171,3 +185,8 @@ void llama_grammar_accept_impl(
|
||||
void llama_grammar_accept_str(
|
||||
struct llama_grammar & grammar,
|
||||
const std::string & piece);
|
||||
|
||||
void llama_grammar_accept_token(
|
||||
struct llama_grammar & grammar,
|
||||
llama_token token,
|
||||
const std::string & piece);
|
||||
|
||||
@@ -973,7 +973,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
|
||||
// mask out the other groups
|
||||
selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
|
||||
selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
|
||||
selection_probs = ggml_set_rows(ctx0, ggml_fill(ctx0, selection_groups, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
|
||||
selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
|
||||
cb(selection_probs, "ffn_moe_probs_masked", il);
|
||||
}
|
||||
|
||||
@@ -1264,18 +1264,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA3:
|
||||
{
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.set_swa_pattern(6);
|
||||
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
if (found_swa && hparams.n_swa > 0) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.set_swa_pattern(6);
|
||||
|
||||
hparams.rope_freq_base_train_swa = 10000.0f;
|
||||
hparams.rope_freq_scale_train_swa = 1.0f;
|
||||
hparams.rope_freq_base_train_swa = 10000.0f;
|
||||
hparams.rope_freq_scale_train_swa = 1.0f;
|
||||
} else {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
}
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
hparams.f_final_logit_softcapping = 0.0f;
|
||||
ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 18: type = LLM_TYPE_270M; break;
|
||||
case 26: type = LLM_TYPE_1B; break;
|
||||
case 32: type = LLM_TYPE_8B; break; // Rnj-1
|
||||
case 34: type = LLM_TYPE_4B; break;
|
||||
case 48: type = LLM_TYPE_12B; break;
|
||||
case 62: type = LLM_TYPE_27B; break;
|
||||
@@ -1628,6 +1635,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
}
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
|
||||
|
||||
// (optional) temperature tuning - used by mistral-large
|
||||
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 27: type = LLM_TYPE_16B; break;
|
||||
case 60: type = LLM_TYPE_236B; break;
|
||||
@@ -7300,7 +7311,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA3:
|
||||
{
|
||||
llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
|
||||
if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
|
||||
llm = std::make_unique<llm_build_gemma3<true>>(*this, params);
|
||||
} else {
|
||||
llm = std::make_unique<llm_build_gemma3<false>>(*this, params);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA3N:
|
||||
{
|
||||
|
||||
@@ -30,6 +30,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
||||
// {n_embd, n_tokens}
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
// (optional) temperature tuning - used by mistral-large
|
||||
ggml_tensor * inp_attn_scale = nullptr;
|
||||
if (hparams.f_attn_temp_scale != 0.0f) {
|
||||
inp_attn_scale = build_inp_attn_scale();
|
||||
}
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
@@ -128,6 +134,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
||||
ggml_tensor * Vcur = kv_cmpr;
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
if (inp_attn_scale) {
|
||||
// apply llama 4 temperature scaling
|
||||
Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
|
||||
cb(Qcur, "Qcur_attn_temp_scaled", il);
|
||||
}
|
||||
|
||||
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, NULL,
|
||||
@@ -160,6 +172,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
||||
ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
if (inp_attn_scale) {
|
||||
// apply llama 4 temperature scaling
|
||||
Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
|
||||
cb(Qcur, "Qcur_attn_temp_scaled", il);
|
||||
}
|
||||
|
||||
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, NULL,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "models.h"
|
||||
|
||||
llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
template <bool iswa>
|
||||
llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k;
|
||||
|
||||
ggml_tensor * cur;
|
||||
@@ -17,13 +18,28 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// TODO: is causal == true correct? might need some changes
|
||||
auto * inp_attn = build_attn_inp_kv_iswa();
|
||||
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
|
||||
inp_attn_type * inp_attn = nullptr;
|
||||
|
||||
if constexpr (iswa) {
|
||||
inp_attn = build_attn_inp_kv_iswa();
|
||||
} else {
|
||||
inp_attn = build_attn_inp_kv();
|
||||
}
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
const float freq_base_l = model.get_rope_freq_base (cparams, il);
|
||||
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
||||
float freq_base_l = 0.0f;
|
||||
float freq_scale_l = 0.0f;
|
||||
|
||||
if constexpr (iswa) {
|
||||
freq_base_l = model.get_rope_freq_base (cparams, il);
|
||||
freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
||||
} else {
|
||||
freq_base_l = freq_base;
|
||||
freq_scale_l = freq_scale;
|
||||
}
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
||||
@@ -102,7 +118,7 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
|
||||
cur = build_norm(cur,
|
||||
model.layers[il].ffn_post_norm, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
cb(cur, "ffn_post_norm", -1);
|
||||
cb(cur, "ffn_post_norm", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, sa_out);
|
||||
|
||||
@@ -124,8 +140,17 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
|
||||
if (hparams.f_final_logit_softcapping) {
|
||||
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
|
||||
cur = ggml_tanh(ctx0, cur);
|
||||
cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
|
||||
}
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
template struct llm_build_gemma3<false>;
|
||||
template struct llm_build_gemma3<true>;
|
||||
@@ -179,8 +179,9 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
|
||||
llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
struct llm_build_gemma3_iswa : public llm_graph_context {
|
||||
llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params);
|
||||
template <bool iswa>
|
||||
struct llm_build_gemma3 : public llm_graph_context {
|
||||
llm_build_gemma3(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
struct llm_build_gemma3n_iswa : public llm_graph_context {
|
||||
|
||||
@@ -428,10 +428,38 @@ static void test_templates(const struct common_chat_templates * tmpls, const std
|
||||
*/
|
||||
template <typename T>
|
||||
static void test_parser_with_streaming(const common_chat_msg & expected, const std::string & raw_message, T parse_msg) {
|
||||
constexpr auto utf8_truncate_safe_len = [](const std::string_view s) -> size_t {
|
||||
auto len = s.size();
|
||||
if (len == 0) return 0;
|
||||
auto i = len;
|
||||
for (size_t back = 0; back < 4 && i > 0; ++back) {
|
||||
--i;
|
||||
unsigned char c = s[i];
|
||||
if ((c & 0x80) == 0) {
|
||||
return len;
|
||||
} else if ((c & 0xC0) == 0xC0) {
|
||||
size_t expected_len = 0;
|
||||
if ((c & 0xE0) == 0xC0) expected_len = 2;
|
||||
else if ((c & 0xF0) == 0xE0) expected_len = 3;
|
||||
else if ((c & 0xF8) == 0xF0) expected_len = 4;
|
||||
else return i;
|
||||
if (len - i >= expected_len) {
|
||||
return len;
|
||||
} else {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
}
|
||||
return len - std::min(len, size_t(3));
|
||||
};
|
||||
constexpr auto utf8_truncate_safe_view = [utf8_truncate_safe_len](const std::string_view s) {
|
||||
return s.substr(0, utf8_truncate_safe_len(s));
|
||||
};
|
||||
|
||||
auto merged = simple_assist_msg("");
|
||||
auto last_msg = parse_msg("");
|
||||
for (size_t i = 1; i <= raw_message.size(); ++i) {
|
||||
auto curr_msg = parse_msg(raw_message.substr(0, i));
|
||||
auto curr_msg = parse_msg(std::string(utf8_truncate_safe_view(std::string_view(raw_message).substr(0, i))));
|
||||
if (curr_msg == simple_assist_msg("")) continue;
|
||||
LOG_INF("Streaming msg: %s\n", common_chat_msgs_to_json_oaicompat<json>({curr_msg}).dump().c_str());
|
||||
for (auto diff: common_chat_msg_diff::compute_diffs(last_msg, curr_msg)) {
|
||||
@@ -2659,14 +2687,14 @@ Hey there!<|im_end|>
|
||||
// Test parsing tool calls
|
||||
assert_msg_equals(message_assist_call,
|
||||
common_chat_parse(
|
||||
"<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
"<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
/* is_partial= */ false,
|
||||
{COMMON_CHAT_FORMAT_KIMI_K2}));
|
||||
|
||||
// Test parsing tool calls with thinking
|
||||
assert_msg_equals(message_assist_call_thoughts,
|
||||
common_chat_parse(
|
||||
"<think>I'm\nthinking</think><|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
"<think>I'm\nthinking</think><|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
/* is_partial= */ false,
|
||||
{
|
||||
/* .format = */ COMMON_CHAT_FORMAT_KIMI_K2,
|
||||
@@ -2676,7 +2704,7 @@ Hey there!<|im_end|>
|
||||
// Test tool calls with extra content
|
||||
assert_msg_equals(message_assist_call_content,
|
||||
common_chat_parse(
|
||||
"<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>Hello, world!\nWhat's up?",
|
||||
"<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>Hello, world!\nWhat's up?",
|
||||
/* is_partial= */ false,
|
||||
{COMMON_CHAT_FORMAT_KIMI_K2}
|
||||
));
|
||||
@@ -2684,7 +2712,7 @@ Hey there!<|im_end|>
|
||||
// Test tool calls with extra content AND thinking
|
||||
assert_msg_equals(message_assist_call_thoughts_content,
|
||||
common_chat_parse(
|
||||
"<think>I'm\nthinking</think><|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>Hello, world!\nWhat's up?",
|
||||
"<think>I'm\nthinking</think><|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>Hello, world!\nWhat's up?",
|
||||
/* is_partial= */ false,
|
||||
{
|
||||
/* .format = */ COMMON_CHAT_FORMAT_KIMI_K2,
|
||||
@@ -2693,47 +2721,152 @@ Hey there!<|im_end|>
|
||||
|
||||
// Test streaming
|
||||
test_parser_with_streaming(message_assist_call_thoughts_content,
|
||||
"<think>I'm\nthinking\n</think>Hello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
"<think>I'm\nthinking\n</think>Hello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
[&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_KIMI_K2,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK
|
||||
}); });
|
||||
test_parser_with_streaming(message_assist_call_thoughts_unparsed,
|
||||
"<think>I'm\nthinking</think>\n\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
"<think>I'm\nthinking</think>\n\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
[&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_KIMI_K2,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE
|
||||
}); });
|
||||
test_parser_with_streaming(message_assist_call_thoughts_content,
|
||||
"<think>I'm\nthinking\n</think>\n\nHello, world!\nWhat's up?\n\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>\n",
|
||||
"<think>I'm\nthinking\n</think>\n\nHello, world!\nWhat's up?\n\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>\n",
|
||||
[&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_KIMI_K2,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK
|
||||
}); });
|
||||
test_parser_with_streaming(message_assist_call_withopt,
|
||||
"<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function_with_opt:1<|tool_call_argument_begin|>{\"arg1\": 1, \"arg2\": 2}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
"<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function_with_opt:0<|tool_call_argument_begin|>{\"arg1\": 1, \"arg2\": 2}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
[&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_KIMI_K2,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE
|
||||
}); });
|
||||
test_parser_with_streaming(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": \"123456\"}"),
|
||||
"<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": \"123456\"}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
"<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": \"123456\"}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
[&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_KIMI_K2,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK
|
||||
}); });
|
||||
test_parser_with_streaming(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": [1, 2, \"345\", 6]}"),
|
||||
"<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": [1, 2, \"345\", 6]}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
"<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": [1, 2, \"345\", 6]}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
[&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_KIMI_K2,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK
|
||||
}); });
|
||||
test_parser_with_streaming(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": {\"12\": 34, \"5\": [67, 8], \"9\": \"10\"}}"),
|
||||
"<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": {\"12\": 34, \"5\": [67, 8], \"9\": \"10\"}}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
"<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": {\"12\": 34, \"5\": [67, 8], \"9\": \"10\"}}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
[&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
|
||||
/* .format = */ COMMON_CHAT_FORMAT_KIMI_K2,
|
||||
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK
|
||||
}); });
|
||||
test_parser_with_streaming(
|
||||
simple_assist_msg("", "", "complex_function", "{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}"),
|
||||
"<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function:0<|tool_call_argument_begin|>"
|
||||
"{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}"
|
||||
"<|tool_call_end|><|tool_calls_section_end|>",
|
||||
[&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); });
|
||||
test_parser_with_streaming(
|
||||
simple_assist_msg("", "", "web_search", "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}"),
|
||||
"<|tool_calls_section_begin|><|tool_call_begin|>functions.web_search:0<|tool_call_argument_begin|>"
|
||||
"{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}"
|
||||
"<|tool_call_end|><|tool_calls_section_end|>",
|
||||
[&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); });
|
||||
test_parser_with_streaming(
|
||||
simple_assist_msg("", "", "read_file", "{\"args\": [{\"path\": \"src/providers/ThemeProvider.tsx\"}, {\"path\": \"src/components/Header.tsx\"}, {\"path\": \"src/components/ThemeToggle.tsx\"}, {\"path\": \"src/app/globals.css\"}, {\"path\": \"src/app/layout.tsx\"}]}"),
|
||||
"<|tool_calls_section_begin|><|tool_call_begin|>functions.read_file:0<|tool_call_argument_begin|>"
|
||||
"{\"args\": [{\"path\": \"src/providers/ThemeProvider.tsx\"}, {\"path\": \"src/components/Header.tsx\"}, {\"path\": \"src/components/ThemeToggle.tsx\"}, {\"path\": \"src/app/globals.css\"}, {\"path\": \"src/app/layout.tsx\"}]}"
|
||||
"<|tool_call_end|><|tool_calls_section_end|>",
|
||||
[&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); });
|
||||
test_parser_with_streaming(
|
||||
simple_assist_msg(
|
||||
"Let me start by examining the relevant files to understand the current implementation.", "",
|
||||
"read_file",
|
||||
"{\"files\": [{\"path\": \"src/app/Partners.tsx\", \"line_ranges\": [\"1-100\"]}]}"),
|
||||
"Let me start by examining the relevant files to understand the current implementation."
|
||||
"<|tool_calls_section_begin|><|tool_call_begin|>functions.read_file:0<|tool_call_argument_begin|>"
|
||||
"{\"files\":[{\"path\":\"src/app/Partners.tsx\",\"line_ranges\":[\"1-100\"]}]}"
|
||||
"<|tool_call_end|><|tool_calls_section_end|>",
|
||||
[&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); });
|
||||
auto multi_tool_msg = simple_assist_msg("Let me call multiple tools.", "I'm thinking.");
|
||||
multi_tool_msg.tool_calls.push_back({ "read_file", "{\"files\": [{\"path\": \"src/app/Partners.tsx\", \"line_ranges\": [\"1-100\"]}]}", "" });
|
||||
multi_tool_msg.tool_calls.push_back({ "web_search", "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}", "" });
|
||||
multi_tool_msg.tool_calls.push_back({ "complex_function", "{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}", "" });
|
||||
multi_tool_msg.tool_calls.push_back({ "emoji_function", "{\"message\":\"Hello! 👋 🌟 🚀 Testing emojis: 😀😃😄😁 and symbols: ∑∏∆∇\"}", "" });
|
||||
test_parser_with_streaming(multi_tool_msg,
|
||||
"<think>I'm thinking.</think>Let me call multiple tools."
|
||||
"<|tool_calls_section_begin|>"
|
||||
"<|tool_call_begin|>functions.read_file:0<|tool_call_argument_begin|>"
|
||||
"{\"files\":[{\"path\":\"src/app/Partners.tsx\",\"line_ranges\":[\"1-100\"]}]}"
|
||||
"<|tool_call_end|>"
|
||||
"<|tool_call_begin|>functions.web_search:1<|tool_call_argument_begin|>"
|
||||
"{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}"
|
||||
"<|tool_call_end|>"
|
||||
"<|tool_call_begin|>functions.complex_function:2<|tool_call_argument_begin|>"
|
||||
"{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}"
|
||||
"<|tool_call_end|>"
|
||||
"<|tool_call_begin|>functions.emoji_function:3<|tool_call_argument_begin|>"
|
||||
"{\"message\":\"Hello! 👋 🌟 🚀 Testing emojis: 😀😃😄😁 and symbols: ∑∏∆∇\"}"
|
||||
"<|tool_call_end|>"
|
||||
"<|tool_calls_section_end|>",
|
||||
[&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
|
||||
COMMON_CHAT_FORMAT_KIMI_K2,
|
||||
COMMON_REASONING_FORMAT_DEEPSEEK
|
||||
}); });
|
||||
test_parser_with_streaming(
|
||||
simple_assist_msg("", "I'm thinking", "complex_function_in_think", "{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}"),
|
||||
"<think>I'm thinking<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function_in_think:0<|tool_call_argument_begin|>"
|
||||
"{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}"
|
||||
"<|tool_call_end|><|tool_calls_section_end|>",
|
||||
[&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
|
||||
COMMON_CHAT_FORMAT_KIMI_K2,
|
||||
COMMON_REASONING_FORMAT_DEEPSEEK
|
||||
}); });
|
||||
test_parser_with_streaming(
|
||||
simple_assist_msg("Hello", "I'm thinkingI'm still thinking", "complex_function_in_think", "{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}"),
|
||||
"<think>I'm thinking<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function_in_think:0<|tool_call_argument_begin|>"
|
||||
"{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}"
|
||||
"<|tool_call_end|><|tool_calls_section_end|>I'm still thinking</think>Hello",
|
||||
[&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
|
||||
COMMON_CHAT_FORMAT_KIMI_K2,
|
||||
COMMON_REASONING_FORMAT_DEEPSEEK
|
||||
}); });
|
||||
|
||||
// Test template rendering
|
||||
common_chat_templates_inputs conversation_with_tools = inputs_tools;
|
||||
conversation_with_tools.messages.push_back(simple_assist_msg("Let's do it", "Think first", "complex_function", "{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}"));
|
||||
conversation_with_tools.messages.push_back({
|
||||
"tool",
|
||||
"Tool response 1",
|
||||
/* .content_parts = */ {},
|
||||
/* .tool_calls = */ {},
|
||||
/* .reasoning_content = */ "",
|
||||
/* .tool_name = */ "complex_function",
|
||||
/* .tool_call_id = */ "",
|
||||
});
|
||||
conversation_with_tools.messages.push_back(simple_assist_msg("Continue", "Think next", "web_search", "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}"));
|
||||
conversation_with_tools.messages.push_back({
|
||||
"tool",
|
||||
"Tool response 2",
|
||||
/* .content_parts = */ {},
|
||||
/* .tool_calls = */ {},
|
||||
/* .reasoning_content = */ "",
|
||||
/* .tool_name = */ "web_search",
|
||||
/* .tool_call_id = */ "",
|
||||
});
|
||||
conversation_with_tools.messages.push_back(simple_assist_msg("CC", "Think last", "read_file", "{\"args\": [{\"path\": \"src/providers/ThemeProvider.tsx\"}, {\"path\": \"src/components/Header.tsx\"}, {\"path\": \"src/components/ThemeToggle.tsx\"}, {\"path\": \"src/app/globals.css\"}, {\"path\": \"src/app/layout.tsx\"}]}"));
|
||||
conversation_with_tools.messages.push_back({
|
||||
"tool",
|
||||
"Tool response 3",
|
||||
/* .content_parts = */ {},
|
||||
/* .tool_calls = */ {},
|
||||
/* .reasoning_content = */ "",
|
||||
/* .tool_name = */ "read_file",
|
||||
/* .tool_call_id = */ "",
|
||||
});
|
||||
assert_equals(common_chat_templates_apply(tmpls.get(), conversation_with_tools).prompt, std::string("<|im_system|>tool_declare<|im_middle|>[{\"type\": \"function\", \"function\": {\"name\": \"special_function\", \"description\": \"I'm special\", \"parameters\": {\"type\": \"object\", \"properties\": {\"arg1\": {\"type\": \"integer\", \"description\": \"The arg.\"}}, \"required\": [\"arg1\"]}}}]<|im_end|><|im_system|>system<|im_middle|>You are Kimi, an AI assistant created by Moonshot AI.<|im_end|><|im_user|>user<|im_middle|>Hey there!<|im_end|><|im_assistant|>assistant<|im_middle|><think>Think first</think>Let's do it<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function:0<|tool_call_argument_begin|>{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}<|tool_call_end|><|tool_calls_section_end|><|im_end|><|im_system|>complex_function<|im_middle|>## Return of functions.complex_function:0\nTool response 1<|im_end|><|im_assistant|>assistant<|im_middle|><think>Think next</think>Continue<|tool_calls_section_begin|><|tool_call_begin|>functions.web_search:1<|tool_call_argument_begin|>{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}<|tool_call_end|><|tool_calls_section_end|><|im_end|><|im_system|>web_search<|im_middle|>## Return of functions.web_search:1\nTool response 2<|im_end|><|im_assistant|>assistant<|im_middle|><think>Think last</think>CC<|tool_calls_section_begin|><|tool_call_begin|>functions.read_file:2<|tool_call_argument_begin|>{\"args\": [{\"path\": \"src/providers/ThemeProvider.tsx\"}, {\"path\": \"src/components/Header.tsx\"}, {\"path\": \"src/components/ThemeToggle.tsx\"}, {\"path\": \"src/app/globals.css\"}, {\"path\": \"src/app/layout.tsx\"}]}<|tool_call_end|><|tool_calls_section_end|><|im_end|><|im_system|>read_file<|im_middle|>## Return of functions.read_file:2\nTool response 3<|im_end|><|im_assistant|>assistant<|im_middle|>"));
|
||||
|
||||
// Test template generation for regular content
|
||||
test_templates(tmpls.get(), end_tokens, message_assist, tools,
|
||||
@@ -2742,7 +2875,7 @@ Hey there!<|im_end|>
|
||||
|
||||
// Test template generation for tool calls
|
||||
test_templates(tmpls.get(), end_tokens, message_assist_call, tools,
|
||||
"<think></think><|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
"<think></think><|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
/* expect_grammar_triggered= */ true,
|
||||
/* test_grammar_if_triggered= */ true,
|
||||
/* common_reasoning_format= */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
@@ -2751,14 +2884,14 @@ Hey there!<|im_end|>
|
||||
|
||||
// Test template generation for tools with optional parameters
|
||||
test_templates(tmpls.get(), end_tokens, message_assist_call_noopt, tools,
|
||||
"<think></think><|tool_calls_section_begin|><|tool_call_begin|>functions.special_function_with_opt:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
"<think></think><|tool_calls_section_begin|><|tool_call_begin|>functions.special_function_with_opt:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
/* expect_grammar_triggered= */ true,
|
||||
/* test_grammar_if_triggered= */ true,
|
||||
/* common_reasoning_format= */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
/* ignore_whitespace_differences= */ true
|
||||
);
|
||||
test_templates(tmpls.get(), end_tokens, message_assist_call_withopt, tools,
|
||||
"<think></think><|tool_calls_section_begin|><|tool_call_begin|>functions.special_function_with_opt:1<|tool_call_argument_begin|>{\"arg1\": 1, \"arg2\": 2}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
"<think></think><|tool_calls_section_begin|><|tool_call_begin|>functions.special_function_with_opt:0<|tool_call_argument_begin|>{\"arg1\": 1, \"arg2\": 2}<|tool_call_end|><|tool_calls_section_end|>",
|
||||
/* expect_grammar_triggered= */ true,
|
||||
/* test_grammar_if_triggered= */ true,
|
||||
/* common_reasoning_format= */ COMMON_REASONING_FORMAT_DEEPSEEK,
|
||||
|
||||
@@ -32,13 +32,66 @@ static bool test_build_grammar_fails(const std::string & grammar_str) {
|
||||
return grammar_fails;
|
||||
}
|
||||
|
||||
struct token_and_piece {
|
||||
llama_token token;
|
||||
std::string piece;
|
||||
};
|
||||
|
||||
// token() encodes a 32-bit ID as 5 bytes: a 0xff marker followed by the ID in big-endian order.
|
||||
static std::string token(llama_token id) {
|
||||
return std::string{
|
||||
static_cast<char>(0xff),
|
||||
static_cast<char>((id >> 24) & 0xff),
|
||||
static_cast<char>((id >> 16) & 0xff),
|
||||
static_cast<char>((id >> 8) & 0xff),
|
||||
static_cast<char>(id & 0xff)
|
||||
};
|
||||
}
|
||||
|
||||
// parse_tokens() parses the token encoding described above, as well as plain UTF-8 text.
|
||||
static std::vector<token_and_piece> parse_tokens(const std::string & input) {
|
||||
std::vector<token_and_piece> result;
|
||||
result.reserve(input.size());
|
||||
size_t offset = 0;
|
||||
while (offset < input.size()) {
|
||||
try {
|
||||
if (static_cast<unsigned char>(input[offset]) == 0xff) {
|
||||
if (offset + 5 > input.size()) {
|
||||
throw std::runtime_error("not enough bytes for token id");
|
||||
}
|
||||
uint32_t val =
|
||||
(static_cast<unsigned char>(input[offset + 1]) << 24) |
|
||||
(static_cast<unsigned char>(input[offset + 2]) << 16) |
|
||||
(static_cast<unsigned char>(input[offset + 3]) << 8) |
|
||||
(static_cast<unsigned char>(input[offset + 4]));
|
||||
auto piece = "<[" + std::to_string(val) + "]>";
|
||||
result.push_back({static_cast<llama_token>(val), piece});
|
||||
offset += 5;
|
||||
} else {
|
||||
uint32_t cpt = unicode_cpt_from_utf8(input, offset);
|
||||
result.push_back({0, unicode_cpt_to_utf8(cpt)});
|
||||
}
|
||||
} catch (const std::invalid_argument & /*ex*/) {
|
||||
// Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
|
||||
++offset;
|
||||
result.push_back({0, unicode_cpt_to_utf8(0xFFFD)}); // replacement character
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static bool match_string(const std::string & input, llama_grammar * grammar) {
|
||||
const auto cpts = unicode_cpts_from_utf8(input);
|
||||
const auto parsed = parse_tokens(input);
|
||||
|
||||
auto & stacks_cur = llama_grammar_get_stacks(grammar);
|
||||
|
||||
for (const auto & cpt : cpts) {
|
||||
llama_grammar_accept(grammar, cpt);
|
||||
for (const auto & in : parsed) {
|
||||
try {
|
||||
llama_grammar_accept_token(*grammar, in.token, in.piece);
|
||||
} catch (const std::runtime_error & /*e*/) {
|
||||
// normally this shouldn't get hit because of llama_grammar_apply
|
||||
return false;
|
||||
}
|
||||
|
||||
if (stacks_cur.empty()) {
|
||||
// no stacks means that the grammar failed to match at this point
|
||||
@@ -426,6 +479,30 @@ static void test_simple_grammar() {
|
||||
"12a45",
|
||||
}
|
||||
);
|
||||
|
||||
// Test case for a simple grammar with tokens
|
||||
test_grammar(
|
||||
"simple grammar with tokens",
|
||||
R"""(
|
||||
root ::= <[10]> content <[11]>
|
||||
content ::= (!<[11]>)*)""",
|
||||
// Passing strings
|
||||
{
|
||||
token(10) + "hello world" + token(11),
|
||||
token(10) + "text with " + token(12) + " other tokens " + token(13) + " mixed in" + token(11),
|
||||
token(10) + token(11),
|
||||
token(10) + token(12) + token(13) + token(14) + token(15) + token(11),
|
||||
token(10) + "a" + token(11),
|
||||
},
|
||||
// Failing strings
|
||||
{
|
||||
token(10) + "missing end token",
|
||||
token(10),
|
||||
"missing start token" + token(11),
|
||||
token(10) + token(11) + token(11), // double end token
|
||||
token(11) + "wrong order" + token(10),
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
static void test_complex_grammar() {
|
||||
@@ -487,6 +564,34 @@ static void test_complex_grammar() {
|
||||
"123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456/",
|
||||
}
|
||||
);
|
||||
|
||||
// Test case for a more complex grammar with tokens
|
||||
test_grammar(
|
||||
"complex grammar with tokens",
|
||||
R"""(
|
||||
root ::= reasoning+ content tool-call*
|
||||
reasoning ::= <[10]> (!<[11]>)* <[11]>
|
||||
content ::= <[20]> (!<[21]>)* <[21]>
|
||||
tool-call ::= <[12]> name <[13]> args <[14]>
|
||||
name ::= (!<[13]>)+
|
||||
args ::= (!<[14]>)*)""",
|
||||
// Passing strings
|
||||
{
|
||||
token(10) + "I am thinking" + token(11) + token(20) + "hello world!" + token(21) + token(12) + "search" + token(13) + "query=test" + token(14),
|
||||
token(10) + "reasoning 1" + token(11) + token(10) + "reasoning 2" + token(11) + token(20) + token(21) + token(12) + "tool" + token(13) + token(14),
|
||||
token(10) + token(11) + token(20) + "content" + token(21),
|
||||
token(10) + "think" + token(12) + " nested" + token(11) + token(20) + token(10) + "more content" + token(21) + token(12) + "fn" + token(13) + "x=1,y=2" + token(14) + token(12) + "fn2" + token(13) + token(14),
|
||||
token(10) + "reasoning" + token(11) + token(10) + "more" + token(11) + token(10) + "even more" + token(11) + token(20) + "text" + token(21) + token(12) + "a" + token(13) + "b" + token(14) + token(12) + "c" + token(13) + "d" + token(14),
|
||||
},
|
||||
// Failing strings
|
||||
{
|
||||
token(20) + "content only" + token(21),
|
||||
token(10) + "no closing reasoning",
|
||||
token(10) + token(11) + token(20) + "no closing content",
|
||||
token(10) + token(11) + token(20) + token(21) + token(12) + "incomplete tool",
|
||||
token(10) + token(11) + token(11) + token(20) + token(21),
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
static void test_special_chars() {
|
||||
|
||||
@@ -515,5 +515,19 @@ int main()
|
||||
{LLAMA_GRETYPE_END, 0},
|
||||
});
|
||||
|
||||
// <[1000]> = "<think>"
|
||||
// <[1001]> = "</think>"
|
||||
verify_parsing(R"""(
|
||||
root ::= <[1000]> !<[1001]> <[1001]>
|
||||
)""", {
|
||||
{"root", 0}
|
||||
}, {
|
||||
// root (index 0)
|
||||
{LLAMA_GRETYPE_TOKEN, 1000},
|
||||
{LLAMA_GRETYPE_TOKEN_NOT, 1001},
|
||||
{LLAMA_GRETYPE_TOKEN, 1001},
|
||||
{LLAMA_GRETYPE_END, 0},
|
||||
});
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -202,7 +202,7 @@ int main()
|
||||
uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point
|
||||
cp[0] = 37 + i;
|
||||
cp[1] = 0;
|
||||
next_candidates[i] = {i, cp, {}};
|
||||
next_candidates[i] = {i, cp, {}, 0};
|
||||
}
|
||||
|
||||
std::vector<std::vector<std::pair<uint32_t, uint16_t>>> expected_reject = {
|
||||
|
||||
177
tools/server/README-dev.md
Normal file
177
tools/server/README-dev.md
Normal file
@@ -0,0 +1,177 @@
|
||||
# llama-server Development Documentation
|
||||
|
||||
This document provides an in-depth technical overview of `llama-server`, intended for maintainers and contributors.
|
||||
|
||||
If you are an end user consuming `llama-server` as a product, please refer to the main [README](./README.md) instead.
|
||||
|
||||
## Backend
|
||||
|
||||
### Overview
|
||||
|
||||
The server supports two primary operating modes:
|
||||
|
||||
- **Inference mode**: The default mode for performing inference with a single loaded GGUF model.
|
||||
- **Router mode**: Enables management of multiple inference server instances behind a single API endpoint. Requests are automatically routed to the appropriate backend instance based on the requested model.
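For example, a single-model server in the default inference mode can be started with the same flags used in the Docker examples further down (the model path, context size, bind address, and port here are illustrative):

```sh
# inference mode: serve one GGUF model over HTTP on port 8080
llama-server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
```

Router mode is configured with its own set of flags; see the router mode PR listed under "Notable Related PRs" below.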
|
||||
|
||||
The core architecture consists of the following components:
|
||||
|
||||
- `server_context`: Holds the primary inference state, including the main `llama_context` and all active slots.
|
||||
- `server_slot`: An abstraction over a single “sequence” in llama.cpp, responsible for managing individual parallel inference requests.
|
||||
- `server_routes`: Middleware layer between `server_context` and the HTTP interface; handles JSON parsing/formatting and request routing logic.
|
||||
- `server_http_context`: Implements the HTTP server using `cpp-httplib`.
|
||||
- `server_queue`: Thread-safe queue used by HTTP workers to submit new tasks to `server_context`.
|
||||
- `server_response`: Thread-safe queue used by `server_context` to return results to HTTP workers.
|
||||
- `server_response_reader`: Higher-level wrapper around the two queues above for cleaner code.
|
||||
- `server_task`: Unit of work pushed into `server_queue`.
|
||||
- `server_task_result`: Unit of result pushed into `server_response`.
|
||||
- `server_tokens`: Unified representation of token sequences (supports both text and multimodal tokens); used by `server_task` and `server_slot`.
|
||||
- `server_prompt_checkpoint`: For recurrent (e.g., RWKV) and SWA models, stores snapshots of KV cache state. Enables reuse when subsequent requests share the same prompt prefix, saving redundant computation.
|
||||
- `server_models`: Standalone component for managing multiple backend instances (used in router mode). It is completely independent of `server_context`.
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
API_User <--> server_http_context
|
||||
server_http_context <-- router mode --> server_models
|
||||
server_http_context <-- inference mode --> server_routes
|
||||
server_routes -- server_task --> server_queue
|
||||
subgraph server_context
|
||||
server_queue --> server_slot
|
||||
server_slot -- server_task_result --> server_response
|
||||
server_slot[multiple server_slot]
|
||||
end
|
||||
server_response --> server_routes
|
||||
```
|
||||
|
||||
### Batching
|
||||
|
||||
The server context maintains a single batch shared across all slots. When `update_slots()` is invoked, the system iterates through all active slots to populate this batch. For each slot, either a generated token from the previous decoding step or available prompt tokens are added to the batch.
|
||||
|
||||
Batching constraints apply: slots can only be batched together if they share compatible configurations. For instance, slots using a specific LoRA adapter can be batched with each other, but not with slots using a different LoRA adapter or no adapter at all.
|
||||
|
||||
Once the batch reaches capacity or all slots have been processed, `llama_decode` is called to execute the inference. This operation represents the primary computational bottleneck in `update_slots()`.
|
||||
|
||||
Following decoding, the system either retrieves embeddings or samples the next token using `common_sampler_sample`. If a slot has remaining prompt tokens to process, it yields until the next `update_slots()` iteration.
|
||||
|
||||
### Thread Management
|
||||
|
||||
`server_context` runs on a dedicated single thread. Because it is single-threaded, heavy post-processing (especially after token generation) should be avoided, as it directly impacts multi-sequence throughput.
|
||||
|
||||
Each incoming HTTP request is handled by its own thread managed by the HTTP library. The following operations are performed in HTTP worker threads:
|
||||
|
||||
- JSON request parsing
|
||||
- Chat template application
|
||||
- Tokenization
|
||||
- Conversion of `server_task_result` into final JSON response
|
||||
- Error formatting into JSON
|
||||
- Tracking of partial/incremental responses (e.g., streaming tool calls or reasoning steps)
|
||||
|
||||
**Best practices to follow:**
|
||||
|
||||
- All JSON formatting and chat template logic must stay in the HTTP layer.
|
||||
- Avoid passing raw JSON between the HTTP layer and `server_slot`. Instead, parse everything into native C++ types as early as possible.
|
||||
|
||||
### Example trace of a request
|
||||
|
||||
Here is an example trace of an API request for text completion:
|
||||
|
||||
- A request arrives at the HTTP layer.
|
||||
- The request is routed to the corresponding handler inside `server_routes`. In this case, `handle_completions_impl` is invoked.
|
||||
- The handler parses the input request, constructs a new `server_task`, and passes it to `server_res_generator`.
|
||||
- `server_res_generator` creates a new `task_result_state` for each task:
|
||||
- `task_result_state` stays in the HTTP layer, responsible for keeping track of the current state of the response (e.g., parsing tool calls or thinking messages).
|
||||
- `server_task` is moved into `server_queue` inside `server_context`.
|
||||
- `server_context` launches the task by moving it into an available slot (see `launch_slot_with_task()`).
|
||||
- `update_slots()` processes the task as described in the "Batching" section above.
|
||||
- Results may be sent using `send_partial_response` or `send_final_response`, which creates a new `server_task_result` and pushes it to the response queue.
|
||||
- At the same time, `server_res_generator` listens to the response queue and retrieves this response.
|
||||
- As the response is stateless, `server_res_generator` calls `response->update()` to update the response with the current state.
|
||||
- `server_res_generator` then calls `response->to_json()` and passes the response to the HTTP layer.
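For reference, a request like the following would exercise this path end to end (assuming the default host/port and the `/completion` endpoint; the payload mirrors the curl example in the main README):

```sh
# text completion request routed to handle_completions_impl
curl --request POST \
    --url http://localhost:8080/completion \
    --header "Content-Type: application/json" \
    --data '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 128}'
```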
|
||||
|
||||
### Testing
|
||||
|
||||
`llama-server` includes an automated test suite based on `pytest`.
|
||||
|
||||
The framework automatically starts a `llama-server` instance, sends requests, and validates responses.
|
||||
|
||||
For detailed instructions, see the [test documentation](./tests/README.md).
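A typical local run might look like the sketch below; the exact prerequisites, helper script name, and options are defined in the linked test README, so treat these commands as an assumption rather than the authoritative procedure:

```sh
# assumes a Python environment; requirements and runner script live in tools/server/tests
cd tools/server/tests
pip install -r requirements.txt
./tests.sh                # or run pytest directly on a specific test file
```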
|
||||
|
||||
### Notable Related PRs
|
||||
|
||||
- Initial server implementation: https://github.com/ggml-org/llama.cpp/pull/1443
|
||||
- Parallel decoding support: https://github.com/ggml-org/llama.cpp/pull/3228
|
||||
- Refactor introducing `server_queue` and `server_response`: https://github.com/ggml-org/llama.cpp/pull/5065
|
||||
- Reranking endpoint: https://github.com/ggml-org/llama.cpp/pull/9510
|
||||
- Multimodal model support (`libmtmd`): https://github.com/ggml-org/llama.cpp/pull/12898
|
||||
- Unified KV cache handling: https://github.com/ggml-org/llama.cpp/pull/16736
|
||||
- Separation of HTTP logic into dedicated files: https://github.com/ggml-org/llama.cpp/pull/17216
|
||||
- Large-scale code base split into smaller files: https://github.com/ggml-org/llama.cpp/pull/17362
|
||||
- Introduction of router mode: https://github.com/ggml-org/llama.cpp/pull/17470
|
||||
- Speculative decoding: https://github.com/ggml-org/llama.cpp/pull/17808 (later reworked in a follow-up PR)
|
||||
|
||||
|
||||
|
||||
|
||||
## Web UI
|
||||
|
||||
The project includes a web-based user interface for interacting with `llama-server`. It supports both single-model (`MODEL` mode) and multi-model (`ROUTER` mode) operation.
|
||||
|
||||
The SvelteKit-based Web UI is introduced in this PR: https://github.com/ggml-org/llama.cpp/pull/14839
|
||||
|
||||
### Features
|
||||
|
||||
- **Chat interface** with streaming responses
|
||||
- **Multi-model support** (ROUTER mode) - switch between models, auto-load on selection
|
||||
- **Modality validation** - ensures selected model supports conversation's attachments (images, audio)
|
||||
- **Conversation management** - branching, regeneration, editing with history preservation
|
||||
- **Attachment support** - images, audio, PDFs (with vision/text fallback)
|
||||
- **Configurable parameters** - temperature, top_p, etc. synced with server defaults
|
||||
- **Dark/light theme**
|
||||
|
||||
### Tech Stack
|
||||
|
||||
- **SvelteKit** - frontend framework with Svelte 5 runes for reactive state
|
||||
- **TailwindCSS** + **shadcn-svelte** - styling and UI components
|
||||
- **Vite** - build tooling
|
||||
- **IndexedDB** (Dexie) - local storage for conversations
|
||||
- **LocalStorage** - user settings persistence
|
||||
|
||||
### Architecture
|
||||
|
||||
The WebUI follows a layered architecture:
|
||||
|
||||
```
|
||||
Routes → Components → Hooks → Stores → Services → Storage/API
|
||||
```
|
||||
|
||||
- **Stores** - reactive state management (`chatStore`, `conversationsStore`, `modelsStore`, `serverStore`, `settingsStore`)
|
||||
- **Services** - stateless API/database communication (`ChatService`, `ModelsService`, `PropsService`, `DatabaseService`)
|
||||
- **Hooks** - reusable logic (`useModelChangeValidation`, `useProcessingState`)
|
||||
|
||||
For detailed architecture diagrams, see [`tools/server/webui/docs/`](webui/docs/):
|
||||
|
||||
- `high-level-architecture.mmd` - full architecture with all modules
|
||||
- `high-level-architecture-simplified.mmd` - simplified overview
|
||||
- `data-flow-simplified-model-mode.mmd` - data flow for single-model mode
|
||||
- `data-flow-simplified-router-mode.mmd` - data flow for multi-model mode
|
||||
- `flows/*.mmd` - detailed per-domain flows (chat, conversations, models, etc.)
|
||||
|
||||
### Development
|
||||
|
||||
```sh
|
||||
# make sure you have Node.js installed
|
||||
cd tools/server/webui
|
||||
npm i
|
||||
|
||||
# run dev server (with hot reload)
|
||||
npm run dev
|
||||
|
||||
# run tests
|
||||
npm run test
|
||||
|
||||
# build production bundle
|
||||
npm run build
|
||||
```
|
||||
|
||||
After `public/index.html.gz` has been generated, rebuild `llama-server` as described in the [build](#build) section to include the updated UI.
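For example, from the repository root (this is the same build target shown in the main README's build section):

```sh
# rebuild llama-server so it embeds the regenerated web UI assets
cmake --build build --config Release -t llama-server
```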
|
||||
|
||||
**Note:** The Vite dev server automatically proxies API requests to `http://localhost:8080`. Make sure `llama-server` is running on that port during development.
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/yhirose/cpp-httplib), [nlohmann::json](https://github.com/nlohmann/json) and **llama.cpp**.
|
||||
|
||||
Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
|
||||
Set of LLM REST APIs and a web UI to interact with llama.cpp.
|
||||
|
||||
**Features:**
|
||||
* LLM inference of F16 and quantized models on GPU and CPU
|
||||
@@ -19,7 +19,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
|
||||
* Speculative decoding
|
||||
* Easy-to-use web UI
|
||||
|
||||
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggml-org/llama.cpp/issues/4216).
|
||||
For the full list of features, please refer to [server's changelog](https://github.com/ggml-org/llama.cpp/issues/9291)
|
||||
|
||||
## Usage
|
||||
|
||||
@@ -289,69 +289,6 @@ For more details, please refer to [multimodal documentation](../../docs/multimod
|
||||
cmake --build build --config Release -t llama-server
|
||||
```
|
||||
|
||||
## Web UI
|
||||
|
||||
The project includes a web-based user interface for interacting with `llama-server`. It supports both single-model (`MODEL` mode) and multi-model (`ROUTER` mode) operation.
|
||||
|
||||
### Features
|
||||
|
||||
- **Chat interface** with streaming responses
|
||||
- **Multi-model support** (ROUTER mode) - switch between models, auto-load on selection
|
||||
- **Modality validation** - ensures selected model supports conversation's attachments (images, audio)
|
||||
- **Conversation management** - branching, regeneration, editing with history preservation
|
||||
- **Attachment support** - images, audio, PDFs (with vision/text fallback)
|
||||
- **Configurable parameters** - temperature, top_p, etc. synced with server defaults
|
||||
- **Dark/light theme**
|
||||
|
||||
### Tech Stack
|
||||
|
||||
- **SvelteKit** - frontend framework with Svelte 5 runes for reactive state
|
||||
- **TailwindCSS** + **shadcn-svelte** - styling and UI components
|
||||
- **Vite** - build tooling
|
||||
- **IndexedDB** (Dexie) - local storage for conversations
|
||||
- **LocalStorage** - user settings persistence
|
||||
|
||||
### Architecture
|
||||
|
||||
The WebUI follows a layered architecture:
|
||||
|
||||
```
|
||||
Routes → Components → Hooks → Stores → Services → Storage/API
|
||||
```
|
||||
|
||||
- **Stores** - reactive state management (`chatStore`, `conversationsStore`, `modelsStore`, `serverStore`, `settingsStore`)
|
||||
- **Services** - stateless API/database communication (`ChatService`, `ModelsService`, `PropsService`, `DatabaseService`)
|
||||
- **Hooks** - reusable logic (`useModelChangeValidation`, `useProcessingState`)
|
||||
|
||||
For detailed architecture diagrams, see [`tools/server/webui/docs/`](webui/docs/):
|
||||
|
||||
- `high-level-architecture.mmd` - full architecture with all modules
|
||||
- `high-level-architecture-simplified.mmd` - simplified overview
|
||||
- `data-flow-simplified-model-mode.mmd` - data flow for single-model mode
|
||||
- `data-flow-simplified-router-mode.mmd` - data flow for multi-model mode
|
||||
- `flows/*.mmd` - detailed per-domain flows (chat, conversations, models, etc.)
|
||||
|
||||
### Development
|
||||
|
||||
```sh
|
||||
# make sure you have Node.js installed
|
||||
cd tools/server/webui
|
||||
npm i
|
||||
|
||||
# run dev server (with hot reload)
|
||||
npm run dev
|
||||
|
||||
# run tests
|
||||
npm run test
|
||||
|
||||
# build production bundle
|
||||
npm run build
|
||||
```
|
||||
|
||||
After `public/index.html.gz` has been generated, rebuild `llama-server` as described in the [build](#build) section to include the updated UI.
|
||||
|
||||
**Note:** The Vite dev server automatically proxies API requests to `http://localhost:8080`. Make sure `llama-server` is running on that port during development.
|
||||
|
||||
## Quick Start
|
||||
|
||||
To get started right away, run the following command, making sure to use the correct path for the model you have:
|
||||
@@ -380,7 +317,7 @@ docker run -p 8080:8080 -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:se
|
||||
docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggml-org/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
|
||||
```
|
||||
|
||||
## Testing with CURL
|
||||
## Using with CURL
|
||||
|
||||
Using [curl](https://curl.se/). On Windows, `curl.exe` should be available in the base OS.
|
||||
|
||||
@@ -391,46 +328,6 @@ curl --request POST \
|
||||
--data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
|
||||
```
|
||||
|
||||
## Advanced testing
|
||||
|
||||
We implemented a [server test framework](./tests/README.md) using human-readable scenarios.
|
||||
|
||||
*Before submitting an issue, please try to reproduce it with this format.*
|
||||
|
||||
## Node JS Test
|
||||
|
||||
You need to have [Node.js](https://nodejs.org/en) installed.
|
||||
|
||||
```bash
|
||||
mkdir llama-client
|
||||
cd llama-client
|
||||
```
|
||||
|
||||
Create an index.js file and put this inside:
|
||||
|
||||
```javascript
|
||||
const prompt = "Building a website can be done in 10 simple steps:"
|
||||
|
||||
async function test() {
|
||||
let response = await fetch("http://127.0.0.1:8080/completion", {
|
||||
method: "POST",
|
||||
body: JSON.stringify({
|
||||
prompt,
|
||||
n_predict: 64,
|
||||
})
|
||||
})
|
||||
console.log((await response.json()).content)
|
||||
}
|
||||
|
||||
test()
|
||||
```
|
||||
|
||||
And run it:
|
||||
|
||||
```bash
|
||||
node index.js
|
||||
```
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### GET `/health`: Returns health check result
|
||||
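
A quick way to probe the endpoint from a script; this sketch assumes the default `http://localhost:8080` and makes no assumption about the exact JSON body, which can differ between versions:

```javascript
// Illustrative health probe: prints the HTTP status and raw body.
fetch("http://localhost:8080/health")
    .then(async (res) => console.log(res.status, await res.text()))
    .catch((err) => console.error("llama-server not reachable:", err));
```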
@@ -495,6 +392,8 @@ By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to re
|
||||
|
||||
`n_cmpl`: Number of completions to generate from the current prompt. If the input contains multiple prompts, the output will contain `n_cmpl` entries for each prompt (N prompts × `n_cmpl` entries in total). See the example request below.

`n_cache_reuse`: Minimum chunk size to attempt reusing from the cache via KV shifting. For more info, see the `--cache-reuse` argument. Default: `0` (disabled).
|
||||
|
||||
`stream`: Allows receiving each predicted token in real-time instead of waiting for the completion to finish (uses a different response format). To enable this, set to `true`.
|
||||
|
||||
`stop`: Specify a JSON array of stopping strings.
|
||||
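
As an illustration of the parameters above, here is a minimal `/completion` request using Node.js `fetch`. The prompt and parameter values are arbitrary examples rather than recommended settings, and the server is assumed to listen on the default `http://localhost:8080`:

```javascript
// Illustrative /completion request exercising n_cmpl, n_cache_reuse and stop.
async function completeExample() {
    const res = await fetch("http://localhost:8080/completion", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({
            prompt: "Building a website can be done in 10 simple steps:",
            n_predict: 64,
            n_cmpl: 2,          // two completions for this single prompt
            n_cache_reuse: 256, // try to reuse cached chunks of >= 256 matching tokens
            stop: ["\n\n"],     // stop generation at the first blank line
            stream: false,      // set to true to receive tokens incrementally instead
        }),
    });

    const data = await res.json();
    // With n_cmpl > 1, expect one entry per requested completion.
    console.log(data);
}

completeExample();
```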
@@ -1636,6 +1535,22 @@ Response:
|
||||
}
|
||||
```
|
||||
|
||||
## API errors
|
||||
|
||||
`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
|
||||
|
||||
Example of an error:
|
||||
|
||||
```json
|
||||
{
|
||||
"error": {
|
||||
"code": 401,
|
||||
"message": "Invalid API Key",
|
||||
"type": "authentication_error"
|
||||
}
|
||||
}
|
||||
```
|
||||
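
A client can check for this shape before reading the normal response fields. A minimal, illustrative handler follows; the error object layout comes from the example above, and how to react to it is up to the client:

```javascript
// Illustrative handling of the OAI-style error object returned by llama-server.
async function postCompletion(body) {
    const res = await fetch("http://localhost:8080/completion", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(body),
    });

    const data = await res.json();
    if (data.error) {
        // e.g. code = 401, type = "authentication_error", message = "Invalid API Key"
        throw new Error(`${data.error.type} (${data.error.code}): ${data.error.message}`);
    }
    return data;
}
```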
|
||||
## More examples
|
||||
|
||||
### Interactive mode
|
||||
@@ -1655,26 +1570,6 @@ Run with bash:
|
||||
bash chat.sh
|
||||
```
|
||||
|
||||
### OAI-like API
|
||||
|
||||
The HTTP `llama-server` supports an OAI-like API: https://github.com/openai/openai-openapi
|
||||
|
||||
### API errors
|
||||
|
||||
`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
|
||||
|
||||
Example of an error:
|
||||
|
||||
```json
|
||||
{
|
||||
"error": {
|
||||
"code": 401,
|
||||
"message": "Invalid API Key",
|
||||
"type": "authentication_error"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Apart from error types supported by OAI, we also have custom types that are specific to functionalities of llama.cpp:
|
||||
|
||||
**When /metrics or /slots endpoint is disabled**
|
||||
|
||||
@@ -18,11 +18,13 @@ const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
|
||||
#define SLT_CNT(slot, fmt, ...) LOG_CNT("" fmt, __VA_ARGS__)
|
||||
#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
|
||||
#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
|
||||
#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
|
||||
|
||||
#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define SRV_CNT(fmt, ...) LOG_CNT("" fmt, __VA_ARGS__)
|
||||
#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
||||
|
||||
@@ -102,6 +102,11 @@ struct server_slot {
|
||||
std::string generated_text;
|
||||
llama_tokens generated_tokens;
|
||||
|
||||
// idx of draft tokens in the main batch
|
||||
// non-empty if draft tokens have been added to the batch for evaluation
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/17808
|
||||
std::vector<int32_t> i_batch_dft;
|
||||
|
||||
std::vector<completion_token_output> generated_token_probs;
|
||||
|
||||
bool has_next_token = true;
|
||||
@@ -150,7 +155,8 @@ struct server_slot {
|
||||
|
||||
struct common_sampler * smpl = nullptr;
|
||||
|
||||
llama_token sampled;
|
||||
llama_token sampled; // in speculative mode, this is the last accepted token
|
||||
llama_tokens drafted;
|
||||
|
||||
// stats
|
||||
size_t n_sent_text = 0; // number of sent text characters
|
||||
@@ -180,6 +186,8 @@ struct server_slot {
|
||||
stopping_word = "";
|
||||
n_sent_text = 0;
|
||||
|
||||
drafted.clear();
|
||||
i_batch_dft.clear();
|
||||
generated_tokens.clear();
|
||||
generated_token_probs.clear();
|
||||
json_schema = json();
|
||||
@@ -255,6 +263,31 @@ struct server_slot {
|
||||
generated_token_probs.push_back(token);
|
||||
}
|
||||
|
||||
int get_n_draft_max() const {
|
||||
if (!can_speculate()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// determine the max draft that fits the current slot state
|
||||
int n_draft_max = task->params.speculative.n_max;
|
||||
|
||||
// note: slot.prompt is not yet expanded with the last sampled token (slot.sampled)
|
||||
// also, need to leave space for 1 extra token to allow context shifts
|
||||
n_draft_max = std::min(n_draft_max, n_ctx - prompt.n_tokens() - 2);
|
||||
|
||||
if (n_remaining > 0) {
|
||||
n_draft_max = std::min(n_draft_max, n_remaining - 1);
|
||||
}
|
||||
|
||||
SLT_DBG(*this, "max possible draft: %d\n", n_draft_max);
|
||||
|
||||
if (n_draft_max < task->params.speculative.n_min) {
|
||||
SLT_DBG(*this, "the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, task->params.speculative.n_min);
|
||||
n_draft_max = 0;
|
||||
}
|
||||
return n_draft_max;
|
||||
}
|
||||
|
||||
// note: a slot can also be either a parent or a child
|
||||
bool is_parent() const {
|
||||
return is_processing() && task->n_children > 0;
|
||||
@@ -353,8 +386,7 @@ struct server_slot {
|
||||
|
||||
if (n_draft_total > 0) {
|
||||
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
|
||||
SLT_INF(*this,
|
||||
"\n"
|
||||
SLT_CNT(*this,
|
||||
"draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
|
||||
draft_ratio, n_draft_accepted, n_draft_total
|
||||
);
|
||||
@@ -1774,14 +1806,57 @@ struct server_context_impl {
|
||||
continue;
|
||||
}
|
||||
|
||||
slot.i_batch = batch.n_tokens;
|
||||
// generate draft tokens in speculative decoding mode
|
||||
// TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK]
|
||||
// perform the speculative drafting for all sequences at the same time in a single batch
|
||||
int n_draft_max = slot.get_n_draft_max();
|
||||
if (n_draft_max > 0) {
|
||||
if (mctx) {
|
||||
// we should never reach this, as speculative is automatically disabled if mmproj is loaded
|
||||
GGML_ABORT("not supported by multimodal");
|
||||
}
|
||||
|
||||
common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true);
|
||||
struct common_speculative_params params_spec;
|
||||
params_spec.n_draft = n_draft_max;
|
||||
params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.task->params.speculative.n_max;
|
||||
params_spec.p_min = slot.task->params.speculative.p_min;
|
||||
const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens();
|
||||
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, slot.sampled);
|
||||
|
||||
slot.prompt.tokens.push_back(slot.sampled);
|
||||
// add the sampled token to the batch
|
||||
slot.i_batch_dft.push_back(batch.n_tokens);
|
||||
common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true);
|
||||
slot.prompt.tokens.push_back(slot.sampled);
|
||||
|
||||
SLT_DBG(slot, "slot decode token, n_ctx = %d, n_tokens = %d, truncated = %d\n",
|
||||
slot.n_ctx, slot.prompt.n_tokens(), slot.truncated);
|
||||
if (slot.task->params.speculative.n_min > (int) draft.size()) {
|
||||
SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.task->params.speculative.n_min);
|
||||
// fallback to normal decoding
|
||||
slot.i_batch = slot.i_batch_dft[0];
|
||||
slot.drafted.clear();
|
||||
slot.i_batch_dft.clear();
|
||||
} else {
|
||||
// keep track of total number of drafted tokens tested
|
||||
slot.n_draft_total += draft.size();
|
||||
|
||||
// add all drafted tokens to the batch
|
||||
for (size_t i = 0; i < draft.size(); i++) {
|
||||
slot.i_batch_dft.push_back(batch.n_tokens);
|
||||
common_batch_add(batch, draft[i], slot.prompt.tokens.pos_next(), { slot.id }, true);
|
||||
slot.prompt.tokens.push_back(draft[i]);
|
||||
}
|
||||
slot.drafted = std::move(draft);
|
||||
}
|
||||
} else {
|
||||
// no speculative decoding
|
||||
slot.i_batch = batch.n_tokens;
|
||||
|
||||
common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true);
|
||||
|
||||
slot.prompt.tokens.push_back(slot.sampled);
|
||||
|
||||
SLT_DBG(slot, "slot decode token, n_ctx = %d, n_tokens = %d, truncated = %d\n",
|
||||
slot.n_ctx, slot.prompt.n_tokens(), slot.truncated);
|
||||
}
|
||||
}
|
||||
|
||||
// process in chunks of params.n_batch
|
||||
@@ -1880,8 +1955,18 @@ struct server_context_impl {
|
||||
n_past = std::min(n_past, slot.alora_invocation_start - 1);
|
||||
}
|
||||
|
||||
const auto n_cache_reuse = slot.task->params.n_cache_reuse;
|
||||
|
||||
const bool can_cache_reuse =
|
||||
llama_memory_can_shift(llama_get_memory(ctx)) &&
|
||||
!slot.prompt.tokens.has_mtmd;
|
||||
|
||||
if (!can_cache_reuse && n_cache_reuse > 0) {
|
||||
SLT_WRN(slot, "cache reuse is not supported - ignoring n_cache_reuse = %d\n", n_cache_reuse);
|
||||
}
|
||||
|
||||
// reuse chunks from the cached prompt by shifting their KV cache in the new position
|
||||
if (params_base.n_cache_reuse > 0) {
|
||||
if (can_cache_reuse && n_cache_reuse > 0) {
|
||||
GGML_ASSERT(!slot.prompt.tokens.has_mtmd);
|
||||
|
||||
size_t head_c = n_past; // cache
|
||||
@@ -1892,7 +1977,7 @@ struct server_context_impl {
|
||||
GGML_ABORT("not supported by multimodal");
|
||||
}
|
||||
|
||||
SLT_DBG(slot, "trying to reuse chunks with size > %d, n_past = %d\n", params_base.n_cache_reuse, n_past);
|
||||
SLT_DBG(slot, "trying to reuse chunks with size > %d, n_past = %d\n", n_cache_reuse, n_past);
|
||||
|
||||
while (head_c < slot.prompt.tokens.size() &&
|
||||
head_p < input_tokens.size()) {
|
||||
@@ -1901,11 +1986,10 @@ struct server_context_impl {
|
||||
while (head_c + n_match < slot.prompt.tokens.size() &&
|
||||
head_p + n_match < input_tokens.size() &&
|
||||
slot.prompt.tokens[head_c + n_match] == input_tokens[head_p + n_match]) {
|
||||
|
||||
n_match++;
|
||||
}
|
||||
|
||||
if (n_match >= (size_t) params_base.n_cache_reuse) {
|
||||
if (n_match >= (size_t) n_cache_reuse) {
|
||||
SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
|
||||
//for (size_t i = head_p; i < head_p + n_match; i++) {
|
||||
// SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
|
||||
@@ -2336,6 +2420,10 @@ struct server_context_impl {
|
||||
// on successful decode, restore the original batch size
|
||||
n_batch = llama_n_batch(ctx);
|
||||
|
||||
// technically, measuring the time here excludes the sampling time for the last batch
|
||||
// but on the other hand, we don't want to do too many system calls to measure the time, so it's ok
|
||||
const int64_t t_current = ggml_time_us();
|
||||
|
||||
for (auto & slot : slots) {
|
||||
// may need to copy state to other slots
|
||||
if (slot.state == SLOT_STATE_DONE_PROMPT && slot.is_parent()) {
|
||||
@@ -2390,6 +2478,10 @@ struct server_context_impl {
|
||||
continue; // continue loop of slots
|
||||
}
|
||||
|
||||
if (slot.i_batch_dft.size() > 0) {
|
||||
continue; // sample using speculative decoding
|
||||
}
|
||||
|
||||
const int tok_idx = slot.i_batch - i;
|
||||
|
||||
llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
|
||||
@@ -2400,8 +2492,6 @@ struct server_context_impl {
|
||||
|
||||
slot.n_decoded += 1;
|
||||
|
||||
const int64_t t_current = ggml_time_us();
|
||||
|
||||
if (slot.n_decoded == 1) {
|
||||
slot.t_start_generation = t_current;
|
||||
slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
|
||||
@@ -2430,84 +2520,32 @@ struct server_context_impl {
|
||||
}
|
||||
}
|
||||
|
||||
// do speculative decoding
|
||||
// TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK]
|
||||
// perform the speculative drafting for all sequences at the same time in a single batch
|
||||
// speculative decoding - main model sample and accept
|
||||
for (auto & slot : slots) {
|
||||
if (!slot.is_processing() || !slot.can_speculate()) {
|
||||
if (slot.state != SLOT_STATE_GENERATING || slot.i_batch_dft.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (slot.state != SLOT_STATE_GENERATING) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (mctx) {
|
||||
// we should never reach this, as speculative is automatically disabled if mmproj is loaded
|
||||
GGML_ABORT("not supported by multimodal");
|
||||
}
|
||||
|
||||
// determine the max draft that fits the current slot state
|
||||
int n_draft_max = slot.task->params.speculative.n_max;
|
||||
|
||||
// note: slot.prompt is not yet expanded with the `id` token sampled above
|
||||
// also, need to leave space for 1 extra token to allow context shifts
|
||||
n_draft_max = std::min(n_draft_max, slot.n_ctx - slot.prompt.n_tokens() - 2);
|
||||
|
||||
if (slot.n_remaining > 0) {
|
||||
n_draft_max = std::min(n_draft_max, slot.n_remaining - 1);
|
||||
}
|
||||
|
||||
SLT_DBG(slot, "max possible draft: %d\n", n_draft_max);
|
||||
|
||||
if (n_draft_max < slot.task->params.speculative.n_min) {
|
||||
SLT_DBG(slot, "the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, slot.task->params.speculative.n_min);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
llama_token id = slot.sampled;
|
||||
|
||||
struct common_speculative_params params_spec;
|
||||
params_spec.n_draft = n_draft_max;
|
||||
params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.task->params.speculative.n_max;
|
||||
params_spec.p_min = slot.task->params.speculative.p_min;
|
||||
|
||||
const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens();
|
||||
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, id);
|
||||
|
||||
// ignore small drafts
|
||||
if (slot.task->params.speculative.n_min > (int) draft.size()) {
|
||||
SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.task->params.speculative.n_min);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// keep track of total number of drafted tokens tested
|
||||
slot.n_draft_total += draft.size();
|
||||
|
||||
// construct the speculation batch
|
||||
common_batch_clear(slot.batch_spec);
|
||||
common_batch_add (slot.batch_spec, id, slot.prompt.tokens.pos_next(), { slot.id }, true);
|
||||
|
||||
for (size_t i = 0; i < draft.size(); ++i) {
|
||||
common_batch_add(slot.batch_spec, draft[i], slot.prompt.tokens.pos_next() + 1 + i, { slot.id }, true);
|
||||
}
|
||||
|
||||
SLT_DBG(slot, "decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens);
|
||||
|
||||
llama_decode(ctx, slot.batch_spec);
|
||||
size_t n_draft = slot.drafted.size();
|
||||
|
||||
// the accepted tokens from the speculation
|
||||
const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft);
|
||||
const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, slot.i_batch_dft, slot.drafted);
|
||||
slot.i_batch_dft.clear();
|
||||
slot.drafted.clear();
|
||||
|
||||
slot.n_decoded += ids.size();
|
||||
|
||||
slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
|
||||
|
||||
// update how many tokens out of those tested were accepted
|
||||
slot.n_draft_accepted += ids.size() - 1;
|
||||
|
||||
slot.prompt.tokens.push_back(id);
|
||||
// rollback to the state before sampling the draft tokens
|
||||
slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);
|
||||
|
||||
// add accepted tokens to the prompt
|
||||
slot.prompt.tokens.insert({ids.begin(), ids.end() - 1});
|
||||
slot.sampled = ids.back(); // last accepted token
|
||||
|
||||
llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.prompt.n_tokens(), -1);
|
||||
|
||||
@@ -2530,7 +2568,7 @@ struct server_context_impl {
|
||||
}
|
||||
}
|
||||
|
||||
SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.prompt.n_tokens());
|
||||
SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int) ids.size() - 1, (int) slot.drafted.size(), slot.prompt.n_tokens());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2551,6 +2589,10 @@ struct server_context_impl {
|
||||
int get_slot_n_ctx() {
|
||||
return slots.back().n_ctx;
|
||||
}
|
||||
|
||||
server_response_reader get_response_reader() {
|
||||
return server_response_reader(queue_tasks, queue_results, HTTP_POLLING_SECONDS);
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
@@ -2580,8 +2622,8 @@ llama_context * server_context::get_llama_context() const {
|
||||
return impl->ctx;
|
||||
}
|
||||
|
||||
std::pair<server_queue &, server_response &> server_context::get_queues() {
|
||||
return { impl->queue_tasks, impl->queue_results };
|
||||
server_response_reader server_context::get_response_reader() {
|
||||
return impl->get_response_reader();
|
||||
}
|
||||
|
||||
|
||||
@@ -2590,7 +2632,7 @@ std::pair<server_queue &, server_response &> server_context::get_queues() {
|
||||
struct server_res_generator : server_http_res {
|
||||
server_response_reader rd;
|
||||
server_res_generator(server_context_impl & ctx_server)
|
||||
: rd({ctx_server.queue_tasks, ctx_server.queue_results}, HTTP_POLLING_SECONDS) {}
|
||||
: rd(ctx_server.queue_tasks, ctx_server.queue_results, HTTP_POLLING_SECONDS) {}
|
||||
void ok(const json & response_data) {
|
||||
status = 200;
|
||||
data = safe_json_to_str(response_data);
|
||||
@@ -2623,9 +2665,6 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
|
||||
try {
|
||||
std::vector<server_task> tasks;
|
||||
|
||||
// tracking generation state and partial tool calls
|
||||
std::vector<task_result_state> states;
|
||||
|
||||
const auto & prompt = data.at("prompt");
|
||||
// TODO: this log can become very long, put it behind a flag or think about a more compact format
|
||||
//SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
|
||||
@@ -2641,7 +2680,6 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
|
||||
inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
|
||||
}
|
||||
tasks.reserve(inputs.size());
|
||||
states.reserve(inputs.size());
|
||||
int idx = 0;
|
||||
for (size_t i = 0; i < inputs.size(); i++) {
|
||||
server_task task = server_task(type);
|
||||
@@ -2660,7 +2698,6 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
|
||||
task.params.res_type = res_type;
|
||||
task.params.oaicompat_cmpl_id = completion_id;
|
||||
task.params.oaicompat_model = ctx_server.model_name;
|
||||
states.push_back(task.params.oaicompat_chat_syntax);
|
||||
|
||||
if (task.params.n_cmpl > 1) {
|
||||
task.n_children = task.params.n_cmpl - 1;
|
||||
@@ -2669,7 +2706,6 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
|
||||
task.id,
|
||||
ctx_server.queue_tasks.get_new_id(),
|
||||
idx++);
|
||||
states.push_back(child.params.oaicompat_chat_syntax);
|
||||
tasks.push_back(std::move(child));
|
||||
}
|
||||
}
|
||||
@@ -2677,7 +2713,6 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
|
||||
tasks.push_back(std::move(task));
|
||||
}
|
||||
|
||||
rd.set_states(std::move(states));
|
||||
rd.post_tasks(std::move(tasks));
|
||||
} catch (const std::exception & e) {
|
||||
res->error(format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
|
||||
@@ -3407,7 +3442,7 @@ void server_routes::init_routes() {
|
||||
|
||||
// create and queue the task
|
||||
json responses = json::array();
|
||||
server_response_reader rd({ctx_server.queue_tasks, ctx_server.queue_results}, HTTP_POLLING_SECONDS);
|
||||
server_response_reader rd = ctx_server.get_response_reader();
|
||||
{
|
||||
std::vector<server_task> tasks;
|
||||
tasks.reserve(documents.size());
|
||||
@@ -3667,7 +3702,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_embeddings_impl(cons
|
||||
|
||||
// create and queue the task
|
||||
json responses = json::array();
|
||||
server_response_reader rd({ctx_server.queue_tasks, ctx_server.queue_results}, HTTP_POLLING_SECONDS);
|
||||
server_response_reader rd = ctx_server.get_response_reader();
|
||||
{
|
||||
std::vector<server_task> tasks;
|
||||
for (size_t i = 0; i < tokenized_prompts.size(); i++) {
|
||||
|
||||
@@ -31,9 +31,8 @@ struct server_context {
|
||||
// get the underlying llama_context
|
||||
llama_context * get_llama_context() const;
|
||||
|
||||
// get the underlaying queue_tasks and queue_results
|
||||
// used by CLI application
|
||||
std::pair<server_queue &, server_response &> get_queues();
|
||||
// get a new response reader, used by CLI application
|
||||
server_response_reader get_response_reader();
|
||||
};
|
||||
|
||||
|
||||
|
||||
@@ -271,12 +271,21 @@ void server_response::terminate() {
|
||||
// server_response_reader
|
||||
//
|
||||
|
||||
void server_response_reader::set_states(std::vector<task_result_state> && states) {
|
||||
this->states = std::move(states);
|
||||
void server_response_reader::post_task(server_task && task) {
|
||||
GGML_ASSERT(id_tasks.empty() && "post_task() can only be called once per reader");
|
||||
id_tasks.insert(task.id);
|
||||
states.push_back(task.create_state());
|
||||
queue_results.add_waiting_task_id(task.id);
|
||||
queue_tasks.post(std::move(task));
|
||||
}
|
||||
|
||||
void server_response_reader::post_tasks(std::vector<server_task> && tasks) {
|
||||
GGML_ASSERT(id_tasks.empty() && "post_tasks() can only be called once per reader");
|
||||
id_tasks = server_task::get_list_id(tasks);
|
||||
states.reserve(tasks.size());
|
||||
for (size_t i = 0; i < tasks.size(); i++) {
|
||||
states.push_back(tasks[i].create_state());
|
||||
}
|
||||
queue_results.add_waiting_tasks(tasks);
|
||||
queue_tasks.post(std::move(tasks));
|
||||
}
|
||||
|
||||
@@ -129,13 +129,13 @@ struct server_response_reader {
|
||||
std::vector<task_result_state> states;
|
||||
|
||||
// should_stop function will be called each polling_interval_seconds
|
||||
server_response_reader(std::pair<server_queue &, server_response &> server_queues, int polling_interval_seconds)
|
||||
: queue_tasks(server_queues.first), queue_results(server_queues.second), polling_interval_seconds(polling_interval_seconds) {}
|
||||
server_response_reader(server_queue & queue_tasks, server_response & queue_results, int polling_interval_seconds)
|
||||
: queue_tasks(queue_tasks), queue_results(queue_results), polling_interval_seconds(polling_interval_seconds) {}
|
||||
~server_response_reader() {
|
||||
stop();
|
||||
}
|
||||
|
||||
void set_states(std::vector<task_result_state> && states);
|
||||
void post_task(server_task && tasks);
|
||||
void post_tasks(std::vector<server_task> && tasks);
|
||||
bool has_next() const;
|
||||
|
||||
|
||||
@@ -155,11 +155,12 @@ task_params server_task::params_from_json_cmpl(
|
||||
|
||||
// Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
|
||||
task_params defaults;
|
||||
defaults.sampling = params_base.sampling;
|
||||
defaults.speculative = params_base.speculative;
|
||||
defaults.n_keep = params_base.n_keep;
|
||||
defaults.n_predict = params_base.n_predict;
|
||||
defaults.antiprompt = params_base.antiprompt;
|
||||
defaults.sampling = params_base.sampling;
|
||||
defaults.speculative = params_base.speculative;
|
||||
defaults.n_keep = params_base.n_keep;
|
||||
defaults.n_predict = params_base.n_predict;
|
||||
defaults.n_cache_reuse = params_base.n_cache_reuse;
|
||||
defaults.antiprompt = params_base.antiprompt;
|
||||
|
||||
// enabling this will output extra debug information in the HTTP responses from the server
|
||||
params.verbose = params_base.verbosity > 9;
|
||||
@@ -176,6 +177,7 @@ task_params server_task::params_from_json_cmpl(
|
||||
params.n_keep = json_value(data, "n_keep", defaults.n_keep);
|
||||
params.n_discard = json_value(data, "n_discard", defaults.n_discard);
|
||||
params.n_cmpl = json_value(data, "n_cmpl", json_value(data, "n", 1));
|
||||
params.n_cache_reuse = json_value(data, "n_cache_reuse", defaults.n_cache_reuse);
|
||||
//params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
|
||||
params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
|
||||
params.response_fields = json_value(data, "response_fields", std::vector<std::string>());
|
||||
|
||||
@@ -55,6 +55,8 @@ struct task_params {
|
||||
int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters
|
||||
int32_t n_cmpl = 1; // number of completions to generate from this prompt
|
||||
|
||||
int32_t n_cache_reuse = 0; // min chunk size to attempt reusing from the cache via KV shifting (0 = disabled)
|
||||
|
||||
int64_t t_max_prompt_ms = -1; // TODO: implement
|
||||
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
|
||||
|
||||
@@ -62,18 +64,19 @@ struct task_params {
|
||||
|
||||
std::vector<std::string> antiprompt;
|
||||
std::vector<std::string> response_fields;
|
||||
bool timings_per_token = false;
|
||||
|
||||
bool timings_per_token = false;
|
||||
bool post_sampling_probs = false;
|
||||
|
||||
struct common_params_sampling sampling;
|
||||
struct common_params_speculative speculative;
|
||||
|
||||
// response formatting
|
||||
bool verbose = false;
|
||||
task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
|
||||
std::string oaicompat_model;
|
||||
std::string oaicompat_cmpl_id;
|
||||
common_chat_syntax oaicompat_chat_syntax;
|
||||
bool verbose = false;
|
||||
task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
|
||||
std::string oaicompat_model;
|
||||
std::string oaicompat_cmpl_id;
|
||||
common_chat_syntax oaicompat_chat_syntax;
|
||||
|
||||
// Embeddings
|
||||
int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
|
||||
@@ -82,6 +85,25 @@ struct task_params {
|
||||
json to_json(bool only_metrics = false) const;
|
||||
};
|
||||
|
||||
// struct for tracking the state of a task (e.g., for streaming)
|
||||
struct task_result_state {
|
||||
// tracking diffs for partial tool calls
|
||||
std::vector<common_chat_msg_diff> diffs;
|
||||
common_chat_syntax oaicompat_chat_syntax;
|
||||
common_chat_msg chat_msg;
|
||||
std::string generated_text; // append new chunks of generated text here
|
||||
std::vector<std::string> generated_tool_call_ids;
|
||||
|
||||
task_result_state(const common_chat_syntax & oaicompat_chat_syntax)
|
||||
: oaicompat_chat_syntax(oaicompat_chat_syntax) {}
|
||||
|
||||
// parse partial tool calls and update the internal state
|
||||
common_chat_msg update_chat_msg(
|
||||
const std::string & text_added,
|
||||
bool is_partial,
|
||||
std::vector<common_chat_msg_diff> & diffs);
|
||||
};
|
||||
|
||||
struct server_task {
|
||||
int id = -1; // to be filled by server_queue
|
||||
int index = -1; // used when there are multiple prompts (batch request)
|
||||
@@ -146,6 +168,12 @@ struct server_task {
|
||||
copy.tokens = tokens.clone();
|
||||
return copy;
|
||||
}
|
||||
|
||||
// the task will be moved into queue, then onto slots
|
||||
// however, the state must be kept by caller (e.g., HTTP thread)
|
||||
task_result_state create_state() const {
|
||||
return task_result_state(params.oaicompat_chat_syntax);
|
||||
}
|
||||
};
|
||||
|
||||
struct result_timings {
|
||||
@@ -177,25 +205,6 @@ struct result_prompt_progress {
|
||||
json to_json() const;
|
||||
};
|
||||
|
||||
// struct for tracking the state of a task (e.g., for streaming)
|
||||
struct task_result_state {
|
||||
// tracking diffs for partial tool calls
|
||||
std::vector<common_chat_msg_diff> diffs;
|
||||
common_chat_syntax oaicompat_chat_syntax;
|
||||
common_chat_msg chat_msg;
|
||||
std::string generated_text; // append new chunks of generated text here
|
||||
std::vector<std::string> generated_tool_call_ids;
|
||||
|
||||
task_result_state(const common_chat_syntax & oaicompat_chat_syntax)
|
||||
: oaicompat_chat_syntax(oaicompat_chat_syntax) {}
|
||||
|
||||
// parse partial tool calls and update the internal state
|
||||
common_chat_msg update_chat_msg(
|
||||
const std::string & text_added,
|
||||
bool is_partial,
|
||||
std::vector<common_chat_msg_diff> & diffs);
|
||||
};
|
||||
|
||||
struct server_task_result {
|
||||
int id = -1;
|
||||
int id_slot = -1;
|
||||