mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-30 16:47:31 +03:00
Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
48cda24c11 | ||
|
|
871f1a2d2f | ||
|
|
20197b6fe3 | ||
|
|
ba38f3becc | ||
|
|
37f230dd7c | ||
|
|
a308e584ca | ||
|
|
d0fa2c9fbb | ||
|
|
9bcb4eff4d | ||
|
|
6861f6509a |
@@ -4,7 +4,7 @@
|
||||
|
||||
# Define the CANN base image for easier version updates later
|
||||
ARG CHIP_TYPE=910b
|
||||
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11
|
||||
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11
|
||||
|
||||
# ==============================================================================
|
||||
# BUILD STAGE
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
|
||||
ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10
|
||||
|
||||
FROM ascendai/cann:$ASCEND_VERSION AS build
|
||||
|
||||
|
||||
2
.github/workflows/build-cann.yml
vendored
2
.github/workflows/build-cann.yml
vendored
@@ -63,7 +63,7 @@ jobs:
|
||||
- name: Set container image
|
||||
id: cann-image
|
||||
run: |
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Pull container image
|
||||
|
||||
2
.github/workflows/release.yml
vendored
2
.github/workflows/release.yml
vendored
@@ -907,7 +907,7 @@ jobs:
|
||||
- name: Set container image
|
||||
id: cann-image
|
||||
run: |
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Pull container image
|
||||
|
||||
@@ -1079,7 +1079,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.verbose_prompt = true;
|
||||
}
|
||||
));
|
||||
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
|
||||
add_opt(common_arg(
|
||||
{"--display-prompt"},
|
||||
{"--no-display-prompt"},
|
||||
@@ -2843,6 +2843,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.webui_mcp_proxy = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
|
||||
add_opt(common_arg(
|
||||
{"--tools"}, "TOOL1,TOOL2,...",
|
||||
"experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)\n"
|
||||
"specify \"all\" to enable all tools\n"
|
||||
"available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.server_tools = parse_csv_row(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
|
||||
add_opt(common_arg(
|
||||
{"--webui"},
|
||||
{"--no-webui"},
|
||||
|
||||
@@ -613,6 +613,9 @@ struct common_params {
|
||||
bool endpoint_props = false; // only control POST requests, not GET
|
||||
bool endpoint_metrics = false;
|
||||
|
||||
// enable built-in tools
|
||||
std::vector<std::string> server_tools;
|
||||
|
||||
// router server configs
|
||||
std::string models_dir = ""; // directory containing models for the router server
|
||||
std::string models_preset = ""; // directory containing model presets for the router server
|
||||
|
||||
@@ -42,12 +42,22 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
||||
|
||||
### Ascend NPU
|
||||
|
||||
**Verified devices**
|
||||
You can retrieve your Ascend device IDs using the following command:
|
||||
|
||||
| Ascend NPU | Status |
|
||||
|:-----------------------------:|:-------:|
|
||||
| Atlas 300T A2 | Support |
|
||||
| Atlas 300I Duo | Support |
|
||||
```sh
|
||||
lspci -n | grep -Eo '19e5:d[0-9a-f]{3}' | cut -d: -f2
|
||||
```
|
||||
|
||||
**Devices**
|
||||
|
||||
| Device Id | Product Series | Product Models | Chip Model | Verified Status |
|
||||
|:---------:|----------------|----------------|:----------:|:---------------:|
|
||||
| d803 | Atlas A3 Train | | 910C | |
|
||||
| d803 | Atlas A3 Infer | | 910C | |
|
||||
| d802 | Atlas A2 Train | | 910B | |
|
||||
| d802 | Atlas A2 Infer | Atlas 300I A2 | 910B | Support |
|
||||
| d801 | Atlas Train | | 910 | |
|
||||
| d500 | Atlas Infer | Atlas 300I Duo | 310P | Support |
|
||||
|
||||
*Notes:*
|
||||
|
||||
@@ -57,6 +67,9 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
||||
|
||||
## Model Supports
|
||||
|
||||
<details>
|
||||
<summary>Text-only</summary>
|
||||
|
||||
| Model Name | FP16 | Q4_0 | Q8_0 |
|
||||
|:----------------------------|:-----:|:----:|:----:|
|
||||
| Llama-2 | √ | √ | √ |
|
||||
@@ -118,8 +131,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
||||
| Trillion-7B-preview | √ | √ | √ |
|
||||
| Ling models | √ | √ | √ |
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>Multimodal</summary>
|
||||
|
||||
**Multimodal**
|
||||
| Model Name | FP16 | Q4_0 | Q8_0 |
|
||||
|:----------------------------|:-----:|:----:|:----:|
|
||||
| LLaVA 1.5 models, LLaVA 1.6 models | x | x | x |
|
||||
@@ -134,15 +150,22 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
||||
| GLM-EDGE | √ | √ | √ |
|
||||
| Qwen2-VL | √ | √ | √ |
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
|
||||
## DataType Supports
|
||||
|
||||
| DataType | Status |
|
||||
|:----------------------:|:-------:|
|
||||
| FP16 | Support |
|
||||
| Q8_0 | Support |
|
||||
| Q4_0 | Support |
|
||||
| DataType | 910B | 310P |
|
||||
|:----------------------:|:-------:|:-------:|
|
||||
| FP16 | Support | Support |
|
||||
| Q8_0 | Support | Partial |
|
||||
| Q4_0 | Support | Partial |
|
||||
| BF16 | Support | |
|
||||
|
||||
> **310P note**
|
||||
> - `Q8_0`: data transform / buffer path is implemented, and `GET_ROWS` is supported, but quantized `MUL_MAT` / `MUL_MAT_ID` are not supported.
|
||||
> - `Q4_0`: data transform / buffer path is implemented, but quantized `MUL_MAT` / `MUL_MAT_ID` are not supported.
|
||||
|
||||
## Docker
|
||||
|
||||
@@ -160,7 +183,20 @@ npu-smi info
|
||||
|
||||
# Select the cards that you want to use, make sure these cards are not used by someone.
|
||||
# Following using cards of device0.
|
||||
docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
|
||||
docker run --name llamacpp \
|
||||
--device /dev/davinci0 \
|
||||
--device /dev/davinci_manager \
|
||||
--device /dev/devmm_svm \
|
||||
--device /dev/hisi_hdc \
|
||||
-v /usr/local/dcmi:/usr/local/dcmi \
|
||||
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
||||
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||
-v /PATH_TO_YOUR_MODELS/:/app/models \
|
||||
-it llama-cpp-cann \
|
||||
-m /app/models/MODEL_PATH \
|
||||
-ngl 32 \
|
||||
-p "Building a website can be done in 10 simple steps:"
|
||||
```
|
||||
|
||||
*Notes:*
|
||||
@@ -171,69 +207,57 @@ docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager
|
||||
|
||||
### I. Setup Environment
|
||||
|
||||
1. **Install Ascend Driver and firmware**
|
||||
1. **Configure Ascend user and group**
|
||||
|
||||
```sh
|
||||
# create driver running user.
|
||||
sudo groupadd -g HwHiAiUser
|
||||
sudo groupadd HwHiAiUser
|
||||
sudo useradd -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash
|
||||
sudo usermod -aG HwHiAiUser $USER
|
||||
|
||||
# download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
|
||||
# and install driver.
|
||||
sudo sh Ascend-hdk-910b-npu-driver_x.x.x_linux-{arch}.run --full --install-for-all
|
||||
```
|
||||
|
||||
Once installed, run `npu-smi info` to check whether driver is installed successfully.
|
||||
2. **Install dependencies**
|
||||
|
||||
**Ubuntu/Debian:**
|
||||
```sh
|
||||
+-------------------------------------------------------------------------------------------+
|
||||
| npu-smi 24.1.rc2 Version: 24.1.rc2 |
|
||||
+----------------------+---------------+----------------------------------------------------+
|
||||
| NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page)|
|
||||
| Chip | Bus-Id | AICore(%) Memory-Usage(MB) HBM-Usage(MB) |
|
||||
+======================+===============+====================================================+
|
||||
| 2 xxx | OK | 64.4 51 15 / 15 |
|
||||
| 0 | 0000:01:00.0 | 0 1873 / 15077 0 / 32768 |
|
||||
+======================+===============+====================================================+
|
||||
| 5 xxx | OK | 64.0 52 15 / 15 |
|
||||
| 0 | 0000:81:00.0 | 0 1874 / 15077 0 / 32768 |
|
||||
+======================+===============+====================================================+
|
||||
| No running processes found in NPU 2 |
|
||||
+======================+===============+====================================================+
|
||||
| No running processes found in NPU 5 |
|
||||
+======================+===============+====================================================+
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y gcc python3 python3-pip linux-headers-$(uname -r)
|
||||
```
|
||||
|
||||
2. **Install Ascend Firmware**
|
||||
**RHEL/CentOS:**
|
||||
```sh
|
||||
# download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
|
||||
# and install driver.
|
||||
sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
|
||||
sudo yum makecache
|
||||
sudo yum install -y gcc python3 python3-pip kernel-headers-$(uname -r) kernel-devel-$(uname -r)
|
||||
```
|
||||
If the following message appears, firmware is installed successfully.
|
||||
|
||||
3. **Install CANN (driver + toolkit)**
|
||||
|
||||
> The `Ascend-cann` package includes both the driver and toolkit.
|
||||
> `$ARCH` can be `x86_64` or `aarch64`, `$CHIP` can be `910b` or `310p`.
|
||||
|
||||
```sh
|
||||
Firmware package installed successfully!
|
||||
wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.5.T63/Ascend-cann_8.5.0_linux-$ARCH.run
|
||||
sudo bash ./Ascend-cann_8.5.0_linux-$ARCH.run --install
|
||||
|
||||
wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.5.T63/Ascend-cann-$CHIP-ops_8.5.0_linux-$ARCH.run
|
||||
sudo bash ./Ascend-cann-$CHIP-ops_8.5.0_linux-$ARCH.run --install
|
||||
```
|
||||
|
||||
4. **Verify installation**
|
||||
|
||||
3. **Install CANN toolkit and kernels**
|
||||
|
||||
CANN toolkit and kernels can be obtained from the official [CANN Toolkit](https://www.hiascend.com/zh/developer/download/community/result?module=cann) page.
|
||||
|
||||
Please download the corresponding version that satified your system. The minimum version required is 8.0.RC2.alpha002 and here is the install command.
|
||||
```sh
|
||||
pip3 install attrs numpy decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
|
||||
sh Ascend-cann-toolkit_8.0.RC2.alpha002_linux-aarch64.run --install
|
||||
sh Ascend-cann-kernels-910b_8.0.RC2.alpha002_linux.run --install
|
||||
npu-smi info
|
||||
```
|
||||
|
||||
Set Ascend Variables:
|
||||
If device information is displayed correctly, the driver is functioning properly.
|
||||
|
||||
```sh
|
||||
echo "source ~/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc
|
||||
source ~/.bashrc
|
||||
# Set environment variables (adjust path if needed)
|
||||
source /usr/local/Ascend/cann/set_env.sh
|
||||
|
||||
python3 -c "import acl; print(acl.get_soc_name())"
|
||||
```
|
||||
|
||||
Upon a successful installation, CANN is enabled for the available ascend devices.
|
||||
If the command outputs the chip model, the installation was successful.
|
||||
|
||||
### II. Build llama.cpp
|
||||
|
||||
|
||||
@@ -690,7 +690,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
|
||||
" auto tB = B.slice((int)tgid.x, 0); \n"
|
||||
" \n"
|
||||
" matmul2d< \n"
|
||||
" matmul2d_descriptor(8, 8, dynamic_extent), \n"
|
||||
" matmul2d_descriptor(16, 16, dynamic_extent), \n"
|
||||
" execution_simdgroups<4>> mm; \n"
|
||||
" \n"
|
||||
" auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
|
||||
@@ -740,7 +740,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
|
||||
" auto tB = B.slice((int)tgid.x, 0); \n"
|
||||
" \n"
|
||||
" matmul2d< \n"
|
||||
" matmul2d_descriptor(8, 8, dynamic_extent), \n"
|
||||
" matmul2d_descriptor(16, 16, dynamic_extent), \n"
|
||||
" execution_simdgroups<4>> mm; \n"
|
||||
" \n"
|
||||
" auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
|
||||
|
||||
@@ -589,8 +589,10 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
|
||||
ggml_backend_buffer_t buffer = tensor->buffer;
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
result.buffer = ctx != nullptr ? ctx->remote_ptr : 0;
|
||||
result.data = reinterpret_cast<uint64_t>(tensor->data);
|
||||
} else {
|
||||
result.buffer = 0;
|
||||
result.data = 0;
|
||||
}
|
||||
for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
result.ne[i] = tensor->ne[i];
|
||||
@@ -606,7 +608,6 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
|
||||
}
|
||||
result.view_src = reinterpret_cast<uint64_t>(tensor->view_src);
|
||||
result.view_offs = tensor->view_offs;
|
||||
result.data = reinterpret_cast<uint64_t>(tensor->data);
|
||||
|
||||
// Avoid sending uninitialized data over the wire
|
||||
memset(result.name, 0, sizeof(result.name));
|
||||
@@ -1443,9 +1444,11 @@ ggml_tensor * rpc_server::create_node(uint64_t id,
|
||||
const rpc_tensor * tensor = it_ptr->second;
|
||||
|
||||
struct ggml_tensor * result = deserialize_tensor(ctx, tensor);
|
||||
if (result == nullptr || result->buffer == nullptr) {
|
||||
GGML_LOG_ERROR("[%s] invalid tensor: null %s (id=%" PRIu64 ")\n",
|
||||
__func__, result == nullptr ? "tensor" : "buffer", id);
|
||||
if (result == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
if (result->buffer == nullptr && result->data != nullptr) {
|
||||
GGML_LOG_ERROR("[%s] invalid data ptr", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
tensor_map[id] = result;
|
||||
|
||||
@@ -146,13 +146,19 @@ int main(int argc, char ** argv) {
|
||||
|
||||
ctx = llama_init->context();
|
||||
model = llama_init->model();
|
||||
smpl = llama_init->sampler(0);
|
||||
|
||||
if (ctx == NULL) {
|
||||
LOG_ERR("%s: error: unable to create context\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: error: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
smpl = llama_init->sampler(0);
|
||||
|
||||
llama_memory_t mem = llama_get_memory(ctx);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
@@ -695,7 +701,7 @@ int main(int argc, char ** argv) {
|
||||
if (!common_prompt_batch_decode(ctx, embd, n_past, params.n_batch, path_session, save_now)) {
|
||||
return 1;
|
||||
}
|
||||
session_tokens.insert(session_tokens.end(), embd.begin(), embd.begin());
|
||||
session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
|
||||
n_session_consumed = session_tokens.size();
|
||||
session_do_save = false;
|
||||
|
||||
|
||||
@@ -1377,6 +1377,16 @@ struct clip_model_loader {
|
||||
|
||||
// sanity check
|
||||
{
|
||||
if (hparams.image_size < 0) {
|
||||
// note: some models having hparams.image_size == 0, which means the image size is dynamic
|
||||
throw std::runtime_error(string_format("%s: image_size (%d) cannot be negative\n", __func__, hparams.image_size));
|
||||
}
|
||||
if (hparams.patch_size <= 0) {
|
||||
throw std::runtime_error(string_format("%s: patch_size (%d) must be greater than 0\n", __func__, hparams.patch_size));
|
||||
}
|
||||
if (hparams.n_embd <= 0) {
|
||||
throw std::runtime_error(string_format("%s: n_embd (%d) must be greater than 0\n", __func__, hparams.n_embd));
|
||||
}
|
||||
if (hparams.image_max_pixels < hparams.image_min_pixels) {
|
||||
throw std::runtime_error(string_format("%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels));
|
||||
}
|
||||
|
||||
@@ -13,23 +13,20 @@
|
||||
|
||||
constexpr bool DEBUG = false;
|
||||
|
||||
void mtmd_audio_cache::fill_sin_cos_table(int n) {
|
||||
void mtmd_audio_cache::fill_sin_cos_table(uint32_t n) {
|
||||
sin_vals.resize(n);
|
||||
cos_vals.resize(n);
|
||||
for (int i = 0; i < n; i++) {
|
||||
for (uint32_t i = 0; i < n; i++) {
|
||||
double theta = (2 * M_PI * i) / n;
|
||||
sin_vals[i] = sinf(theta);
|
||||
cos_vals[i] = cosf(theta);
|
||||
}
|
||||
}
|
||||
|
||||
void mtmd_audio_cache::fill_hann_window(int length, bool periodic) {
|
||||
void mtmd_audio_cache::fill_hann_window(uint32_t length, bool periodic) {
|
||||
hann_window.resize(length);
|
||||
int offset = -1;
|
||||
if (periodic) {
|
||||
offset = 0;
|
||||
}
|
||||
for (int i = 0; i < length; i++) {
|
||||
int offset = periodic ? 0 : -1;
|
||||
for (uint32_t i = 0; i < length; i++) {
|
||||
hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
|
||||
}
|
||||
}
|
||||
@@ -165,6 +162,7 @@ static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, fl
|
||||
// false = input is complex-valued (interleaved real/imag, stride 2)
|
||||
template <bool Inverse, bool RealInput>
|
||||
static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) {
|
||||
GGML_ASSERT(N > 0);
|
||||
const int n_sin_cos_vals = cache.sin_vals.size();
|
||||
|
||||
if (N == 1) {
|
||||
@@ -407,6 +405,8 @@ static bool log_mel_spectrogram(
|
||||
}
|
||||
|
||||
|
||||
GGML_ASSERT(params.n_fft_bins > 0);
|
||||
GGML_ASSERT(params.hop_length > 0);
|
||||
out.n_mel = params.n_mel;
|
||||
out.n_len = (n_samples - frame_size) / frame_step + 1;
|
||||
// TODO: handle these checks better
|
||||
@@ -438,6 +438,7 @@ static bool log_mel_spectrogram(
|
||||
|
||||
const int effective_n_len = n_samples_in / frame_step;
|
||||
if (params.norm_per_feature) {
|
||||
GGML_ASSERT(effective_n_len > 1);
|
||||
for (int i = 0; i < out.n_mel; i++) {
|
||||
double mean = 0;
|
||||
for (int j = 0; j < effective_n_len; ++j) {
|
||||
@@ -639,6 +640,7 @@ mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length
|
||||
padding_to_remove((n_fft - hop_length) / 2),
|
||||
ifft_in(n_fft * 2 * 4, 0.0f), // extra space for recursive IFFT
|
||||
ifft_out(n_fft * 2 * 4, 0.0f) {
|
||||
GGML_ASSERT(n_fft > 0 && hop_length > 0 && hop_length <= n_fft);
|
||||
cache.fill_sin_cos_table(n_fft);
|
||||
cache.fill_hann_window(n_fft, true);
|
||||
}
|
||||
|
||||
@@ -33,9 +33,9 @@ struct mtmd_audio_cache {
|
||||
|
||||
mtmd_audio_mel_filters filters;
|
||||
|
||||
void fill_sin_cos_table(int n);
|
||||
void fill_sin_cos_table(uint32_t n);
|
||||
|
||||
void fill_hann_window(int length, bool periodic);
|
||||
void fill_hann_window(uint32_t length, bool periodic);
|
||||
|
||||
// Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
|
||||
// n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
|
||||
|
||||
@@ -127,6 +127,7 @@ struct decode_embd_batch {
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
|
||||
GGML_ASSERT(n_tokens > 0 && n_pos_per_embd > 0 && n_mmproj_embd > 0);
|
||||
pos .resize(n_tokens * n_pos_per_embd);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
@@ -157,6 +158,7 @@ struct decode_embd_batch {
|
||||
// M-RoPE for image
|
||||
void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
|
||||
GGML_ASSERT(n_pos_per_embd == 4);
|
||||
GGML_ASSERT(nx > 0 && ny > 0 && nx * ny == batch.n_tokens);
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int y = 0; y < ny; y++) {
|
||||
for (int x = 0; x < nx; x++) {
|
||||
@@ -192,6 +194,7 @@ struct decode_embd_batch {
|
||||
}
|
||||
|
||||
llama_batch get_view(int offset, int n_tokens) {
|
||||
GGML_ASSERT(offset >= 0 && n_tokens > 0 && offset + n_tokens <= batch.n_tokens);
|
||||
llama_pos * pos_ptr;
|
||||
pos_view.clear();
|
||||
pos_view.reserve(n_tokens * n_pos_per_embd);
|
||||
@@ -235,6 +238,7 @@ int32_t mtmd_helper_decode_image_chunk(
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
llama_pos * new_n_past) {
|
||||
GGML_ASSERT(n_batch > 0);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
@@ -312,6 +316,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
||||
int32_t n_batch,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past) {
|
||||
GGML_ASSERT(n_batch > 0);
|
||||
int32_t ret;
|
||||
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
@@ -508,6 +513,11 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
|
||||
fseek(f, 0, SEEK_END);
|
||||
long file_size = ftell(f);
|
||||
fseek(f, 0, SEEK_SET);
|
||||
if (file_size < 0) {
|
||||
LOG_ERR("Failed to get file size of %s\n", fname);
|
||||
fclose(f);
|
||||
return nullptr;
|
||||
}
|
||||
buf.resize(file_size);
|
||||
|
||||
size_t n_read = fread(buf.data(), 1, file_size, f);
|
||||
|
||||
@@ -99,6 +99,8 @@ struct img_tool {
|
||||
}
|
||||
|
||||
static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
|
||||
GGML_ASSERT(x >= 0 && y >= 0 && w > 0 && h > 0);
|
||||
GGML_ASSERT(x + w <= image.nx && y + h <= image.ny);
|
||||
dst.nx = w;
|
||||
dst.ny = h;
|
||||
dst.buf.resize(3 * w * h);
|
||||
@@ -196,6 +198,7 @@ struct img_tool {
|
||||
private:
|
||||
// Bilinear resize function
|
||||
static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
|
||||
GGML_ASSERT(src.nx >= 2 && src.ny >= 2);
|
||||
dst.nx = target_width;
|
||||
dst.ny = target_height;
|
||||
dst.buf.resize(3 * target_width * target_height);
|
||||
@@ -207,8 +210,8 @@ private:
|
||||
for (int x = 0; x < target_width; x++) {
|
||||
float px = x_ratio * x;
|
||||
float py = y_ratio * y;
|
||||
int x_floor = static_cast<int>(px);
|
||||
int y_floor = static_cast<int>(py);
|
||||
int x_floor = std::min(static_cast<int>(px), src.nx - 2);
|
||||
int y_floor = std::min(static_cast<int>(py), src.ny - 2);
|
||||
float x_lerp = px - x_floor;
|
||||
float y_lerp = py - y_floor;
|
||||
|
||||
@@ -347,6 +350,7 @@ private:
|
||||
// Returns: kernel size (ksize) - number of input pixels that contribute to each output pixel
|
||||
auto precompute_weights = [&](int inSize, int outSize,
|
||||
std::vector<int> & bounds, std::vector<int32_t> & weights) -> int {
|
||||
GGML_ASSERT(inSize > 0 && outSize > 0);
|
||||
double support, scale, filterscale;
|
||||
double center, ww, ss;
|
||||
int xx, x, ksize, xmin, xmax, xcnt;
|
||||
|
||||
@@ -641,6 +641,11 @@ struct mtmd_tokenizer {
|
||||
add_text(ctx->img_beg, true); // add image begin token
|
||||
}
|
||||
|
||||
// sanity check
|
||||
GGML_ASSERT(bitmap->nx > 0 && bitmap->ny > 0);
|
||||
GGML_ASSERT(bitmap->data.size() == (size_t)bitmap->nx * bitmap->ny * 3);
|
||||
GGML_ASSERT(ctx->image_preproc != nullptr);
|
||||
|
||||
// convert mtmd_bitmap to clip_image_u8
|
||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
||||
img_u8->nx = bitmap->nx;
|
||||
@@ -649,7 +654,6 @@ struct mtmd_tokenizer {
|
||||
std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
|
||||
|
||||
// preprocess image
|
||||
GGML_ASSERT(ctx->image_preproc != nullptr);
|
||||
clip_image_f32_batch batch_f32;
|
||||
bool ok = ctx->image_preproc->preprocess(*img_u8, batch_f32);
|
||||
if (!ok) {
|
||||
@@ -773,6 +777,11 @@ struct mtmd_tokenizer {
|
||||
add_text(ctx->aud_beg, true); // add audio begin token
|
||||
}
|
||||
|
||||
// sanity check
|
||||
GGML_ASSERT(ctx->audio_preproc != nullptr);
|
||||
GGML_ASSERT(bitmap->data.size() > sizeof(float));
|
||||
GGML_ASSERT(bitmap->data.size() % sizeof(float) == 0);
|
||||
|
||||
// preprocess audio
|
||||
std::vector<mtmd_audio_mel> mel_spec_chunks;
|
||||
const float * samples = (const float *)bitmap->data.data();
|
||||
|
||||
@@ -13,6 +13,8 @@ add_library(${TARGET} STATIC
|
||||
server-common.h
|
||||
server-context.cpp
|
||||
server-context.h
|
||||
server-tools.cpp
|
||||
server-tools.h
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
|
||||
@@ -125,6 +125,61 @@ The framework automatically starts a `llama-server` instance, sends requests, an
|
||||
|
||||
For detailed instructions, see the [test documentation](./tests/README.md).
|
||||
|
||||
### API for tools
|
||||
|
||||
This endpoint is intended to be used internally by the Web UI and subject to change or to be removed in the future.
|
||||
|
||||
**GET /tools**
|
||||
|
||||
Get a list of tools, each tool has these fields:
|
||||
- `tool` (string): the ID name of the tool, to be used in POST call. Example: `read_file`
|
||||
- `display_name` (string): the name to be displayed on UI. Example: `Read file`
|
||||
- `type` (string): always be `"builtin"` for now
|
||||
- `permissions` (object): a mapping string --> boolean that indicates the permission required by this tool. This is useful for the UI to ask the user before calling the tool. For now, the only permission supported is `"write"`
|
||||
- `definition` (object): the OAI-compat definition of this tool
|
||||
|
||||
**POST /tools**
|
||||
|
||||
Invoke a tool call, request body is a JSON object with:
|
||||
- `tool` (string): the name of the tool
|
||||
- `params` (object): a mapping from argument name (string) to argument value
|
||||
|
||||
Returns JSON object. There are two response formats:
|
||||
|
||||
Format 1: Plain text. The text will be placed into a field called `plain_text_response`, example:
|
||||
|
||||
```json
|
||||
{
|
||||
"plain_text_response": "this is a text response"
|
||||
}
|
||||
```
|
||||
|
||||
The client should extract this value and place it inside message content (note: content is no longer a JSON), example
|
||||
|
||||
```json
|
||||
{
|
||||
"role": "tool",
|
||||
"content": "this is a text response"
|
||||
}
|
||||
```
|
||||
|
||||
Format 2: Normal JSON response, example:
|
||||
|
||||
```json
|
||||
{
|
||||
"error": "cannot open this file"
|
||||
}
|
||||
```
|
||||
|
||||
That requires `JSON.stringify` when formatted to message content:
|
||||
|
||||
```json
|
||||
{
|
||||
"role": "tool",
|
||||
"content": "{\"error\":\"cannot open this file\"}"
|
||||
}
|
||||
```
|
||||
|
||||
### Notable Related PRs
|
||||
|
||||
- Initial server implementation: https://github.com/ggml-org/llama.cpp/pull/1443
|
||||
|
||||
@@ -36,7 +36,6 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `--license` | show source code license and dependencies |
|
||||
| `-cl, --cache-list` | show list of models in cache |
|
||||
| `--completion-bash` | print source-able bash completion script for llama.cpp |
|
||||
| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
|
||||
| `-t, --threads N` | number of CPU threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
|
||||
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
|
||||
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
|
||||
@@ -194,6 +193,7 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `--webui-config JSON` | JSON that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG) |
|
||||
| `--webui-config-file PATH` | JSON file that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG_FILE) |
|
||||
| `--webui-mcp-proxy, --no-webui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)<br/>(env: LLAMA_ARG_WEBUI_MCP_PROXY) |
|
||||
| `--tools TOOL1,TOOL2,...` | experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)<br/>specify "all" to enable all tools<br/>available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff<br/>(env: LLAMA_ARG_TOOLS) |
|
||||
| `--webui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_WEBUI) |
|
||||
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
|
||||
| `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
|
||||
@@ -293,6 +293,12 @@ It is currently available in the following endpoints:
|
||||
|
||||
For more details, please refer to [multimodal documentation](../../docs/multimodal.md)
|
||||
|
||||
### Built-in tools support
|
||||
|
||||
The server includes a set of built-in tools that enable the LLM to access the local file system directly from the Web UI.
|
||||
|
||||
To use this feature, start the server with `--tools all`. You can also enable only specific tools by passing a comma-separated list: `--tools name1,name2,...`. Run `--help` for the full list of available tool names.
|
||||
|
||||
## Build
|
||||
|
||||
`llama-server` is built alongside everything else from the root of the project
|
||||
@@ -1438,6 +1444,14 @@ curl http://localhost:8080/v1/messages/count_tokens \
|
||||
{"input_tokens": 10}
|
||||
```
|
||||
|
||||
## Server built-in tools
|
||||
|
||||
The server exposes a REST API under `/tools` that allows the Web UI to call built-in tools. This endpoint is intended to be used internally by the Web UI and subject to change or to be removed in the future.
|
||||
|
||||
**Please do NOT use this endpoint in a downstream application**
|
||||
|
||||
For further documentation about this endpoint, please refer to [server internal documentation](./README-dev.md)
|
||||
|
||||
## Using multiple models
|
||||
|
||||
`llama-server` can be launched in a **router mode** that exposes an API for dynamically loading and unloading models. The main process (the "router") automatically forwards each request to the appropriate model instance.
|
||||
|
||||
Binary file not shown.
800
tools/server/server-tools.cpp
Normal file
800
tools/server/server-tools.cpp
Normal file
@@ -0,0 +1,800 @@
|
||||
#include "server-tools.h"
|
||||
|
||||
#include <sheredom/subprocess.h>
|
||||
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <regex>
|
||||
#include <thread>
|
||||
#include <chrono>
|
||||
#include <atomic>
|
||||
#include <cstring>
|
||||
#include <climits>
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
//
|
||||
// internal helpers
|
||||
//
|
||||
|
||||
static std::vector<char *> to_cstr_vec(const std::vector<std::string> & v) {
|
||||
std::vector<char *> r;
|
||||
r.reserve(v.size() + 1);
|
||||
for (const auto & s : v) {
|
||||
r.push_back(const_cast<char *>(s.c_str()));
|
||||
}
|
||||
r.push_back(nullptr);
|
||||
return r;
|
||||
}
|
||||
|
||||
struct run_proc_result {
|
||||
std::string output;
|
||||
int exit_code = -1;
|
||||
bool timed_out = false;
|
||||
};
|
||||
|
||||
static run_proc_result run_process(
|
||||
const std::vector<std::string> & args,
|
||||
size_t max_output,
|
||||
int timeout_secs) {
|
||||
run_proc_result res;
|
||||
|
||||
subprocess_s proc;
|
||||
auto argv = to_cstr_vec(args);
|
||||
|
||||
int options = subprocess_option_no_window
|
||||
| subprocess_option_combined_stdout_stderr
|
||||
| subprocess_option_inherit_environment
|
||||
| subprocess_option_search_user_path;
|
||||
|
||||
if (subprocess_create(argv.data(), options, &proc) != 0) {
|
||||
res.output = "failed to spawn process";
|
||||
return res;
|
||||
}
|
||||
|
||||
std::atomic<bool> done{false};
|
||||
std::atomic<bool> timed_out{false};
|
||||
|
||||
std::thread timeout_thread([&]() {
|
||||
auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(timeout_secs);
|
||||
while (!done.load()) {
|
||||
if (std::chrono::steady_clock::now() >= deadline) {
|
||||
timed_out.store(true);
|
||||
subprocess_terminate(&proc);
|
||||
return;
|
||||
}
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
}
|
||||
});
|
||||
|
||||
FILE * f = subprocess_stdout(&proc);
|
||||
std::string output;
|
||||
bool truncated = false;
|
||||
if (f) {
|
||||
char buf[4096];
|
||||
while (fgets(buf, sizeof(buf), f) != nullptr) {
|
||||
if (!truncated) {
|
||||
size_t len = strlen(buf);
|
||||
if (output.size() + len <= max_output) {
|
||||
output.append(buf, len);
|
||||
} else {
|
||||
output.append(buf, max_output - output.size());
|
||||
truncated = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
done.store(true);
|
||||
if (timeout_thread.joinable()) {
|
||||
timeout_thread.join();
|
||||
}
|
||||
|
||||
subprocess_join(&proc, &res.exit_code);
|
||||
subprocess_destroy(&proc);
|
||||
|
||||
res.output = output;
|
||||
res.timed_out = timed_out.load();
|
||||
if (truncated) {
|
||||
res.output += "\n[output truncated]";
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
// simple glob: * matches non-/ chars, ** matches anything including /
|
||||
static bool glob_match(const char * pattern, const char * str) {
|
||||
if (*pattern == '\0') {
|
||||
return *str == '\0';
|
||||
}
|
||||
if (pattern[0] == '*' && pattern[1] == '*') {
|
||||
const char * p = pattern + 2;
|
||||
if (*p == '/') p++;
|
||||
if (glob_match(p, str)) return true;
|
||||
if (*str != '\0') return glob_match(pattern, str + 1);
|
||||
return false;
|
||||
}
|
||||
if (*pattern == '*') {
|
||||
const char * p = pattern + 1;
|
||||
for (; *str != '\0' && *str != '/'; str++) {
|
||||
if (glob_match(p, str)) return true;
|
||||
}
|
||||
return glob_match(p, str);
|
||||
}
|
||||
if (*pattern == '?' && *str != '\0' && *str != '/') {
|
||||
return glob_match(pattern + 1, str + 1);
|
||||
}
|
||||
if (*pattern == *str) {
|
||||
return glob_match(pattern + 1, str + 1);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool glob_match(const std::string & pattern, const std::string & str) {
|
||||
return glob_match(pattern.c_str(), str.c_str());
|
||||
}
|
||||
|
||||
json server_tool::to_json() {
|
||||
return {
|
||||
{"display_name", display_name},
|
||||
{"tool", name},
|
||||
{"type", "builtin"},
|
||||
{"permissions", json{
|
||||
{"write", permission_write}
|
||||
}},
|
||||
{"definition", get_definition()},
|
||||
};
|
||||
}
|
||||
|
||||
//
|
||||
// read_file: read a file with optional line range and line-number prefix
|
||||
//
|
||||
|
||||
static constexpr size_t SERVER_TOOL_READ_FILE_MAX_SIZE = 16 * 1024; // 16 KB
|
||||
|
||||
struct server_tool_read_file : server_tool {
|
||||
server_tool_read_file() {
|
||||
name = "read_file";
|
||||
display_name = "Read file";
|
||||
permission_write = false;
|
||||
}
|
||||
|
||||
json get_definition() override {
|
||||
return {
|
||||
{"type", "function"},
|
||||
{"function", {
|
||||
{"name", name},
|
||||
{"description", "Read the contents of a file. Optionally specify a 1-based line range. "
|
||||
"If append_loc is true, each line is prefixed with its line number (e.g. \"1\u2192 ...\")."},
|
||||
{"parameters", {
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"path", {{"type", "string"}, {"description", "Path to the file"}}},
|
||||
{"start_line", {{"type", "integer"}, {"description", "First line to read, 1-based (default: 1)"}}},
|
||||
{"end_line", {{"type", "integer"}, {"description", "Last line to read, 1-based inclusive (default: end of file)"}}},
|
||||
{"append_loc", {{"type", "boolean"}, {"description", "Prefix each line with its line number"}}},
|
||||
}},
|
||||
{"required", json::array({"path"})},
|
||||
}},
|
||||
}},
|
||||
};
|
||||
}
|
||||
|
||||
json invoke(json params) override {
|
||||
std::string path = params.at("path").get<std::string>();
|
||||
int start_line = json_value(params, "start_line", 1);
|
||||
int end_line = json_value(params, "end_line", -1); // -1 = no limit
|
||||
bool append_loc = json_value(params, "append_loc", false);
|
||||
|
||||
std::error_code ec;
|
||||
uintmax_t file_size = fs::file_size(path, ec);
|
||||
if (ec) {
|
||||
return {{"error", "cannot stat file: " + ec.message()}};
|
||||
}
|
||||
if (file_size > SERVER_TOOL_READ_FILE_MAX_SIZE && end_line == -1) {
|
||||
return {{"error", string_format(
|
||||
"file too large (%zu bytes, max %zu). Use start_line/end_line to read a portion.",
|
||||
(size_t)file_size, SERVER_TOOL_READ_FILE_MAX_SIZE)}};
|
||||
}
|
||||
|
||||
std::ifstream f(path);
|
||||
if (!f) {
|
||||
return {{"error", "failed to open file: " + path}};
|
||||
}
|
||||
|
||||
std::string result;
|
||||
std::string line;
|
||||
int lineno = 0;
|
||||
|
||||
while (std::getline(f, line)) {
|
||||
lineno++;
|
||||
if (lineno < start_line) continue;
|
||||
if (end_line != -1 && lineno > end_line) break;
|
||||
|
||||
std::string out_line;
|
||||
if (append_loc) {
|
||||
out_line = std::to_string(lineno) + "\u2192 " + line + "\n";
|
||||
} else {
|
||||
out_line = line + "\n";
|
||||
}
|
||||
|
||||
if (result.size() + out_line.size() > SERVER_TOOL_READ_FILE_MAX_SIZE) {
|
||||
result += "[output truncated]";
|
||||
break;
|
||||
}
|
||||
result += out_line;
|
||||
}
|
||||
|
||||
return {{"plain_text_response", result}};
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// file_glob_search: find files matching a glob pattern under a base directory
|
||||
//
|
||||
|
||||
static constexpr size_t SERVER_TOOL_FILE_SEARCH_MAX_RESULTS = 100;
|
||||
|
||||
struct server_tool_file_glob_search : server_tool {
|
||||
server_tool_file_glob_search() {
|
||||
name = "file_glob_search";
|
||||
display_name = "File search";
|
||||
permission_write = false;
|
||||
}
|
||||
|
||||
json get_definition() override {
|
||||
return {
|
||||
{"type", "function"},
|
||||
{"function", {
|
||||
{"name", name},
|
||||
{"description", "Recursively search for files matching a glob pattern under a directory."},
|
||||
{"parameters", {
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"path", {{"type", "string"}, {"description", "Base directory to search in"}}},
|
||||
{"include", {{"type", "string"}, {"description", "Glob pattern for files to include (e.g. \"**/*.cpp\"). Default: **"}}},
|
||||
{"exclude", {{"type", "string"}, {"description", "Glob pattern for files to exclude"}}},
|
||||
}},
|
||||
{"required", json::array({"path"})},
|
||||
}},
|
||||
}},
|
||||
};
|
||||
}
|
||||
|
||||
json invoke(json params) override {
|
||||
std::string base = params.at("path").get<std::string>();
|
||||
std::string include = json_value(params, "include", std::string("**"));
|
||||
std::string exclude = json_value(params, "exclude", std::string(""));
|
||||
|
||||
std::ostringstream output_text;
|
||||
size_t count = 0;
|
||||
|
||||
std::error_code ec;
|
||||
for (const auto & entry : fs::recursive_directory_iterator(base,
|
||||
fs::directory_options::skip_permission_denied, ec)) {
|
||||
if (!entry.is_regular_file()) continue;
|
||||
|
||||
std::string rel = fs::relative(entry.path(), base, ec).string();
|
||||
if (ec) continue;
|
||||
std::replace(rel.begin(), rel.end(), '\\', '/');
|
||||
|
||||
if (!glob_match(include, rel)) continue;
|
||||
if (!exclude.empty() && glob_match(exclude, rel)) continue;
|
||||
|
||||
output_text << entry.path().string() << "\n";
|
||||
if (++count >= SERVER_TOOL_FILE_SEARCH_MAX_RESULTS) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
output_text << "\n---\nTotal matches: " << count << "\n";
|
||||
|
||||
return {{"plain_text_response", output_text.str()}};
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// grep_search: search for a regex pattern in files
|
||||
//
|
||||
|
||||
static constexpr size_t SERVER_TOOL_GREP_SEARCH_MAX_RESULTS = 100;
|
||||
|
||||
struct server_tool_grep_search : server_tool {
|
||||
server_tool_grep_search() {
|
||||
name = "grep_search";
|
||||
display_name = "Grep search";
|
||||
permission_write = false;
|
||||
}
|
||||
|
||||
json get_definition() override {
|
||||
return {
|
||||
{"type", "function"},
|
||||
{"function", {
|
||||
{"name", name},
|
||||
{"description", "Search for a regex pattern in files under a path. Returns matching lines."},
|
||||
{"parameters", {
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"path", {{"type", "string"}, {"description", "File or directory to search in"}}},
|
||||
{"pattern", {{"type", "string"}, {"description", "Regular expression pattern to search for"}}},
|
||||
{"include", {{"type", "string"}, {"description", "Glob pattern to filter files (default: **)"}}},
|
||||
{"exclude", {{"type", "string"}, {"description", "Glob pattern to exclude files"}}},
|
||||
{"return_line_numbers", {{"type", "boolean"}, {"description", "If true, include line numbers in results"}}},
|
||||
}},
|
||||
{"required", json::array({"path", "pattern"})},
|
||||
}},
|
||||
}},
|
||||
};
|
||||
}
|
||||
|
||||
json invoke(json params) override {
|
||||
std::string path = params.at("path").get<std::string>();
|
||||
std::string pat_str = params.at("pattern").get<std::string>();
|
||||
std::string include = json_value(params, "include", std::string("**"));
|
||||
std::string exclude = json_value(params, "exclude", std::string(""));
|
||||
bool show_lineno = json_value(params, "return_line_numbers", false);
|
||||
|
||||
std::regex pattern;
|
||||
try {
|
||||
pattern = std::regex(pat_str);
|
||||
} catch (const std::regex_error & e) {
|
||||
return {{"error", std::string("invalid regex: ") + e.what()}};
|
||||
}
|
||||
|
||||
std::ostringstream output_text;
|
||||
size_t total = 0;
|
||||
|
||||
auto search_file = [&](const fs::path & fpath) {
|
||||
std::ifstream f(fpath);
|
||||
if (!f) return;
|
||||
std::string line;
|
||||
int lineno = 0;
|
||||
while (std::getline(f, line) && total < SERVER_TOOL_GREP_SEARCH_MAX_RESULTS) {
|
||||
lineno++;
|
||||
if (std::regex_search(line, pattern)) {
|
||||
output_text << fpath.string() << ":";
|
||||
if (show_lineno) {
|
||||
output_text << lineno << ":";
|
||||
}
|
||||
output_text << line << "\n";
|
||||
total++;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
std::error_code ec;
|
||||
if (fs::is_regular_file(path, ec)) {
|
||||
search_file(path);
|
||||
} else if (fs::is_directory(path, ec)) {
|
||||
for (const auto & entry : fs::recursive_directory_iterator(path,
|
||||
fs::directory_options::skip_permission_denied, ec)) {
|
||||
if (!entry.is_regular_file()) continue;
|
||||
if (total >= SERVER_TOOL_GREP_SEARCH_MAX_RESULTS) break;
|
||||
|
||||
std::string rel = fs::relative(entry.path(), path, ec).string();
|
||||
if (ec) continue;
|
||||
std::replace(rel.begin(), rel.end(), '\\', '/');
|
||||
|
||||
if (!glob_match(include, rel)) continue;
|
||||
if (!exclude.empty() && glob_match(exclude, rel)) continue;
|
||||
|
||||
search_file(entry.path());
|
||||
}
|
||||
} else {
|
||||
return {{"error", "path does not exist: " + path}};
|
||||
}
|
||||
|
||||
output_text << "\n\n---\nTotal matches: " << total << "\n";
|
||||
|
||||
return {{"plain_text_response", output_text.str()}};
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// exec_shell_command: run an arbitrary shell command
|
||||
//
|
||||
|
||||
static constexpr size_t SERVER_TOOL_EXEC_SHELL_COMMAND_MAX_OUTPUT_SIZE = 16 * 1024; // 16 KB
|
||||
static constexpr int SERVER_TOOL_EXEC_SHELL_COMMAND_MAX_TIMEOUT = 60; // seconds
|
||||
|
||||
struct server_tool_exec_shell_command : server_tool {
|
||||
server_tool_exec_shell_command() {
|
||||
name = "exec_shell_command";
|
||||
display_name = "Execute shell command";
|
||||
permission_write = true;
|
||||
}
|
||||
|
||||
json get_definition() override {
|
||||
return {
|
||||
{"type", "function"},
|
||||
{"function", {
|
||||
{"name", name},
|
||||
{"description", "Execute a shell command and return its output (stdout and stderr combined)."},
|
||||
{"parameters", {
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"command", {{"type", "string"}, {"description", "Shell command to execute"}}},
|
||||
{"timeout", {{"type", "integer"}, {"description", string_format("Timeout in seconds (default 10, max %d)", SERVER_TOOL_EXEC_SHELL_COMMAND_MAX_TIMEOUT)}}},
|
||||
{"max_output_size", {{"type", "integer"}, {"description", string_format("Maximum output size in bytes (default %zu)", SERVER_TOOL_EXEC_SHELL_COMMAND_MAX_OUTPUT_SIZE)}}},
|
||||
}},
|
||||
{"required", json::array({"command"})},
|
||||
}},
|
||||
}},
|
||||
};
|
||||
}
|
||||
|
||||
json invoke(json params) override {
|
||||
std::string command = params.at("command").get<std::string>();
|
||||
int timeout = json_value(params, "timeout", 10);
|
||||
size_t max_output = (size_t) json_value(params, "max_output_size", (int) SERVER_TOOL_EXEC_SHELL_COMMAND_MAX_OUTPUT_SIZE);
|
||||
|
||||
timeout = std::min(timeout, SERVER_TOOL_EXEC_SHELL_COMMAND_MAX_TIMEOUT);
|
||||
max_output = std::min(max_output, SERVER_TOOL_EXEC_SHELL_COMMAND_MAX_OUTPUT_SIZE);
|
||||
|
||||
#ifdef _WIN32
|
||||
std::vector<std::string> args = {"cmd", "/c", command};
|
||||
#else
|
||||
std::vector<std::string> args = {"sh", "-c", command};
|
||||
#endif
|
||||
|
||||
auto res = run_process(args, max_output, timeout);
|
||||
|
||||
std::string text_output = res.output;
|
||||
text_output += string_format("\n[exit code: %d]", res.exit_code);
|
||||
if (res.timed_out) {
|
||||
text_output += " [exit due to timed out]";
|
||||
}
|
||||
|
||||
return {{"plain_text_response", text_output}};
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// write_file: create or overwrite a file
|
||||
//
|
||||
|
||||
struct server_tool_write_file : server_tool {
|
||||
server_tool_write_file() {
|
||||
name = "write_file";
|
||||
display_name = "Write file";
|
||||
permission_write = true;
|
||||
}
|
||||
|
||||
json get_definition() override {
|
||||
return {
|
||||
{"type", "function"},
|
||||
{"function", {
|
||||
{"name", name},
|
||||
{"description", "Write content to a file, creating it (including parent directories) if it does not exist. May use with edit_file for more complex edits."},
|
||||
{"parameters", {
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"path", {{"type", "string"}, {"description", "Path of the file to write"}}},
|
||||
{"content", {{"type", "string"}, {"description", "Content to write"}}},
|
||||
}},
|
||||
{"required", json::array({"path", "content"})},
|
||||
}},
|
||||
}},
|
||||
};
|
||||
}
|
||||
|
||||
json invoke(json params) override {
|
||||
std::string path = params.at("path").get<std::string>();
|
||||
std::string content = params.at("content").get<std::string>();
|
||||
|
||||
std::error_code ec;
|
||||
fs::path fpath(path);
|
||||
if (fpath.has_parent_path()) {
|
||||
fs::create_directories(fpath.parent_path(), ec);
|
||||
if (ec) {
|
||||
return {{"error", "failed to create directories: " + ec.message()}};
|
||||
}
|
||||
}
|
||||
|
||||
std::ofstream f(path, std::ios::binary);
|
||||
if (!f) {
|
||||
return {{"error", "failed to open file for writing: " + path}};
|
||||
}
|
||||
f << content;
|
||||
if (!f) {
|
||||
return {{"error", "failed to write file: " + path}};
|
||||
}
|
||||
|
||||
return {{"result", "file written successfully"}, {"path", path}, {"bytes", content.size()}};
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// edit_file: edit file content via line-based changes
|
||||
//
|
||||
|
||||
struct server_tool_edit_file : server_tool {
|
||||
server_tool_edit_file() {
|
||||
name = "edit_file";
|
||||
display_name = "Edit file";
|
||||
permission_write = true;
|
||||
}
|
||||
|
||||
json get_definition() override {
|
||||
return {
|
||||
{"type", "function"},
|
||||
{"function", {
|
||||
{"name", name},
|
||||
{"description",
|
||||
"Edit a file by applying a list of line-based changes. "
|
||||
"Each change targets a 1-based inclusive line range and has a mode: "
|
||||
"\"replace\" (replace lines with content), "
|
||||
"\"delete\" (remove lines, content must be empty string), "
|
||||
"\"append\" (insert content after line_end). "
|
||||
"Set line_start to -1 to target the end of file (line_end is ignored in that case). "
|
||||
"Changes must not overlap. They are applied in reverse line order automatically."},
|
||||
{"parameters", {
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"path", {{"type", "string"}, {"description", "Path to the file to edit"}}},
|
||||
{"changes", {
|
||||
{"type", "array"},
|
||||
{"description", "List of changes to apply"},
|
||||
{"items", {
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"mode", {{"type", "string"}, {"description", "\"replace\", \"delete\", or \"append\""}}},
|
||||
{"line_start", {{"type", "integer"}, {"description", "First line of the range (1-based); use -1 for end of file"}}},
|
||||
{"line_end", {{"type", "integer"}, {"description", "Last line of the range (1-based, inclusive); ignored when line_start is -1"}}},
|
||||
{"content", {{"type", "string"}, {"description", "Content to insert; must be empty string for delete mode"}}},
|
||||
}},
|
||||
{"required", json::array({"mode", "line_start", "line_end", "content"})},
|
||||
}},
|
||||
}},
|
||||
}},
|
||||
{"required", json::array({"path", "changes"})},
|
||||
}},
|
||||
}},
|
||||
};
|
||||
}
|
||||
|
||||
json invoke(json params) override {
|
||||
std::string path = params.at("path").get<std::string>();
|
||||
const json & changes = params.at("changes");
|
||||
|
||||
if (!changes.is_array()) {
|
||||
return {{"error", "\"changes\" must be an array"}};
|
||||
}
|
||||
|
||||
// read file into lines
|
||||
std::ifstream fin(path);
|
||||
if (!fin) {
|
||||
return {{"error", "failed to open file: " + path}};
|
||||
}
|
||||
std::vector<std::string> lines;
|
||||
{
|
||||
std::string line;
|
||||
while (std::getline(fin, line)) {
|
||||
lines.push_back(line);
|
||||
}
|
||||
}
|
||||
fin.close();
|
||||
|
||||
// validate and collect changes, then sort descending by line_start
|
||||
struct change_entry {
|
||||
std::string mode;
|
||||
int line_start; // 1-based
|
||||
int line_end; // 1-based inclusive
|
||||
std::string content;
|
||||
};
|
||||
std::vector<change_entry> entries;
|
||||
entries.reserve(changes.size());
|
||||
|
||||
for (const auto & ch : changes) {
|
||||
change_entry e;
|
||||
e.mode = ch.at("mode").get<std::string>();
|
||||
e.line_start = ch.at("line_start").get<int>();
|
||||
e.line_end = ch.at("line_end").get<int>();
|
||||
e.content = ch.at("content").get<std::string>();
|
||||
|
||||
if (e.mode != "replace" && e.mode != "delete" && e.mode != "append") {
|
||||
return {{"error", "invalid mode \"" + e.mode + "\"; must be replace, delete, or append"}};
|
||||
}
|
||||
if (e.mode == "delete" && !e.content.empty()) {
|
||||
return {{"error", "content must be empty string for delete mode"}};
|
||||
}
|
||||
int n = (int) lines.size();
|
||||
if (e.line_start == -1) {
|
||||
// -1 means end of file; line_end is ignored — normalize to point past last line
|
||||
e.line_start = n + 1;
|
||||
e.line_end = n + 1;
|
||||
} else {
|
||||
if (e.line_start < 1 || e.line_end < e.line_start) {
|
||||
return {{"error", string_format("invalid line range [%d, %d]", e.line_start, e.line_end)}};
|
||||
}
|
||||
if (e.line_end > n) {
|
||||
return {{"error", string_format("line_end %d exceeds file length %d", e.line_end, n)}};
|
||||
}
|
||||
}
|
||||
entries.push_back(std::move(e));
|
||||
}
|
||||
|
||||
// sort descending so earlier-indexed changes don't shift later ones
|
||||
std::sort(entries.begin(), entries.end(), [](const change_entry & a, const change_entry & b) {
|
||||
return a.line_start > b.line_start;
|
||||
});
|
||||
|
||||
// apply changes (0-based indices internally)
|
||||
for (const auto & e : entries) {
|
||||
int idx_start = e.line_start - 1; // 0-based
|
||||
int idx_end = e.line_end - 1; // 0-based inclusive
|
||||
|
||||
// split content into lines (preserve trailing newline awareness)
|
||||
std::vector<std::string> new_lines;
|
||||
if (!e.content.empty()) {
|
||||
std::istringstream ss(e.content);
|
||||
std::string ln;
|
||||
while (std::getline(ss, ln)) {
|
||||
new_lines.push_back(ln);
|
||||
}
|
||||
// if content ends with \n, getline consumed it — no extra empty line needed
|
||||
// if content does NOT end with \n, last line is still captured correctly
|
||||
}
|
||||
|
||||
if (e.mode == "replace") {
|
||||
// erase [idx_start, idx_end] and insert new_lines
|
||||
lines.erase(lines.begin() + idx_start, lines.begin() + idx_end + 1);
|
||||
lines.insert(lines.begin() + idx_start, new_lines.begin(), new_lines.end());
|
||||
} else if (e.mode == "delete") {
|
||||
lines.erase(lines.begin() + idx_start, lines.begin() + idx_end + 1);
|
||||
} else { // append
|
||||
// idx_end + 1 may equal lines.size() when line_start == -1 (end of file)
|
||||
lines.insert(lines.begin() + idx_end + 1, new_lines.begin(), new_lines.end());
|
||||
}
|
||||
}
|
||||
|
||||
// write file back
|
||||
std::ofstream fout(path, std::ios::binary);
|
||||
if (!fout) {
|
||||
return {{"error", "failed to open file for writing: " + path}};
|
||||
}
|
||||
for (size_t i = 0; i < lines.size(); i++) {
|
||||
fout << lines[i];
|
||||
if (i + 1 < lines.size()) {
|
||||
fout << "\n";
|
||||
}
|
||||
}
|
||||
if (!lines.empty()) {
|
||||
fout << "\n";
|
||||
}
|
||||
if (!fout) {
|
||||
return {{"error", "failed to write file: " + path}};
|
||||
}
|
||||
|
||||
return {{"result", "file edited successfully"}, {"path", path}, {"lines", (int) lines.size()}};
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// apply_diff: apply a unified diff via git apply
|
||||
//
|
||||
|
||||
struct server_tool_apply_diff : server_tool {
|
||||
server_tool_apply_diff() {
|
||||
name = "apply_diff";
|
||||
display_name = "Apply diff";
|
||||
permission_write = true;
|
||||
}
|
||||
|
||||
json get_definition() override {
|
||||
return {
|
||||
{"type", "function"},
|
||||
{"function", {
|
||||
{"name", name},
|
||||
{"description", "Apply a unified diff to edit one or more files using git apply. Use this instead of edit_file when the changes are complex."},
|
||||
{"parameters", {
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"diff", {{"type", "string"}, {"description", "Unified diff content in git diff format"}}},
|
||||
}},
|
||||
{"required", json::array({"diff"})},
|
||||
}},
|
||||
}},
|
||||
};
|
||||
}
|
||||
|
||||
json invoke(json params) override {
|
||||
std::string diff = params.at("diff").get<std::string>();
|
||||
|
||||
// write diff to a temporary file
|
||||
static std::atomic<int> counter{0};
|
||||
std::string tmp_path = (fs::temp_directory_path() /
|
||||
("llama_patch_" + std::to_string(++counter) + ".patch")).string();
|
||||
|
||||
{
|
||||
std::ofstream f(tmp_path, std::ios::binary);
|
||||
if (!f) {
|
||||
return {{"error", "failed to create temp patch file"}};
|
||||
}
|
||||
f << diff;
|
||||
}
|
||||
|
||||
auto res = run_process({"git", "apply", tmp_path}, 4096, 10);
|
||||
|
||||
std::error_code ec;
|
||||
fs::remove(tmp_path, ec);
|
||||
|
||||
if (res.exit_code != 0) {
|
||||
return {{"error", "git apply failed (exit " + std::to_string(res.exit_code) + "): " + res.output}};
|
||||
}
|
||||
return {{"result", "patch applied successfully"}};
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// public API
|
||||
//
|
||||
|
||||
static std::vector<std::unique_ptr<server_tool>> build_tools() {
|
||||
std::vector<std::unique_ptr<server_tool>> tools;
|
||||
tools.push_back(std::make_unique<server_tool_read_file>());
|
||||
tools.push_back(std::make_unique<server_tool_file_glob_search>());
|
||||
tools.push_back(std::make_unique<server_tool_grep_search>());
|
||||
tools.push_back(std::make_unique<server_tool_exec_shell_command>());
|
||||
tools.push_back(std::make_unique<server_tool_write_file>());
|
||||
tools.push_back(std::make_unique<server_tool_edit_file>());
|
||||
tools.push_back(std::make_unique<server_tool_apply_diff>());
|
||||
return tools;
|
||||
}
|
||||
|
||||
void server_tools::setup(const std::vector<std::string> & enabled_tools) {
|
||||
if (!enabled_tools.empty()) {
|
||||
std::unordered_set<std::string> enabled_set(enabled_tools.begin(), enabled_tools.end());
|
||||
auto all_tools = build_tools();
|
||||
|
||||
tools.clear();
|
||||
for (auto & t : all_tools) {
|
||||
if (enabled_set.count(t->name) > 0 || enabled_set.count("all") > 0) {
|
||||
tools.push_back(std::move(t));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
handle_get = [this](const server_http_req &) -> server_http_res_ptr {
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
try {
|
||||
json result = json::array();
|
||||
for (const auto & t : tools) {
|
||||
result.push_back(t->to_json());
|
||||
}
|
||||
res->data = safe_json_to_str(result);
|
||||
} catch (const std::exception & e) {
|
||||
SRV_ERR("got exception: %s\n", e.what());
|
||||
res->status = 500;
|
||||
res->data = safe_json_to_str(format_error_response(e.what(), ERROR_TYPE_SERVER));
|
||||
}
|
||||
return res;
|
||||
};
|
||||
|
||||
handle_post = [this](const server_http_req & req) -> server_http_res_ptr {
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
try {
|
||||
json body = json::parse(req.body);
|
||||
std::string tool_name = body.at("tool").get<std::string>();
|
||||
json params = body.value("params", json::object());
|
||||
json result = invoke(tool_name, params);
|
||||
res->data = safe_json_to_str(result);
|
||||
} catch (const json::exception & e) {
|
||||
res->status = 400;
|
||||
res->data = safe_json_to_str(format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
|
||||
} catch (const std::exception & e) {
|
||||
SRV_ERR("got exception: %s\n", e.what());
|
||||
res->status = 500;
|
||||
res->data = safe_json_to_str(format_error_response(e.what(), ERROR_TYPE_SERVER));
|
||||
}
|
||||
return res;
|
||||
};
|
||||
}
|
||||
|
||||
json server_tools::invoke(const std::string & name, const json & params) {
|
||||
for (auto & t : tools) {
|
||||
if (t->name == name) {
|
||||
return t->invoke(params);
|
||||
}
|
||||
}
|
||||
return {{"error", "unknown tool: " + name}};
|
||||
}
|
||||
26
tools/server/server-tools.h
Normal file
26
tools/server/server-tools.h
Normal file
@@ -0,0 +1,26 @@
|
||||
#pragma once
|
||||
|
||||
#include "server-common.h"
|
||||
#include "server-http.h"
|
||||
|
||||
struct server_tool {
|
||||
std::string name;
|
||||
std::string display_name;
|
||||
bool permission_write = false;
|
||||
|
||||
virtual ~server_tool() = default;
|
||||
virtual json get_definition() = 0;
|
||||
virtual json invoke(json params) = 0;
|
||||
|
||||
json to_json();
|
||||
};
|
||||
|
||||
struct server_tools {
|
||||
std::vector<std::unique_ptr<server_tool>> tools;
|
||||
|
||||
void setup(const std::vector<std::string> & enabled_tools);
|
||||
json invoke(const std::string & name, const json & params);
|
||||
|
||||
server_http_context::handler_t handle_get;
|
||||
server_http_context::handler_t handle_post;
|
||||
};
|
||||
@@ -2,6 +2,7 @@
|
||||
#include "server-http.h"
|
||||
#include "server-models.h"
|
||||
#include "server-cors-proxy.h"
|
||||
#include "server-tools.h"
|
||||
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
@@ -124,6 +125,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// register API routes
|
||||
server_routes routes(params, ctx_server);
|
||||
server_tools tools;
|
||||
|
||||
bool is_router_server = params.model.path.empty();
|
||||
std::optional<server_models_routes> models_routes{};
|
||||
@@ -211,6 +213,16 @@ int main(int argc, char ** argv) {
|
||||
ctx_http.get ("/cors-proxy", ex_wrapper(proxy_handler_get));
|
||||
ctx_http.post("/cors-proxy", ex_wrapper(proxy_handler_post));
|
||||
}
|
||||
// EXPERIMENTAL built-in tools
|
||||
if (!params.server_tools.empty()) {
|
||||
tools.setup(params.server_tools);
|
||||
SRV_WRN("%s", "-----------------\n");
|
||||
SRV_WRN("%s", "Built-in tools are enabled, do not expose server to untrusted environments\n");
|
||||
SRV_WRN("%s", "This feature is EXPERIMENTAL and may be changed in the future\n");
|
||||
SRV_WRN("%s", "-----------------\n");
|
||||
ctx_http.get ("/tools", ex_wrapper(tools.handle_get));
|
||||
ctx_http.post("/tools", ex_wrapper(tools.handle_post));
|
||||
}
|
||||
|
||||
//
|
||||
// Start the server
|
||||
|
||||
@@ -296,6 +296,11 @@
|
||||
label: 'Disable reasoning content parsing',
|
||||
type: SettingsFieldType.CHECKBOX
|
||||
},
|
||||
{
|
||||
key: SETTINGS_KEYS.EXCLUDE_REASONING_FROM_CONTEXT,
|
||||
label: 'Exclude reasoning from context',
|
||||
type: SettingsFieldType.CHECKBOX
|
||||
},
|
||||
{
|
||||
key: SETTINGS_KEYS.SHOW_RAW_OUTPUT_SWITCH,
|
||||
label: 'Enable raw output toggle',
|
||||
|
||||
@@ -50,6 +50,8 @@ export const AGENTIC_REGEX = {
|
||||
PARTIAL_MARKER: /<<<[A-Za-z_]*$/,
|
||||
// Matches reasoning content blocks (including tags)
|
||||
REASONING_BLOCK: /<<<reasoning_content_start>>>[\s\S]*?<<<reasoning_content_end>>>/g,
|
||||
// Captures the reasoning text between start/end tags
|
||||
REASONING_EXTRACT: /<<<reasoning_content_start>>>([\s\S]*?)<<<reasoning_content_end>>>/,
|
||||
// Matches an opening reasoning tag and any remaining content (unterminated)
|
||||
REASONING_OPEN: /<<<reasoning_content_start>>>[\s\S]*$/,
|
||||
// Matches a complete agentic tool call display block (start to end marker)
|
||||
|
||||
@@ -10,6 +10,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean |
|
||||
theme: ColorMode.SYSTEM,
|
||||
showThoughtInProgress: false,
|
||||
disableReasoningParsing: false,
|
||||
excludeReasoningFromContext: false,
|
||||
showRawOutputSwitch: false,
|
||||
keepStatsVisible: false,
|
||||
showMessageStats: true,
|
||||
@@ -106,6 +107,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
|
||||
showThoughtInProgress: 'Expand thought process by default when generating messages.',
|
||||
disableReasoningParsing:
|
||||
'Send reasoning_format=none to prevent server-side extraction of reasoning tokens into separate field',
|
||||
excludeReasoningFromContext:
|
||||
'Strip reasoning content from previous messages before sending to the model. When unchecked, reasoning is sent back via the reasoning_content field so the model can see its own chain-of-thought across turns.',
|
||||
showRawOutputSwitch:
|
||||
'Show toggle button to display messages as plain text instead of Markdown-formatted content',
|
||||
keepStatsVisible: 'Keep processing statistics visible after generation finishes.',
|
||||
|
||||
@@ -54,6 +54,7 @@ export const SETTINGS_KEYS = {
|
||||
SHOW_TOOL_CALL_IN_PROGRESS: 'showToolCallInProgress',
|
||||
// Developer
|
||||
DISABLE_REASONING_PARSING: 'disableReasoningParsing',
|
||||
EXCLUDE_REASONING_FROM_CONTEXT: 'excludeReasoningFromContext',
|
||||
SHOW_RAW_OUTPUT_SWITCH: 'showRawOutputSwitch',
|
||||
CUSTOM: 'custom'
|
||||
} as const;
|
||||
|
||||
@@ -57,6 +57,46 @@ export class ChatService {
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* Extracts reasoning text from content that contains internal reasoning tags.
|
||||
* Returns the concatenated reasoning content or undefined if none found.
|
||||
*/
|
||||
private static extractReasoningFromContent(
|
||||
content: ApiChatMessageData['content'] | null | undefined
|
||||
): string | undefined {
|
||||
if (!content) return undefined;
|
||||
|
||||
const extractFromString = (text: string): string => {
|
||||
const parts: string[] = [];
|
||||
// Use a fresh regex instance to avoid shared lastIndex state
|
||||
const re = new RegExp(AGENTIC_REGEX.REASONING_EXTRACT.source);
|
||||
let match = re.exec(text);
|
||||
while (match) {
|
||||
parts.push(match[1]);
|
||||
// advance past the matched portion and retry
|
||||
text = text.slice(match.index + match[0].length);
|
||||
match = re.exec(text);
|
||||
}
|
||||
return parts.join('');
|
||||
};
|
||||
|
||||
if (typeof content === 'string') {
|
||||
const result = extractFromString(content);
|
||||
return result || undefined;
|
||||
}
|
||||
|
||||
if (!Array.isArray(content)) return undefined;
|
||||
|
||||
const parts: string[] = [];
|
||||
for (const part of content) {
|
||||
if (part.type === ContentPartType.TEXT && part.text) {
|
||||
const result = extractFromString(part.text);
|
||||
if (result) parts.push(result);
|
||||
}
|
||||
}
|
||||
return parts.length > 0 ? parts.join('') : undefined;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends a chat completion request to the llama.cpp server.
|
||||
* Supports both streaming and non-streaming responses with comprehensive parameter configuration.
|
||||
@@ -111,7 +151,8 @@ export class ChatService {
|
||||
custom,
|
||||
timings_per_token,
|
||||
// Config options
|
||||
disableReasoningParsing
|
||||
disableReasoningParsing,
|
||||
excludeReasoningFromContext
|
||||
} = options;
|
||||
|
||||
const normalizedMessages: ApiChatMessageData[] = messages
|
||||
@@ -159,14 +200,24 @@ export class ChatService {
|
||||
}
|
||||
|
||||
const requestBody: ApiChatCompletionRequest = {
|
||||
messages: normalizedMessages.map((msg: ApiChatMessageData) => ({
|
||||
role: msg.role,
|
||||
// Strip reasoning tags/content from the prompt to avoid polluting KV cache.
|
||||
// TODO: investigate backend expectations for reasoning tags and add a toggle if needed.
|
||||
content: ChatService.stripReasoningContent(msg.content),
|
||||
tool_calls: msg.tool_calls,
|
||||
tool_call_id: msg.tool_call_id
|
||||
})),
|
||||
messages: normalizedMessages.map((msg: ApiChatMessageData) => {
|
||||
// Always strip internal reasoning/agentic tags from content
|
||||
const cleanedContent = ChatService.stripReasoningContent(msg.content);
|
||||
const mapped: ApiChatCompletionRequest['messages'][0] = {
|
||||
role: msg.role,
|
||||
content: cleanedContent,
|
||||
tool_calls: msg.tool_calls,
|
||||
tool_call_id: msg.tool_call_id
|
||||
};
|
||||
// When preserving reasoning, extract it from raw content and send as separate field
|
||||
if (!excludeReasoningFromContext) {
|
||||
const reasoning = ChatService.extractReasoningFromContent(msg.content);
|
||||
if (reasoning) {
|
||||
mapped.reasoning_content = reasoning;
|
||||
}
|
||||
}
|
||||
return mapped;
|
||||
}),
|
||||
stream,
|
||||
return_progress: stream ? true : undefined,
|
||||
tools: tools && tools.length > 0 ? tools : undefined
|
||||
|
||||
@@ -227,6 +227,12 @@ export const SYNCABLE_PARAMETERS: SyncableParameter[] = [
|
||||
serverKey: 'alwaysShowAgenticTurns',
|
||||
type: SyncableParameterType.BOOLEAN,
|
||||
canSync: true
|
||||
},
|
||||
{
|
||||
key: 'excludeReasoningFromContext',
|
||||
serverKey: 'excludeReasoningFromContext',
|
||||
type: SyncableParameterType.BOOLEAN,
|
||||
canSync: true
|
||||
}
|
||||
];
|
||||
|
||||
|
||||
@@ -1479,6 +1479,8 @@ class ChatStore {
|
||||
|
||||
if (currentConfig.disableReasoningParsing) apiOptions.disableReasoningParsing = true;
|
||||
|
||||
if (currentConfig.excludeReasoningFromContext) apiOptions.excludeReasoningFromContext = true;
|
||||
|
||||
if (hasValue(currentConfig.temperature))
|
||||
apiOptions.temperature = Number(currentConfig.temperature);
|
||||
|
||||
|
||||
4
tools/server/webui/src/lib/types/api.d.ts
vendored
4
tools/server/webui/src/lib/types/api.d.ts
vendored
@@ -45,6 +45,7 @@ export interface ApiErrorResponse {
|
||||
export interface ApiChatMessageData {
|
||||
role: ChatRole;
|
||||
content: string | ApiChatMessageContentPart[];
|
||||
reasoning_content?: string;
|
||||
tool_calls?: ApiChatCompletionToolCall[];
|
||||
tool_call_id?: string;
|
||||
timestamp?: number;
|
||||
@@ -201,6 +202,9 @@ export interface ApiChatCompletionRequest {
|
||||
messages: Array<{
|
||||
role: ChatRole;
|
||||
content: string | ApiChatMessageContentPart[];
|
||||
reasoning_content?: string;
|
||||
tool_calls?: ApiChatCompletionToolCall[];
|
||||
tool_call_id?: string;
|
||||
}>;
|
||||
stream?: boolean;
|
||||
model?: string;
|
||||
|
||||
@@ -24,6 +24,8 @@ export interface SettingsChatServiceOptions {
|
||||
systemMessage?: string;
|
||||
// Disable reasoning parsing (use 'none' instead of 'auto')
|
||||
disableReasoningParsing?: boolean;
|
||||
// Strip reasoning content from context before sending
|
||||
excludeReasoningFromContext?: boolean;
|
||||
tools?: OpenAIToolDefinition[];
|
||||
// Generation parameters
|
||||
temperature?: number;
|
||||
|
||||
196
tools/server/webui/tests/unit/reasoning-context.test.ts
Normal file
196
tools/server/webui/tests/unit/reasoning-context.test.ts
Normal file
@@ -0,0 +1,196 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { AGENTIC_REGEX, REASONING_TAGS } from '$lib/constants/agentic';
|
||||
import { ContentPartType } from '$lib/enums';
|
||||
|
||||
// Replicate ChatService.extractReasoningFromContent (private static)
|
||||
function extractReasoningFromContent(
|
||||
content: string | Array<{ type: string; text?: string }> | null | undefined
|
||||
): string | undefined {
|
||||
if (!content) return undefined;
|
||||
|
||||
const extractFromString = (text: string): string => {
|
||||
const parts: string[] = [];
|
||||
const re = new RegExp(AGENTIC_REGEX.REASONING_EXTRACT.source);
|
||||
let match = re.exec(text);
|
||||
while (match) {
|
||||
parts.push(match[1]);
|
||||
text = text.slice(match.index + match[0].length);
|
||||
match = re.exec(text);
|
||||
}
|
||||
return parts.join('');
|
||||
};
|
||||
|
||||
if (typeof content === 'string') {
|
||||
const result = extractFromString(content);
|
||||
return result || undefined;
|
||||
}
|
||||
|
||||
if (!Array.isArray(content)) return undefined;
|
||||
|
||||
const parts: string[] = [];
|
||||
for (const part of content) {
|
||||
if (part.type === ContentPartType.TEXT && part.text) {
|
||||
const result = extractFromString(part.text);
|
||||
if (result) parts.push(result);
|
||||
}
|
||||
}
|
||||
return parts.length > 0 ? parts.join('') : undefined;
|
||||
}
|
||||
|
||||
// Replicate ChatService.stripReasoningContent (private static)
|
||||
function stripReasoningContent(
|
||||
content: string | Array<{ type: string; text?: string }> | null | undefined
|
||||
): typeof content {
|
||||
if (!content) return content;
|
||||
|
||||
if (typeof content === 'string') {
|
||||
return content
|
||||
.replace(AGENTIC_REGEX.REASONING_BLOCK, '')
|
||||
.replace(AGENTIC_REGEX.REASONING_OPEN, '')
|
||||
.replace(AGENTIC_REGEX.AGENTIC_TOOL_CALL_BLOCK, '')
|
||||
.replace(AGENTIC_REGEX.AGENTIC_TOOL_CALL_OPEN, '');
|
||||
}
|
||||
|
||||
if (!Array.isArray(content)) return content;
|
||||
|
||||
return content.map((part) => {
|
||||
if (part.type !== ContentPartType.TEXT || !part.text) return part;
|
||||
return {
|
||||
...part,
|
||||
text: part.text
|
||||
.replace(AGENTIC_REGEX.REASONING_BLOCK, '')
|
||||
.replace(AGENTIC_REGEX.REASONING_OPEN, '')
|
||||
.replace(AGENTIC_REGEX.AGENTIC_TOOL_CALL_BLOCK, '')
|
||||
.replace(AGENTIC_REGEX.AGENTIC_TOOL_CALL_OPEN, '')
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
// Simulate the message mapping logic from ChatService.sendMessage
|
||||
function buildApiMessage(
|
||||
content: string,
|
||||
excludeReasoningFromContext: boolean
|
||||
): { role: string; content: string; reasoning_content?: string } {
|
||||
const cleaned = stripReasoningContent(content) as string;
|
||||
const mapped: { role: string; content: string; reasoning_content?: string } = {
|
||||
role: 'assistant',
|
||||
content: cleaned
|
||||
};
|
||||
if (!excludeReasoningFromContext) {
|
||||
const reasoning = extractReasoningFromContent(content);
|
||||
if (reasoning) {
|
||||
mapped.reasoning_content = reasoning;
|
||||
}
|
||||
}
|
||||
return mapped;
|
||||
}
|
||||
|
||||
// Helper: wrap reasoning the same way the chat store does during streaming
|
||||
function wrapReasoning(reasoning: string, content: string): string {
|
||||
return `${REASONING_TAGS.START}${reasoning}${REASONING_TAGS.END}${content}`;
|
||||
}
|
||||
|
||||
describe('reasoning content extraction', () => {
|
||||
it('extracts reasoning from tagged string content', () => {
|
||||
const input = wrapReasoning('step 1, step 2', 'The answer is 42.');
|
||||
const result = extractReasoningFromContent(input);
|
||||
expect(result).toBe('step 1, step 2');
|
||||
});
|
||||
|
||||
it('returns undefined when no reasoning tags present', () => {
|
||||
expect(extractReasoningFromContent('Just a normal response.')).toBeUndefined();
|
||||
});
|
||||
|
||||
it('returns undefined for null/empty input', () => {
|
||||
expect(extractReasoningFromContent(null)).toBeUndefined();
|
||||
expect(extractReasoningFromContent(undefined)).toBeUndefined();
|
||||
expect(extractReasoningFromContent('')).toBeUndefined();
|
||||
});
|
||||
|
||||
it('extracts reasoning from content part arrays', () => {
|
||||
const input = [
|
||||
{
|
||||
type: ContentPartType.TEXT,
|
||||
text: wrapReasoning('thinking hard', 'result')
|
||||
}
|
||||
];
|
||||
expect(extractReasoningFromContent(input)).toBe('thinking hard');
|
||||
});
|
||||
|
||||
it('handles multiple reasoning blocks', () => {
|
||||
const input =
|
||||
REASONING_TAGS.START +
|
||||
'block1' +
|
||||
REASONING_TAGS.END +
|
||||
'middle' +
|
||||
REASONING_TAGS.START +
|
||||
'block2' +
|
||||
REASONING_TAGS.END +
|
||||
'end';
|
||||
expect(extractReasoningFromContent(input)).toBe('block1block2');
|
||||
});
|
||||
|
||||
it('ignores non-text content parts', () => {
|
||||
const input = [{ type: 'image_url', text: wrapReasoning('hidden', 'img') }];
|
||||
expect(extractReasoningFromContent(input)).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
||||
describe('strip reasoning content', () => {
|
||||
it('removes reasoning tags from string content', () => {
|
||||
const input = wrapReasoning('internal thoughts', 'visible answer');
|
||||
expect(stripReasoningContent(input)).toBe('visible answer');
|
||||
});
|
||||
|
||||
it('removes reasoning from content part arrays', () => {
|
||||
const input = [
|
||||
{
|
||||
type: ContentPartType.TEXT,
|
||||
text: wrapReasoning('thoughts', 'answer')
|
||||
}
|
||||
];
|
||||
const result = stripReasoningContent(input) as Array<{ type: string; text?: string }>;
|
||||
expect(result[0].text).toBe('answer');
|
||||
});
|
||||
});
|
||||
|
||||
describe('API message building with reasoning preservation', () => {
|
||||
const storedContent = wrapReasoning('Let me think: 2+2=4, basic arithmetic.', 'The answer is 4.');
|
||||
|
||||
it('preserves reasoning_content when excludeReasoningFromContext is false', () => {
|
||||
const msg = buildApiMessage(storedContent, false);
|
||||
expect(msg.content).toBe('The answer is 4.');
|
||||
expect(msg.reasoning_content).toBe('Let me think: 2+2=4, basic arithmetic.');
|
||||
// no internal tags leak into either field
|
||||
expect(msg.content).not.toContain('<<<');
|
||||
expect(msg.reasoning_content).not.toContain('<<<');
|
||||
});
|
||||
|
||||
it('strips reasoning_content when excludeReasoningFromContext is true', () => {
|
||||
const msg = buildApiMessage(storedContent, true);
|
||||
expect(msg.content).toBe('The answer is 4.');
|
||||
expect(msg.reasoning_content).toBeUndefined();
|
||||
});
|
||||
|
||||
it('handles content with no reasoning in both modes', () => {
|
||||
const plain = 'No reasoning here.';
|
||||
const msgPreserve = buildApiMessage(plain, false);
|
||||
const msgExclude = buildApiMessage(plain, true);
|
||||
expect(msgPreserve.content).toBe(plain);
|
||||
expect(msgPreserve.reasoning_content).toBeUndefined();
|
||||
expect(msgExclude.content).toBe(plain);
|
||||
expect(msgExclude.reasoning_content).toBeUndefined();
|
||||
});
|
||||
|
||||
it('cleans agentic tool call blocks from content even when preserving reasoning', () => {
|
||||
const input =
|
||||
wrapReasoning('plan', 'text') +
|
||||
'\n\n<<<AGENTIC_TOOL_CALL_START>>>\n' +
|
||||
'<<<TOOL_NAME:bash>>>\n' +
|
||||
'<<<TOOL_ARGS_START>>>\n{}\n<<<TOOL_ARGS_END>>>\nout\n' +
|
||||
'<<<AGENTIC_TOOL_CALL_END>>>\n';
|
||||
const msg = buildApiMessage(input, false);
|
||||
expect(msg.content).not.toContain('<<<');
|
||||
expect(msg.reasoning_content).toBe('plan');
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user