Compare commits


4 Commits
b8579 ... b8583

Author SHA1 Message Date
Aman Gupta
278521c33a llama-model-loader: print warning when using overrides with mmap (#20978)
* llama-model-loader: use pinned memory for tensor overrides

* change to warning
2026-03-30 17:40:17 +08:00
Sigbjørn Skjæret
e2eb39e81c ci : bump ty to 0.0.26 (#21156)
* fix incorrect type ignore comments

* bump ty to 0.0.26
2026-03-30 09:29:15 +02:00
Xuan-Son Nguyen
abf9a62161 server: wrap headers for mcp proxy (#21072)
* server: wrap headers for mcp proxy

* Update tools/server/server-cors-proxy.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix build

* chore: update webui build output

* chore: update webui build output

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>
2026-03-30 08:59:16 +02:00
Sigbjørn Skjæret
7c203670f8 add missing ROPE_FACTORS_LONG/SHORT for MiniCPM (#21150)
2026-03-29 19:45:40 +02:00
17 changed files with 59 additions and 21 deletions

View File

@@ -31,7 +31,7 @@ jobs:
uses: actions/setup-python@v6
with:
python-version: "3.11"
-pip-install: -r requirements/requirements-all.txt ty==0.0.24
+pip-install: -r requirements/requirements-all.txt ty==0.0.26
# - name: Type-check with Pyright
# uses: jakebailey/pyright-action@v2
# with:

View File

@@ -31,10 +31,10 @@ import gguf
from gguf.vocab import MistralTokenizerType, MistralVocab
try:
-from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found]
-from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found]
-from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found]
-from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found]
+from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found, ty:unresolved-import]
+from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found, ty:unresolved-import]
+from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found, ty:unresolved-import]
+from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found, ty:unresolved-import]
SentencePieceTokenizer,
)

View File

@@ -7,7 +7,7 @@ import os
# Add utils directory to path for direct script execution
sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
-from common import get_model_name_from_env_path, compare_tokens, exit_with_warning # type: ignore[import-not-found]
+from common import get_model_name_from_env_path, compare_tokens, exit_with_warning # type: ignore[import-not-found, ty:unresolved-import]
def quick_logits_check(pytorch_file, llamacpp_file):
"""Lightweight sanity check before NMSE"""

View File

@@ -5,7 +5,7 @@ import sys
import os
import argparse
from pathlib import Path
-from common import get_model_name_from_env_path # type: ignore[import-not-found]
+from common import get_model_name_from_env_path # type: ignore[import-not-found, ty:unresolved-import]
def calculate_nmse(reference, test):
mse = np.mean((test - reference) ** 2)

View File

@@ -2,7 +2,7 @@
import argparse
import sys
-from common import compare_tokens # type: ignore[import-not-found]
+from common import compare_tokens # type: ignore[import-not-found, ty:unresolved-import]
def parse_arguments():

View File

@@ -7,7 +7,7 @@ import importlib
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
-from common import compare_tokens, exit_with_warning # type: ignore[import-not-found]
+from common import compare_tokens, exit_with_warning # type: ignore[import-not-found, ty:unresolved-import]
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')

View File

@@ -14,12 +14,12 @@ except ImportError:
SentencePieceProcessor: Any = None
try:
-from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # type: ignore[import-not-found]
-from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found]
-from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found]
+from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # type: ignore[import-not-found, ty:unresolved-import]
+from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found, ty:unresolved-import]
+from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found, ty:unresolved-import]
_filter_valid_tokenizer_files,
)
-from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found]
+from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found, ty:unresolved-import]
SentencePieceTokenizer,
)
except ImportError:
@@ -32,7 +32,7 @@ else:
_mistral_common_installed = True
try:
-from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found]
+from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found, ty:unresolved-import]
get_one_valid_tokenizer_file,
)
except ImportError:

View File

@@ -147,7 +147,7 @@ ranges_nfd: list[tuple[int, int, int]] = [(0, 0, 0)] # start, last, nfd
for codepoint, norm in table_nfd:
start = ranges_nfd[-1][0]
if ranges_nfd[-1] != (start, codepoint - 1, norm):
-ranges_nfd.append(None) # type: ignore[arg-type] # dummy, will be replaced below
+ranges_nfd.append((0, 0, 0)) # dummy, will be replaced below
start = codepoint
ranges_nfd[-1] = (start, codepoint, norm)
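The replacement placeholder keeps ranges_nfd homogeneously typed as list[tuple[int, int, int]], which is why the # type: ignore[arg-type] escape hatch can be dropped: the dummy entry is overwritten on the next line either way, so any all-zero tuple works.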

View File

@@ -557,6 +557,8 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ROPE_FREQS,
+LLM_TENSOR_ROPE_FACTORS_LONG,
+LLM_TENSOR_ROPE_FACTORS_SHORT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,

View File

@@ -1158,6 +1158,12 @@ struct ggml_tensor * llama_model_loader::create_tensor(
if (overrides->buft == ggml_backend_cpu_buffer_type()) {
// when overriding to a CPU buffer, consider the extra buffer types
buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu);
+if (use_mmap) {
+    static std::once_flag once;
+    std::call_once(once, [] {
+        LLAMA_LOG_WARN("llama_model_loader: tensor overrides to CPU are used with mmap enabled - consider using --no-mmap for better performance\n");
+    });
+}
} else {
buft = overrides->buft;
}
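The new warning relies on std::call_once so that it fires a single time even though create_tensor runs once per overridden tensor. A minimal standalone sketch of that warn-once idiom (names and message here are illustrative, not taken from the llama.cpp sources):

// warn_once.cpp - the warn-once idiom from the hunk above, in isolation.
// std::call_once executes the lambda exactly once, even when warn_once()
// is invoked repeatedly or concurrently from several threads.
#include <cstdio>
#include <mutex>

static void warn_once() {
    static std::once_flag once;
    std::call_once(once, [] {
        std::fprintf(stderr, "warning: tensor overrides are used with mmap enabled\n");
    });
}

int main() {
    for (int i = 0; i < 3; i++) {
        warn_once(); // prints only on the first iteration
    }
}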

Binary file not shown.

View File

@@ -32,13 +32,22 @@ static server_http_res_ptr proxy_request(const server_http_req & req, std::strin
SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), parsed_url.host.c_str(), parsed_url.port, parsed_url.path.c_str());
+std::map<std::string, std::string> headers;
+for (auto [key, value] : req.headers) {
+    auto new_key = key;
+    if (string_starts_with(new_key, "X-Proxy-Header-")) {
+        string_replace_all(new_key, "X-Proxy-Header-", "");
+    }
+    headers[new_key] = value;
+}
auto proxy = std::make_unique<server_http_proxy>(
method,
parsed_url.scheme,
parsed_url.host,
parsed_url.port,
parsed_url.path,
-req.headers,
+headers,
req.body,
req.should_stop,
600, // timeout_read (default to 10 minutes)
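proxy_request now unwraps each X-Proxy-Header-* entry back to its original name before forwarding. A self-contained sketch of that unwrapping step, using plain std::string operations in place of llama.cpp's string_starts_with/string_replace_all helpers:

// unwrap_headers.cpp - illustrates the header unwrapping done above.
// A header wrapped by the webui, e.g. "X-Proxy-Header-Authorization",
// is restored to "Authorization"; unprefixed headers pass through.
#include <cstdio>
#include <map>
#include <string>

int main() {
    const std::string prefix = "X-Proxy-Header-";
    const std::map<std::string, std::string> wrapped = {
        { "X-Proxy-Header-Authorization", "Bearer token" },
        { "Content-Type", "application/json" }, // no prefix: forwarded as-is
    };
    std::map<std::string, std::string> headers;
    for (const auto & [key, value] : wrapped) {
        std::string new_key = key;
        if (new_key.rfind(prefix, 0) == 0) { // starts_with(prefix)
            new_key = new_key.substr(prefix.size());
        }
        headers[new_key] = value;
    }
    for (const auto & [k, v] : headers) {
        std::printf("%s: %s\n", k.c_str(), v.c_str());
    }
}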

View File

@@ -35,7 +35,7 @@ using server_http_res_ptr = std::unique_ptr<server_http_res>;
struct server_http_req {
std::map<std::string, std::string> params; // path_params + query_params
-std::map<std::string, std::string> headers; // reserved for future use
+std::map<std::string, std::string> headers; // used by MCP proxy
std::string path;
std::string query_string; // query parameters string (e.g. "action=save")
std::string body;

View File

@@ -116,7 +116,7 @@ class ServerProcess:
self.server_port = int(os.environ["PORT"])
self.external_server = "DEBUG_EXTERNAL" in os.environ
-def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
+def start(self, timeout_seconds: int = DEFAULT_HTTP_TIMEOUT) -> None:
if self.external_server:
print(f"[external_server]: Assuming external server running on {self.server_host}:{self.server_port}")
return

View File

@@ -39,7 +39,13 @@ import type {
MCPResourceContent,
MCPReadResourceResult
} from '$lib/types';
-import { buildProxiedUrl, throwIfAborted, isAbortError, createBase64DataUrl } from '$lib/utils';
+import {
+    buildProxiedUrl,
+    buildProxiedHeaders,
+    throwIfAborted,
+    isAbortError,
+    createBase64DataUrl
+} from '$lib/utils';
interface ToolResultContentItem {
type: string;
@@ -118,7 +124,7 @@ export class MCPService {
const requestInit: RequestInit = {};
if (config.headers) {
-requestInit.headers = config.headers;
+requestInit.headers = buildProxiedHeaders(config.headers);
}
if (config.credentials) {

View File

@@ -19,6 +19,21 @@ export function buildProxiedUrl(targetUrl: string): URL {
return proxyUrl;
}
+/**
+ * Wrap original headers for proxying through the CORS proxy. This avoids issues with duplicated llama.cpp-specific and target headers when using the CORS proxy.
+ * @param headers - The original headers to be proxied to target
+ * @returns List of "wrapped" headers to be sent to the CORS proxy
+ */
+export function buildProxiedHeaders(headers: Record<string, string>): Record<string, string> {
+    const proxiedHeaders: Record<string, string> = {};
+    for (const [key, value] of Object.entries(headers)) {
+        proxiedHeaders[`X-Proxy-Header-${key}`] = value;
+    }
+    return proxiedHeaders;
+}
/**
* Get a proxied URL string for use in fetch requests.
* @param targetUrl - The original URL to proxy
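Together with the server-side change above this closes the round trip: buildProxiedHeaders wraps every MCP target header as X-Proxy-Header-<name> in the browser, and proxy_request strips the prefix before forwarding, so target headers cannot collide with the llama.cpp-specific headers that accompany the proxy request itself.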

View File

@@ -38,7 +38,7 @@ export { highlightCode, detectIncompleteCodeBlock, type IncompleteCodeBlock } fr
export { setConfigValue, getConfigValue, configToParameterRecord } from './config-helpers';
// CORS Proxy
-export { buildProxiedUrl, getProxiedUrlString } from './cors-proxy';
+export { buildProxiedUrl, getProxiedUrlString, buildProxiedHeaders } from './cors-proxy';
// Conversation utilities
export { createMessageCountMap, getMessageCount } from './conversation-utils';