Compare commits


4 Commits
b8579 ... b8583

Author SHA1 Message Date
Aman Gupta
278521c33a llama-model-loader: print warning when using overrides with mmap (#20978)
* llama-model-loader: use pinned memory for tensor overrides

* change to warning
2026-03-30 17:40:17 +08:00
Sigbjørn Skjæret
e2eb39e81c ci : bump ty to 0.0.26 (#21156)
* fix incorrect type ignore comments

* bump ty to 0.0.26
2026-03-30 09:29:15 +02:00
Xuan-Son Nguyen
abf9a62161 server: wrap headers for mcp proxy (#21072)
* server: wrap headers for mcp proxy

* Update tools/server/server-cors-proxy.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix build

* chore: update webui build output

* chore: update webui build output

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>
2026-03-30 08:59:16 +02:00
Sigbjørn Skjæret
7c203670f8 add missing ROPE_FACTORS_LONG/SHORT for MiniCPM (#21150)
2026-03-29 19:45:40 +02:00
17 changed files with 59 additions and 21 deletions

View File

@@ -31,7 +31,7 @@ jobs:
uses: actions/setup-python@v6
with:
python-version: "3.11"
-pip-install: -r requirements/requirements-all.txt ty==0.0.24
+pip-install: -r requirements/requirements-all.txt ty==0.0.26
# - name: Type-check with Pyright
# uses: jakebailey/pyright-action@v2
# with:

View File

@@ -31,10 +31,10 @@ import gguf
from gguf.vocab import MistralTokenizerType, MistralVocab
try:
-from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found]
-from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found]
-from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found]
-from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found]
+from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found, ty:unresolved-import]
+from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found, ty:unresolved-import]
+from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found, ty:unresolved-import]
+from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found, ty:unresolved-import]
SentencePieceTokenizer,
)

View File

@@ -7,7 +7,7 @@ import os
# Add utils directory to path for direct script execution
sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
-from common import get_model_name_from_env_path, compare_tokens, exit_with_warning # type: ignore[import-not-found]
+from common import get_model_name_from_env_path, compare_tokens, exit_with_warning # type: ignore[import-not-found, ty:unresolved-import]
def quick_logits_check(pytorch_file, llamacpp_file):
"""Lightweight sanity check before NMSE"""

View File

@@ -5,7 +5,7 @@ import sys
import os
import argparse
from pathlib import Path
-from common import get_model_name_from_env_path # type: ignore[import-not-found]
+from common import get_model_name_from_env_path # type: ignore[import-not-found, ty:unresolved-import]
def calculate_nmse(reference, test):
mse = np.mean((test - reference) ** 2)

View File

@@ -2,7 +2,7 @@
import argparse
import sys
-from common import compare_tokens # type: ignore[import-not-found]
+from common import compare_tokens # type: ignore[import-not-found, ty:unresolved-import]
def parse_arguments():

View File

@@ -7,7 +7,7 @@ import importlib
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
-from common import compare_tokens, exit_with_warning # type: ignore[import-not-found]
+from common import compare_tokens, exit_with_warning # type: ignore[import-not-found, ty:unresolved-import]
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')

View File

@@ -14,12 +14,12 @@ except ImportError:
SentencePieceProcessor: Any = None
try:
-from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # type: ignore[import-not-found]
-from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found]
-from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found]
+from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # type: ignore[import-not-found, ty:unresolved-import]
+from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found, ty:unresolved-import]
+from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found, ty:unresolved-import]
_filter_valid_tokenizer_files,
)
-from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found]
+from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found, ty:unresolved-import]
SentencePieceTokenizer,
)
except ImportError:
@@ -32,7 +32,7 @@ else:
_mistral_common_installed = True
try:
-from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found]
+from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found, ty:unresolved-import]
get_one_valid_tokenizer_file,
)
except ImportError:

View File

@@ -147,7 +147,7 @@ ranges_nfd: list[tuple[int, int, int]] = [(0, 0, 0)] # start, last, nfd
for codepoint, norm in table_nfd:
start = ranges_nfd[-1][0]
if ranges_nfd[-1] != (start, codepoint - 1, norm):
-ranges_nfd.append(None) # type: ignore[arg-type] # dummy, will be replaced below
+ranges_nfd.append((0, 0, 0)) # dummy, will be replaced below
start = codepoint
ranges_nfd[-1] = (start, codepoint, norm)
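The replacement placeholder keeps ranges_nfd homogeneously typed as list[tuple[int, int, int]], which is why the # type: ignore[arg-type] escape hatch can be dropped: the dummy entry is overwritten on the next line either way, so any all-zero tuple works.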

View File

@@ -557,6 +557,8 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_ROPE_FREQS,
+LLM_TENSOR_ROPE_FACTORS_LONG,
+LLM_TENSOR_ROPE_FACTORS_SHORT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,

View File

@@ -1158,6 +1158,12 @@ struct ggml_tensor * llama_model_loader::create_tensor(
if (overrides->buft == ggml_backend_cpu_buffer_type()) {
// when overriding to a CPU buffer, consider the extra buffer types
buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu);
+if (use_mmap) {
+    static std::once_flag once;
+    std::call_once(once, [] {
+        LLAMA_LOG_WARN("llama_model_loader: tensor overrides to CPU are used with mmap enabled - consider using --no-mmap for better performance\n");
+    });
+}
} else {
buft = overrides->buft;
}
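The new warning relies on std::call_once so that it fires a single time even though create_tensor runs once per overridden tensor. A minimal standalone sketch of that warn-once idiom (names and message here are illustrative, not taken from the llama.cpp sources):

// warn_once.cpp - the warn-once idiom from the hunk above, in isolation.
// std::call_once executes the lambda exactly once, even when warn_once()
// is invoked repeatedly or concurrently from several threads.
#include <cstdio>
#include <mutex>

static void warn_once() {
    static std::once_flag once;
    std::call_once(once, [] {
        std::fprintf(stderr, "warning: tensor overrides are used with mmap enabled\n");
    });
}

int main() {
    for (int i = 0; i < 3; i++) {
        warn_once(); // prints only on the first iteration
    }
}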

Binary file not shown.

View File

@@ -32,13 +32,22 @@ static server_http_res_ptr proxy_request(const server_http_req & req, std::strin
SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), parsed_url.host.c_str(), parsed_url.port, parsed_url.path.c_str());
+std::map<std::string, std::string> headers;
+for (auto [key, value] : req.headers) {
+    auto new_key = key;
+    if (string_starts_with(new_key, "X-Proxy-Header-")) {
+        string_replace_all(new_key, "X-Proxy-Header-", "");
+    }
+    headers[new_key] = value;
+}
auto proxy = std::make_unique<server_http_proxy>(
method,
parsed_url.scheme,
parsed_url.host,
parsed_url.port,
parsed_url.path,
-req.headers,
+headers,
req.body,
req.should_stop,
600, // timeout_read (default to 10 minutes)
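proxy_request now unwraps each X-Proxy-Header-* entry back to its original name before forwarding. A self-contained sketch of that unwrapping step, using plain std::string operations in place of llama.cpp's string_starts_with/string_replace_all helpers:

// unwrap_headers.cpp - illustrates the header unwrapping done above.
// A header wrapped by the webui, e.g. "X-Proxy-Header-Authorization",
// is restored to "Authorization"; unprefixed headers pass through.
#include <cstdio>
#include <map>
#include <string>

int main() {
    const std::string prefix = "X-Proxy-Header-";
    const std::map<std::string, std::string> wrapped = {
        { "X-Proxy-Header-Authorization", "Bearer token" },
        { "Content-Type", "application/json" }, // no prefix: forwarded as-is
    };
    std::map<std::string, std::string> headers;
    for (const auto & [key, value] : wrapped) {
        std::string new_key = key;
        if (new_key.rfind(prefix, 0) == 0) { // starts_with(prefix)
            new_key = new_key.substr(prefix.size());
        }
        headers[new_key] = value;
    }
    for (const auto & [k, v] : headers) {
        std::printf("%s: %s\n", k.c_str(), v.c_str());
    }
}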

View File

@@ -35,7 +35,7 @@ using server_http_res_ptr = std::unique_ptr<server_http_res>;
struct server_http_req {
std::map<std::string, std::string> params; // path_params + query_params
-std::map<std::string, std::string> headers; // reserved for future use
+std::map<std::string, std::string> headers; // used by MCP proxy
std::string path;
std::string query_string; // query parameters string (e.g. "action=save")
std::string body;

View File

@@ -116,7 +116,7 @@ class ServerProcess:
self.server_port = int(os.environ["PORT"])
self.external_server = "DEBUG_EXTERNAL" in os.environ
-def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
+def start(self, timeout_seconds: int = DEFAULT_HTTP_TIMEOUT) -> None:
if self.external_server:
print(f"[external_server]: Assuming external server running on {self.server_host}:{self.server_port}")
return

View File

@@ -39,7 +39,13 @@ import type {
MCPResourceContent,
MCPReadResourceResult
} from '$lib/types';
-import { buildProxiedUrl, throwIfAborted, isAbortError, createBase64DataUrl } from '$lib/utils';
+import {
+    buildProxiedUrl,
+    buildProxiedHeaders,
+    throwIfAborted,
+    isAbortError,
+    createBase64DataUrl
+} from '$lib/utils';
interface ToolResultContentItem {
type: string;
@@ -118,7 +124,7 @@ export class MCPService {
const requestInit: RequestInit = {};
if (config.headers) {
-requestInit.headers = config.headers;
+requestInit.headers = buildProxiedHeaders(config.headers);
}
if (config.credentials) {

View File

@@ -19,6 +19,21 @@ export function buildProxiedUrl(targetUrl: string): URL {
return proxyUrl;
}
+/**
+ * Wrap original headers for proxying through the CORS proxy. This avoids issues with duplicated llama.cpp-specific and target headers when using the CORS proxy.
+ * @param headers - The original headers to be proxied to target
+ * @returns List of "wrapped" headers to be sent to the CORS proxy
+ */
+export function buildProxiedHeaders(headers: Record<string, string>): Record<string, string> {
+    const proxiedHeaders: Record<string, string> = {};
+    for (const [key, value] of Object.entries(headers)) {
+        proxiedHeaders[`X-Proxy-Header-${key}`] = value;
+    }
+    return proxiedHeaders;
+}
/**
* Get a proxied URL string for use in fetch requests.
* @param targetUrl - The original URL to proxy
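Together with the server-side change above this closes the round trip: buildProxiedHeaders wraps every MCP target header as X-Proxy-Header-<name> in the browser, and proxy_request strips the prefix before forwarding, so target headers cannot collide with the llama.cpp-specific headers that accompany the proxy request itself.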

View File

@@ -38,7 +38,7 @@ export { highlightCode, detectIncompleteCodeBlock, type IncompleteCodeBlock } fr
export { setConfigValue, getConfigValue, configToParameterRecord } from './config-helpers';
// CORS Proxy
-export { buildProxiedUrl, getProxiedUrlString } from './cors-proxy';
+export { buildProxiedUrl, getProxiedUrlString, buildProxiedHeaders } from './cors-proxy';
// Conversation utilities
export { createMessageCountMap, getMessageCount } from './conversation-utils';