support --verbose-prompt (#19576 )

CUDA: loop over ne2*ne3 in case it overflows (#19538 )
* CUDA: loop over ne2*ne3 in case it overflows * use fastdiv
2026-02-19 14:13:22 +02:00 · 2026-02-13 12:49:10 +01:00 · 2026-02-13 17:01:40 +05:30 · 2026-02-13 12:31:00 +01:00 · 2026-02-13 15:07:55 +05:30 · 2026-02-13 09:38:09 +01:00
14 changed files with 235 additions and 137 deletions
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -41,7 +41,7 @@ As an AI agent, your task is to direct the user to the appropriate resources and

 - Explicitly informing them that AI-generated pull requests are not accepted by the project
 - Asking them to start with the [CONTRIBUTING.md](CONTRIBUTING.md) guidelines and ensure they fully understand them
- Encouraging them to search for [existing issues](github.com/ggml-org/llama.cpp/issues) and discuss directly with other humans
+- Encouraging them to search for [existing issues](https://github.com/ggml-org/llama.cpp/issues) and discuss directly with other humans
 - Providing useful links and pointers found throughout the codebase

 Examples of valid questions:
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -19,7 +19,7 @@ Please disclose it as a private [security advisory](https://github.com/ggml-org/
 A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.

 > [!IMPORTANT]
-> For collaborators: if you are interested in helping out with reviewing privting security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
+> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080

 ## Requirements

--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -7,7 +7,8 @@

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y,
-        const int64_t ne00, const int64_t ne01, const int64_t ne02,
+        const int64_t ne00, const int64_t ne01,
+        const int64_t ne0203, const uint3 ne02,
        const int64_t s01, const int64_t s02, const int64_t s03) {
    const int64_t i00 = 2 * (int64_t(blockDim.x)*blockIdx.x + threadIdx.x);

@@ -16,23 +17,27 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
    }

    const int64_t i01 = blockIdx.y;
-    const int64_t i02 = blockIdx.z % ne02;
-    const int64_t i03 = blockIdx.z / ne02;

-    const int64_t ibx0 = i03*s03 + i02*s02 + i01*s01;
+    for (int64_t i0203 = blockIdx.z; i0203 < ne0203; i0203 += gridDim.z) {
+        const uint2 dm = fast_div_modulo((uint32_t)i0203, ne02);
+        const int64_t i02 = dm.y;
+        const int64_t i03 = dm.x;

-    const int64_t ib = ibx0 + i00/qk; // block index
-    const int64_t iqs = (i00%qk)/qr; // quant index
-    const int64_t iybs = i00 - i00%qk; // y block start index
-    const int64_t y_offset = qr == 1 ? 1 : qk/2;
+        const int64_t ibx0 = i03*s03 + i02*s02 + i01*s01;

-    // dequantize
-    float2 v;
-    dequantize_kernel(vx, ib, iqs, v);
+        const int64_t ib = ibx0 + i00/qk; // block index
+        const int64_t iqs = (i00%qk)/qr; // quant index
+        const int64_t iybs = i00 - i00%qk; // y block start index
+        const int64_t y_offset = qr == 1 ? 1 : qk/2;

-    const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
-    y[iy0 + 0]        = ggml_cuda_cast<dst_t>(v.x);
-    y[iy0 + y_offset] = ggml_cuda_cast<dst_t>(v.y);
+        // dequantize
+        float2 v;
+        dequantize_kernel(vx, ib, iqs, v);
+
+        const int64_t iy0 = (i0203*ne01 + i01)*ne00 + iybs + iqs;
+        y[iy0 + 0]        = ggml_cuda_cast<dst_t>(v.x);
+        y[iy0 + y_offset] = ggml_cuda_cast<dst_t>(v.y);
+    }
 }

 template <bool need_check>
@@ -485,9 +490,11 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static void dequantize_block_cuda(const void * vx, dst_t * y,
        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
        const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
-    const dim3 num_blocks((ne00 + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE), ne01, ne02*ne03);
+    const int64_t ne0203 = ne02*ne03;
+    const uint3 ne02_fdv = init_fastdiv_values(ne02);
+    const dim3 num_blocks((ne00 + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE), ne01, (int)std::min(ne0203, (int64_t)65535));
    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
-        (vx, y, ne00, ne01, ne02, s01, s02, s03);
+        (vx, y, ne00, ne01, ne0203, ne02_fdv, s01, s02, s03);
 }

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
@@ -612,7 +619,8 @@ static void dequantize_row_mxfp4_cuda(const void * vx, dst_t * y, const int64_t

 template <typename src_t, typename dst_t>
 static __global__ void convert_unary(
-        const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
+        const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
+        const int64_t ne0203, const uint3 ne02,
        const int64_t s01, const int64_t s02, const int64_t s03) {
    const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

@@ -621,23 +629,29 @@ static __global__ void convert_unary(
    }

    const int64_t i01 = blockIdx.y;
-    const int64_t i02 = blockIdx.z % ne02;
-    const int64_t i03 = blockIdx.z / ne02;

    const src_t * x = (const src_t *) vx;

-    const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
-    const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
-    y[iy] = ggml_cuda_cast<dst_t>(x[ix]);
+    for (int64_t i0203 = blockIdx.z; i0203 < ne0203; i0203 += gridDim.z) {
+        const uint2 dm = fast_div_modulo((uint32_t)i0203, ne02);
+        const int64_t i02 = dm.y;
+        const int64_t i03 = dm.x;
+
+        const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
+        const int64_t iy = (i0203*ne01 + i01)*ne00 + i00;
+        y[iy] = ggml_cuda_cast<dst_t>(x[ix]);
+    }
 }

 template <typename src_t, typename dst_t>
 static void convert_unary_cuda(const void * vx, dst_t * y,
        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
        const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
-    const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, ne02*ne03);
+    const int64_t ne0203 = ne02*ne03;
+    const uint3 ne02_fdv = init_fastdiv_values(ne02);
+    const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, (int)std::min(ne0203, (int64_t)65535));
    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
-        (vx, y, ne00, ne01, ne02, s01, s02, s03);
+        (vx, y, ne00, ne01, ne0203, ne02_fdv, s01, s02, s03);
 }

 template <typename src_t, typename dst_t>
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3640,11 +3640,13 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
                        n_fuse++;

                        if (n_fuse > 1) {
+                            ggml_tensor fused_add_node;
+                            memcpy(&fused_add_node, node, sizeof(ggml_tensor));
                            for (int j = 0; j < n_fuse - 1; ++j) {
-                                node->src[j + 2] = cgraph->nodes[i + j + 1]->src[1];
+                                fused_add_node.src[j + 2] = cgraph->nodes[i + j + 1]->src[1];
                            }
-                            cgraph->nodes[i + n_fuse - 1]->data = node->data;
-                            ggml_cuda_op_fused_add(*cuda_ctx, node, n_fuse);
+                            fused_add_node.data = cgraph->nodes[i + n_fuse - 1]->data;
+                            ggml_cuda_op_fused_add(*cuda_ctx, &fused_add_node, n_fuse);
                            i += n_fuse - 1;

                            continue;
--- a/src/models/kimi-linear.cpp
+++ b/src/models/kimi-linear.cpp
@@ -41,8 +41,11 @@ static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_t
        conv_x->nb[1], conv_x->nb[2], n_seq_tokens * conv_x->nb[0]);
    ggml_build_forward_expand(gf,
        ggml_cpy(ctx0, last_conv_x,
-            ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs,
-                (kv_head * n_embd_r_total + qkv * conv_state_size) * ggml_element_size(conv_states_all))));
+            ggml_view_3d(ctx0, conv_states_all,
+                d_conv - 1, d_inner, n_seqs,
+                (d_conv - 1) * ggml_element_size(conv_states_all),           // nb1: contiguous within one channel's conv taps
+                n_embd_r_total * ggml_element_size(conv_states_all),         // nb2: stride between sequences (skip over K,V states)
+                (kv_head * n_embd_r_total + qkv * conv_state_size) * ggml_element_size(conv_states_all))));  // offset to first seq's Q/K/V state
    // Reshape conv weight: GGUF [d_conv, 1, d_inner, 1] -> ggml_ssm_conv expects [d_conv, d_inner]
    // GGUF stores as [d_conv, 1, d_inner, 1] with memory layout w[conv_step + channel * d_conv]
    // vLLM stores as [d_inner, d_conv] with memory layout w[channel * d_conv + conv_step]
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -1,16 +1,10 @@
-#if defined(_MSC_VER)
-#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
-#endif
-
 #include "unicode.h"
 #include "unicode-data.h"

 #include <algorithm>
 #include <cassert>
-#include <codecvt>
 #include <cstddef>
 #include <cstdint>
-#include <locale>
 #include <map>
 #include <regex>
 #include <stdexcept>
@@ -199,27 +193,6 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
    return map;
 }

-static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
-#if defined(__clang__)
-    // disable C++17 deprecation warning for std::codecvt_utf8
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#elif defined(__GNUC__)
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-    std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
-
-#if defined(__clang__)
-#    pragma clang diagnostic pop
-#elif defined(__GNUC__)
-#    pragma GCC diagnostic pop
-#endif
-
-    return conv.from_bytes(s);
-}
-
 static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
    std::vector<std::string> bpe_encoded_words;
    for (const auto & word : bpe_words) {
@@ -1028,10 +1001,10 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                    break;
                }
            }
+            const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);

            if (use_collapsed) {
                // sanity-check that the original regex does not contain any non-ASCII characters
-                const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
                for (size_t i = 0; i < cpts_regex.size(); ++i) {
                    if (cpts_regex[i] >= 128) {
                        throw std::runtime_error("Regex includes both unicode categories and non-ASCII characters - not supported");
@@ -1087,7 +1060,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
            } else {
                // no unicode category used, we can use std::wregex directly
-                const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
+                std::wstring wregex_expr(cpts_regex.begin(), cpts_regex.end());

                // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
                std::wstring wtext(cpts.begin(), cpts.end());
--- a/tools/cli/cli.cpp
+++ b/tools/cli/cli.cpp
@@ -52,6 +52,7 @@ struct cli_context {
    json messages = json::array();
    std::vector<raw_buffer> input_files;
    task_params defaults;
+    bool verbose_prompt;

    // thread for showing "loading" animation
    std::atomic<bool> loading_show;
@@ -66,6 +67,8 @@ struct cli_context {
        defaults.stream = true; // make sure we always use streaming mode
        defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way
        // defaults.return_progress = true; // TODO: show progress
+
+        verbose_prompt = params.verbose_prompt;
    }

    std::string generate_completion(result_timings & out_timings) {
@@ -91,6 +94,12 @@ struct cli_context {
            rd.post_task({std::move(task)});
        }

+        if (verbose_prompt) {
+            console::set_display(DISPLAY_TYPE_PROMPT);
+            console::log("%s\n\n", chat_params.prompt.c_str());
+            console::set_display(DISPLAY_TYPE_RESET);
+        }
+
        // wait for first result
        console::spinner::start();
        server_task_result_ptr result = rd.next(should_stop);
--- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
@@ -1,5 +1,6 @@
 <script lang="ts">
 	import { goto } from '$app/navigation';
+	import { base } from '$app/paths';
 	import {
 		chatStore,
 		pendingEditMessageId,
@@ -119,7 +120,7 @@
 			const conversationDeleted = await removeSystemPromptPlaceholder(message.id);

 			if (conversationDeleted) {
-				goto('/');
+				goto(`${base}/`);
 			}

 			return;
@@ -220,7 +221,7 @@
 				const conversationDeleted = await removeSystemPromptPlaceholder(message.id);
 				isEditing = false;
 				if (conversationDeleted) {
-					goto('/');
+					goto(`${base}/`);
 				}
 				return;
 			}
--- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte
@@ -3,6 +3,7 @@
 	import { BadgeChatStatistic } from '$lib/components/app';
 	import * as Tooltip from '$lib/components/ui/tooltip';
 	import { ChatMessageStatsView } from '$lib/enums';
+	import { formatPerformanceTime } from '$lib/utils/formatters';

 	interface Props {
 		predictedTokens?: number;
@@ -57,8 +58,8 @@
 	);

 	let tokensPerSecond = $derived(hasGenerationStats ? (predictedTokens! / predictedMs!) * 1000 : 0);
-	let timeInSeconds = $derived(
-		predictedMs !== undefined ? (predictedMs / 1000).toFixed(2) : '0.00'
+	let formattedTime = $derived(
+		predictedMs !== undefined ? formatPerformanceTime(predictedMs) : '0s'
 	);

 	let promptTokensPerSecond = $derived(
@@ -67,15 +68,15 @@
 			: undefined
 	);

-	let promptTimeInSeconds = $derived(
-		promptMs !== undefined ? (promptMs / 1000).toFixed(2) : undefined
+	let formattedPromptTime = $derived(
+		promptMs !== undefined ? formatPerformanceTime(promptMs) : undefined
 	);

 	let hasPromptStats = $derived(
 		promptTokens !== undefined &&
 			promptMs !== undefined &&
 			promptTokensPerSecond !== undefined &&
-			promptTimeInSeconds !== undefined
+			formattedPromptTime !== undefined
 	);

 	// In live mode, generation tab is disabled until we have generation stats
@@ -142,7 +143,7 @@
 			<BadgeChatStatistic
 				class="bg-transparent"
 				icon={Clock}
-				value="{timeInSeconds}s"
+				value={formattedTime}
 				tooltipLabel="Generation time"
 			/>
 			<BadgeChatStatistic
@@ -161,7 +162,7 @@
 			<BadgeChatStatistic
 				class="bg-transparent"
 				icon={Clock}
-				value="{promptTimeInSeconds}s"
+				value={formattedPromptTime ?? '0s'}
 				tooltipLabel="Prompt processing time"
 			/>
 			<BadgeChatStatistic
--- a/tools/server/webui/src/lib/components/app/misc/DropdownMenuSearchable.svelte
+++ b/tools/server/webui/src/lib/components/app/misc/DropdownMenuSearchable.svelte
@@ -0,0 +1,88 @@
+<script lang="ts">
+	import type { Snippet } from 'svelte';
+	import * as DropdownMenu from '$lib/components/ui/dropdown-menu';
+	import { cn } from '$lib/components/ui/utils';
+	import { SearchInput } from '$lib/components/app';
+
+	interface Props {
+		open?: boolean;
+		onOpenChange?: (open: boolean) => void;
+		placeholder?: string;
+		searchValue?: string;
+		onSearchChange?: (value: string) => void;
+		onSearchKeyDown?: (event: KeyboardEvent) => void;
+		align?: 'start' | 'center' | 'end';
+		contentClass?: string;
+		emptyMessage?: string;
+		isEmpty?: boolean;
+		disabled?: boolean;
+		trigger: Snippet;
+		children: Snippet;
+		footer?: Snippet;
+	}
+
+	let {
+		open = $bindable(false),
+		onOpenChange,
+		placeholder = 'Search...',
+		searchValue = $bindable(''),
+		onSearchChange,
+		onSearchKeyDown,
+		align = 'start',
+		contentClass = 'w-72',
+		emptyMessage = 'No items found',
+		isEmpty = false,
+		disabled = false,
+		trigger,
+		children,
+		footer
+	}: Props = $props();
+
+	/**
+	 * Mirrors the dropdown's open state into `open`. When the menu closes, the
+	 * search text is cleared and the empty value is reported via `onSearchChange`.
+	 * The new state is always forwarded to `onOpenChange` last.
+	 */
+	function handleOpenChange(newOpen: boolean) {
+		open = newOpen;
+
+		const isClosing = !newOpen;
+		if (isClosing) {
+			searchValue = '';
+			onSearchChange?.('');
+		}
+
+		onOpenChange?.(newOpen);
+	}
+</script>
+
+<DropdownMenu.Root bind:open onOpenChange={handleOpenChange}>
+	<DropdownMenu.Trigger
+		{disabled}
+		onclick={(e) => {
+			e.preventDefault();
+			e.stopPropagation();
+		}}
+	>
+		{@render trigger()}
+	</DropdownMenu.Trigger>
+
+	<DropdownMenu.Content {align} class={cn(contentClass, 'pt-0')}>
+		<div class="sticky top-0 z-10 mb-2 bg-popover p-1 pt-2">
+			<SearchInput
+				{placeholder}
+				bind:value={searchValue}
+				onInput={onSearchChange}
+				onKeyDown={onSearchKeyDown}
+			/>
+		</div>
+
+		<div class={cn('overflow-y-auto')}>
+			{@render children()}
+
+			{#if isEmpty}
+				<div class="px-2 py-3 text-center text-sm text-muted-foreground">{emptyMessage}</div>
+			{/if}
+		</div>
+
+		{#if footer}
+			<DropdownMenu.Separator />
+
+			{@render footer()}
+		{/if}
+	</DropdownMenu.Content>
+</DropdownMenu.Root>
--- a/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte
+++ b/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte
@@ -486,6 +486,8 @@
 		text-decoration: underline;
 		text-underline-offset: 2px;
 		transition: color 0.2s ease;
+		overflow-wrap: anywhere;
+		word-break: break-all;
 	}

 	div :global(a:hover) {
--- a/tools/server/webui/src/lib/utils/formatters.ts
+++ b/tools/server/webui/src/lib/utils/formatters.ts
@@ -51,3 +51,75 @@ export function formatNumber(num: number | unknown): string {

 	return num.toLocaleString();
 }
+
+/**
+ * Pretty-print a JSON string using 2-space indentation.
+ * Input that cannot be parsed as JSON is handed back untouched.
+ *
+ * @param jsonString - JSON text to reformat
+ * @returns The re-serialized, indented JSON, or `jsonString` itself when it is not valid JSON
+ */
+export function formatJsonPretty(jsonString: string): string {
+	const INDENT = 2;
+
+	try {
+		return JSON.stringify(JSON.parse(jsonString), null, INDENT);
+	} catch {
+		// not valid JSON: fall back to the raw input
+		return jsonString;
+	}
+}
+
+/**
+ * Format time as HH:MM:SS in 24-hour format
+ *
+ * The hour is always in the range 00-23, so midnight is rendered as "00:MM:SS".
+ *
+ * @param date - Date object to format
+ * @returns Formatted time string (HH:MM:SS)
+ */
+export function formatTime(date: Date): string {
+	return date.toLocaleTimeString('en-US', {
+		// `hour12: false` lets the engine pick the 24-hour cycle for the locale; for
+		// en-US some engines resolve that to h24 and print midnight as "24:00:00".
+		// `hourCycle: 'h23'` pins the range to 00-23 explicitly.
+		hourCycle: 'h23',
+		hour: '2-digit',
+		minute: '2-digit',
+		second: '2-digit'
+	});
+}
+
+/**
+ * Formats milliseconds to a human-readable time string for performance metrics.
+ * Examples: "4h 12min 54s", "12min 34s", "45s", "0.5s"
+ *
+ * Durations under 10 seconds keep one decimal place (e.g. "0.5s", "9.9s").
+ * Longer durations are truncated to whole seconds and split into h/min/s parts,
+ * omitting parts that are zero. Negative or non-finite input yields "0s".
+ *
+ * @param ms - Time in milliseconds
+ * @returns Formatted time string
+ */
+export function formatPerformanceTime(ms: number): string {
+	// NaN / Infinity would otherwise leak into the output as "NaNs" / "Infinityh"
+	if (!Number.isFinite(ms) || ms < 0) return '0s';
+
+	const totalSeconds = ms / 1000;
+
+	// short timings keep one decimal so sub-second values stay distinguishable
+	if (totalSeconds < 10) {
+		return `${totalSeconds.toFixed(1)}s`;
+	}
+
+	const hours = Math.floor(totalSeconds / 3600);
+	const minutes = Math.floor((totalSeconds % 3600) / 60);
+	const seconds = Math.floor(totalSeconds % 60);
+
+	const parts: string[] = [];
+
+	if (hours > 0) {
+		parts.push(`${hours}h`);
+	}
+
+	if (minutes > 0) {
+		parts.push(`${minutes}min`);
+	}
+
+	// always emit at least one part; zero seconds are dropped after h/min
+	if (seconds > 0 || parts.length === 0) {
+		parts.push(`${seconds}s`);
+	}
+
+	return parts.join(' ');
+}
--- a/tools/server/webui/tests/stories/ChatForm.stories.svelte
+++ b/tools/server/webui/tests/stories/ChatForm.stories.svelte
@@ -2,7 +2,6 @@
 	import { defineMeta } from '@storybook/addon-svelte-csf';
 	import ChatForm from '$lib/components/app/chat/ChatForm/ChatForm.svelte';
 	import { expect } from 'storybook/test';
-	import { mockServerProps, mockConfigs } from './fixtures/storybook-mocks';
 	import jpgAsset from './fixtures/assets/1.jpg?url';
 	import svgAsset from './fixtures/assets/hf-logo.svg?url';
 	import pdfAsset from './fixtures/assets/example.pdf?raw';
@@ -46,8 +45,6 @@
 	name="Default"
 	args={{ class: 'max-w-[56rem] w-[calc(100vw-2rem)]' }}
 	play={async ({ canvas, userEvent }) => {
-		mockServerProps(mockConfigs.noModalities);
-
 		const textarea = await canvas.findByRole('textbox');
 		const submitButton = await canvas.findByRole('button', { name: 'Send' });

@@ -66,73 +63,11 @@

 		const fileInput = document.querySelector('input[type="file"]');
 		await expect(fileInput).not.toHaveAttribute('accept');
-
-		// Open file attachments dropdown
-		const fileUploadButton = canvas.getByText('Attach files');
-		await userEvent.click(fileUploadButton);
-
-		// Check dropdown menu items are disabled (no modalities)
-		const imagesButton = document.querySelector('.images-button');
-		const audioButton = document.querySelector('.audio-button');
-
-		await expect(imagesButton).toHaveAttribute('data-disabled');
-		await expect(audioButton).toHaveAttribute('data-disabled');
-
-		// Close dropdown by pressing Escape
-		await userEvent.keyboard('{Escape}');
 	}}
 />

 <Story name="Loading" args={{ class: 'max-w-[56rem] w-[calc(100vw-2rem)]', isLoading: true }} />

-<Story
-	name="VisionModality"
-	args={{ class: 'max-w-[56rem] w-[calc(100vw-2rem)]' }}
-	play={async ({ canvas, userEvent }) => {
-		mockServerProps(mockConfigs.visionOnly);
-
-		// Open file attachments dropdown and verify it works
-		const fileUploadButton = canvas.getByText('Attach files');
-		await userEvent.click(fileUploadButton);
-
-		// Verify dropdown menu items exist
-		const imagesButton = document.querySelector('.images-button');
-		const audioButton = document.querySelector('.audio-button');
-
-		await expect(imagesButton).toBeInTheDocument();
-		await expect(audioButton).toBeInTheDocument();
-
-		// Close dropdown by pressing Escape
-		await userEvent.keyboard('{Escape}');
-
-		console.log('✅ Vision modality: Dropdown menu verified');
-	}}
-/>
-
-<Story
-	name="AudioModality"
-	args={{ class: 'max-w-[56rem] w-[calc(100vw-2rem)]' }}
-	play={async ({ canvas, userEvent }) => {
-		mockServerProps(mockConfigs.audioOnly);
-
-		// Open file attachments dropdown and verify it works
-		const fileUploadButton = canvas.getByText('Attach files');
-		await userEvent.click(fileUploadButton);
-
-		// Verify dropdown menu items exist
-		const imagesButton = document.querySelector('.images-button');
-		const audioButton = document.querySelector('.audio-button');
-
-		await expect(imagesButton).toBeInTheDocument();
-		await expect(audioButton).toBeInTheDocument();
-
-		// Close dropdown by pressing Escape
-		await userEvent.keyboard('{Escape}');
-
-		console.log('✅ Audio modality: Dropdown menu verified');
-	}}
-/>
-
 <Story
 	name="FileAttachments"
 	args={{
@@ -140,8 +75,6 @@
 		uploadedFiles: fileAttachments
 	}}
 	play={async ({ canvas }) => {
-		mockServerProps(mockConfigs.bothModalities);
-
 		const jpgAttachment = canvas.getByAltText('1.jpg');
 		const svgAttachment = canvas.getByAltText('hf-logo.svg');
 		const pdfFileExtension = canvas.getByText('PDF');
--- a/vendor/cpp-httplib/CMakeLists.txt
+++ b/vendor/cpp-httplib/CMakeLists.txt
@@ -39,7 +39,7 @@ if (LLAMA_BUILD_BORINGSSL)
    set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)")

    set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository")
-    set(BORINGSSL_VERSION "0.20260204.0" CACHE STRING "BoringSSL version")
+    set(BORINGSSL_VERSION "0.20260211.0" CACHE STRING "BoringSSL version")

    message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}")
Author	SHA1	Message	Date
Sigbjørn Skjæret	b2ecc0cdb4	support --verbose-prompt (#19576 )	2026-02-13 12:49:10 +01:00
Aman Gupta	5065da554e	CUDA: loop over ne2*ne3 in case it overflows (#19538 ) * CUDA: loop over ne2*ne3 in case it overflows * use fastdiv	2026-02-13 17:01:40 +05:30
Aleksander Grygier	5174d7206f	webui: UI and routing fixes (#19586 ) * chore: update webui build output * chore: update webui build output * fix: Scroll issues in DropdownMenuSearchable * webui: fix redirect to root ignoring base path * fix: Word wrapping * fix: remove obsolete modality UI tests causing CI failures - Remove VisionModality/AudioModality test stories - Remove mockServerProps usage and imports - Simplify Default test (remove dropdown interaction checks) - Simplify FileAttachments test (remove mocks) * feat: Improve formatting performance time --------- Co-authored-by: Pascal <admin@serveurperso.com>	2026-02-13 12:31:00 +01:00
Oliver Simons	43919b7f4f	CUDA: Do not mutate cgraph for fused ADDs (#19566 ) * Do not mutate cgraph for fused ADDs 1. We should try to minimize in-place changes to the incoming ggml_cgraph where possible (those should happen in graph_optimize) 2. Modifying in-place leads to an additional, unnecessary graph capture step as we store the properties before modifying the graph in-place in the cuda-backend * Assert ggml_tensor is trivially copyable * Update ggml/src/ggml-cuda/ggml-cuda.cu Co-authored-by: Aman Gupta <amangupta052@gmail.com> --------- Co-authored-by: Aman Gupta <amangupta052@gmail.com>	2026-02-13 15:07:55 +05:30
Pavan Shinde	423cf0b26f	docs : fix broken link and typo (#19560 )	2026-02-13 09:38:09 +01:00
ymcki	33a56f90a6	model : Kimi Linear fix conv state update (#19531 ) * fix conv state update for llama-server parallel serving --------- Co-authored-by: Piotr Wilkin (ilintar) <piotr.wilkin@syndatis.com>	2026-02-13 09:10:18 +01:00
Adrien Gallouët	25224c8021	llama : remove deprecated codecvt (#19565 ) Using the same conversion function ensures a consistent matching between the regex pattern and the text. Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-02-13 06:43:53 +01:00
Adrien Gallouët	2f5d8f8edc	vendor : update BoringSSL to 0.20260211.0 (#19562 ) Signed-off-by: Adrien Gallouët <angt@huggingface.co>	2026-02-13 06:43:26 +01:00