mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-05-21 17:17:24 +03:00
Compare commits
49 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d14ce3dab4 | ||
|
|
6db130445d | ||
|
|
4b262ab662 | ||
|
|
00c461ce1a | ||
|
|
ccee426426 | ||
|
|
3c81c8deea | ||
|
|
cd963fee6a | ||
|
|
d2e179a477 | ||
|
|
c85a242ed0 | ||
|
|
aabee047d8 | ||
|
|
f1c1c5c057 | ||
|
|
439f1b193d | ||
|
|
c3e9ade6dd | ||
|
|
9a532ae4ba | ||
|
|
b7340443d4 | ||
|
|
5cbaa5e69e | ||
|
|
45b455e66f | ||
|
|
3a9c1b854d | ||
|
|
b9a2170fce | ||
|
|
1ff0fc1384 | ||
|
|
a135ec0baa | ||
|
|
232f466583 | ||
|
|
49c21f97cd | ||
|
|
77e38d68f2 | ||
|
|
053e01dff6 | ||
|
|
c3f95c1f06 | ||
|
|
0caf2a1d48 | ||
|
|
5511965b19 | ||
|
|
e98bcfec28 | ||
|
|
1867a0c692 | ||
|
|
dd7cad7197 | ||
|
|
726704a160 | ||
|
|
87589042ca | ||
|
|
e0de4c2419 | ||
|
|
84c678242a | ||
|
|
3e12fbdea5 | ||
|
|
39cf5d6191 | ||
|
|
a6d6183dbc | ||
|
|
fcae601e44 | ||
|
|
7ba22c6a09 | ||
|
|
f4cc787b9f | ||
|
|
3fbadb06dc | ||
|
|
1a68ec9378 | ||
|
|
a16cce81d3 | ||
|
|
4f13cb7424 | ||
|
|
b64739ea39 | ||
|
|
64b38b561b | ||
|
|
6049906133 | ||
|
|
0253fb21f5 |
@@ -5,6 +5,9 @@
|
||||
# Define the CANN base image for easier version updates later
|
||||
ARG CHIP_TYPE=910b
|
||||
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
# ==============================================================================
|
||||
# BUILD STAGE
|
||||
@@ -67,6 +70,19 @@ RUN mkdir -p /app/full && \
|
||||
# ==============================================================================
|
||||
FROM ${CANN_BASE_IMAGE} AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
# -- Install runtime dependencies --
|
||||
RUN yum install -y libgomp curl && \
|
||||
yum clean all && \
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
@@ -35,6 +38,19 @@ RUN mkdir -p /app/full \
|
||||
## Base image
|
||||
FROM ubuntu:$UBUNTU_VERSION AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt autoremove -y \
|
||||
|
||||
@@ -6,6 +6,10 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
|
||||
|
||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
# CUDA architecture to build for (defaults to all supported archs)
|
||||
@@ -40,6 +44,19 @@ RUN mkdir -p /app/full \
|
||||
## Base image
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt autoremove -y \
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
## Build Image
|
||||
|
||||
@@ -40,6 +43,19 @@ RUN mkdir -p /app/full \
|
||||
|
||||
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
ARG IGC_VERSION=v2.20.5
|
||||
ARG IGC_VERSION_FULL=2_2.20.5+19972
|
||||
ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
FROM ascendai/cann:$ASCEND_VERSION AS build
|
||||
|
||||
@@ -28,6 +31,20 @@ RUN echo "Building with static libs" && \
|
||||
|
||||
# TODO: use image with NNRT
|
||||
FROM ascendai/cann:$ASCEND_VERSION AS runtime
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
@@ -6,6 +6,10 @@ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_V
|
||||
|
||||
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
||||
|
||||
# MUSA architecture to build for (defaults to all supported archs)
|
||||
@@ -45,6 +49,19 @@ RUN mkdir -p /app/full \
|
||||
## Base image
|
||||
FROM ${BASE_MUSA_RUN_CONTAINER} AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt autoremove -y \
|
||||
|
||||
@@ -18,6 +18,10 @@ ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
|
||||
ARG http_proxy=
|
||||
ARG https_proxy=
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
## Build Image
|
||||
FROM ubuntu:${UBUNTU_VERSION} AS build
|
||||
|
||||
@@ -88,6 +92,18 @@ FROM ubuntu:${UBUNTU_VERSION} AS base
|
||||
# Pass proxy args to runtime stage
|
||||
ARG http_proxy
|
||||
ARG https_proxy
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
|
||||
|
||||
@@ -7,6 +7,10 @@ ARG AMDGPU_VERSION=7.2.1
|
||||
# Target the ROCm build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
### Build image
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||
|
||||
@@ -57,6 +61,19 @@ RUN mkdir -p /app/full \
|
||||
## Base image
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt autoremove -y \
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
ARG GCC_VERSION=15.2.0
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
### Build Llama.cpp stage
|
||||
FROM gcc:${GCC_VERSION} AS build
|
||||
@@ -52,6 +55,19 @@ COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
|
||||
### Base image
|
||||
FROM ubuntu:${UBUNTU_VERSION} AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
--mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
|
||||
apt update -y && \
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
ARG UBUNTU_VERSION=26.04
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
@@ -31,6 +34,19 @@ RUN mkdir -p /app/full \
|
||||
## Base image
|
||||
FROM ubuntu:$UBUNTU_VERSION AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
|
||||
libglvnd0 libgl1 libglx0 libegl1 libgles2 \
|
||||
|
||||
83
.github/workflows/docker.yml
vendored
83
.github/workflows/docker.yml
vendored
@@ -11,6 +11,11 @@ name: Publish Docker image
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
inputs:
|
||||
skip_s390x:
|
||||
description: "Skip the s390x build target (useful for fast test runs that do not need the IBM Z runner)"
|
||||
type: boolean
|
||||
default: false
|
||||
schedule:
|
||||
# Rebuild daily rather than on every push because it is expensive
|
||||
- cron: '12 4 * * *'
|
||||
@@ -64,6 +69,8 @@ jobs:
|
||||
- name: Generate build and merge matrices
|
||||
id: matrices
|
||||
shell: bash
|
||||
env:
|
||||
SKIP_S390X: ${{ inputs.skip_s390x || 'false' }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
@@ -86,6 +93,11 @@ jobs:
|
||||
]
|
||||
JSON
|
||||
|
||||
if [ "${SKIP_S390X}" = "true" ]; then
|
||||
jq 'map(select(.platforms != "linux/s390x"))' build-matrix.json > build-matrix.json.tmp
|
||||
mv build-matrix.json.tmp build-matrix.json
|
||||
fi
|
||||
|
||||
BUILD_MATRIX="$(jq -c . build-matrix.json)"
|
||||
MERGE_MATRIX="$(jq -c '
|
||||
reduce .[] as $entry ({}; .[$entry.tag] |= (
|
||||
@@ -132,6 +144,7 @@ jobs:
|
||||
config: ${{ fromJSON(needs.prepare_matrices.outputs.build_matrix) }}
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
@@ -187,6 +200,10 @@ jobs:
|
||||
env:
|
||||
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
|
||||
|
||||
- name: Get build date
|
||||
id: build_date
|
||||
run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Free Disk Space (Ubuntu)
|
||||
if: ${{ matrix.config.free_disk_space == true }}
|
||||
uses: ggml-org/free-disk-space@v1.3.1
|
||||
@@ -211,13 +228,26 @@ jobs:
|
||||
with:
|
||||
context: .
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
|
||||
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
target: full
|
||||
provenance: false
|
||||
build-args: |
|
||||
BUILD_DATE=${{ steps.build_date.outputs.date }}
|
||||
APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
|
||||
APP_REVISION=${{ steps.checkout.outputs.commit }}
|
||||
IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
|
||||
IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
|
||||
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
|
||||
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
|
||||
annotations: |
|
||||
manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
|
||||
manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
|
||||
manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
|
||||
manifest:org.opencontainers.image.title=llama.cpp
|
||||
manifest:org.opencontainers.image.description=LLM inference in C/C++
|
||||
manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
|
||||
manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
|
||||
# using github experimental cache
|
||||
#cache-from: type=gha
|
||||
#cache-to: type=gha,mode=max
|
||||
@@ -235,13 +265,26 @@ jobs:
|
||||
with:
|
||||
context: .
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
|
||||
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
target: light
|
||||
provenance: false
|
||||
build-args: |
|
||||
BUILD_DATE=${{ steps.build_date.outputs.date }}
|
||||
APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
|
||||
APP_REVISION=${{ steps.checkout.outputs.commit }}
|
||||
IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
|
||||
IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
|
||||
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
|
||||
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
|
||||
annotations: |
|
||||
manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
|
||||
manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
|
||||
manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
|
||||
manifest:org.opencontainers.image.title=llama.cpp
|
||||
manifest:org.opencontainers.image.description=LLM inference in C/C++
|
||||
manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
|
||||
manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
|
||||
# using github experimental cache
|
||||
#cache-from: type=gha
|
||||
#cache-to: type=gha,mode=max
|
||||
@@ -259,13 +302,26 @@ jobs:
|
||||
with:
|
||||
context: .
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
|
||||
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
target: server
|
||||
provenance: false
|
||||
build-args: |
|
||||
BUILD_DATE=${{ steps.build_date.outputs.date }}
|
||||
APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
|
||||
APP_REVISION=${{ steps.checkout.outputs.commit }}
|
||||
IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
|
||||
IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
|
||||
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
|
||||
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
|
||||
annotations: |
|
||||
manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
|
||||
manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
|
||||
manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
|
||||
manifest:org.opencontainers.image.title=llama.cpp
|
||||
manifest:org.opencontainers.image.description=LLM inference in C/C++
|
||||
manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
|
||||
manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
|
||||
# using github experimental cache
|
||||
#cache-from: type=gha
|
||||
#cache-to: type=gha,mode=max
|
||||
@@ -330,10 +386,15 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get build date
|
||||
id: build_date
|
||||
run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Download digest metadata
|
||||
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
|
||||
with:
|
||||
@@ -361,6 +422,8 @@ jobs:
|
||||
IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
|
||||
PREFIX="${IMAGE_REPO}:"
|
||||
SRC_TAG="${{ needs.create_tag.outputs.source_tag }}"
|
||||
BUILD_DATE="${{ steps.build_date.outputs.date }}"
|
||||
COMMIT_SHA="${{ steps.checkout.outputs.commit }}"
|
||||
TAGS="${{ matrix.config.tag }}"
|
||||
ARCHES="${{ matrix.config.arches }}"
|
||||
DIGEST_GLOB="/tmp/digests/*.tsv"
|
||||
@@ -412,11 +475,21 @@ jobs:
|
||||
refs+=("${IMAGE_REPO}@${digest}")
|
||||
done
|
||||
|
||||
local annotations=(
|
||||
--annotation "index:org.opencontainers.image.created=${BUILD_DATE}"
|
||||
--annotation "index:org.opencontainers.image.version=${SRC_TAG}"
|
||||
--annotation "index:org.opencontainers.image.revision=${COMMIT_SHA}"
|
||||
--annotation "index:org.opencontainers.image.title=llama.cpp"
|
||||
--annotation "index:org.opencontainers.image.description=LLM inference in C/C++"
|
||||
--annotation "index:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}"
|
||||
--annotation "index:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}"
|
||||
)
|
||||
|
||||
echo "Creating ${merged_tag} from ${refs[*]}"
|
||||
docker buildx imagetools create --tag "${merged_tag}" "${refs[@]}"
|
||||
docker buildx imagetools create "${annotations[@]}" --tag "${merged_tag}" "${refs[@]}"
|
||||
|
||||
echo "Creating ${merged_versioned_tag} from ${refs[*]}"
|
||||
docker buildx imagetools create --tag "${merged_versioned_tag}" "${refs[@]}"
|
||||
docker buildx imagetools create "${annotations[@]}" --tag "${merged_versioned_tag}" "${refs[@]}"
|
||||
}
|
||||
|
||||
for tag in $TAGS; do
|
||||
|
||||
65
.github/workflows/server-self-hosted.yml
vendored
65
.github/workflows/server-self-hosted.yml
vendored
@@ -130,3 +130,68 @@ jobs:
|
||||
# pip install -r requirements.txt
|
||||
# export ${{ matrix.extra_args }}
|
||||
# pytest -v -x -m "not slow"
|
||||
|
||||
server-kleidiai:
|
||||
runs-on: ah-ubuntu_22_04-c8g_8x
|
||||
|
||||
name: server-kleidiai (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_build_flags: "-DGGML_CPU_KLEIDIAI=ON"
|
||||
extra_args: ""
|
||||
wf_name: "CPUx1, kleidiai"
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
set -euxo pipefail
|
||||
sudo apt-get update
|
||||
sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
|
||||
apt-get install -y \
|
||||
build-essential \
|
||||
libssl-dev \
|
||||
python3-venv \
|
||||
gpg \
|
||||
wget \
|
||||
time \
|
||||
git-lfs
|
||||
|
||||
git lfs install
|
||||
|
||||
# install the latest cmake
|
||||
sudo install -d /usr/share/keyrings
|
||||
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
|
||||
| gpg --dearmor \
|
||||
| sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
|
||||
echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
|
||||
| sudo tee /etc/apt/sources.list.d/kitware.list
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y cmake
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_SCHED_NO_REALLOC=ON ${{ matrix.extra_build_flags }}
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
4
.github/workflows/ui-ci.yml
vendored
4
.github/workflows/ui-ci.yml
vendored
@@ -41,7 +41,7 @@ jobs:
|
||||
ui-checks:
|
||||
name: UI Checks
|
||||
needs: ui-build
|
||||
runs-on: ubuntu-24.04-arm
|
||||
runs-on: ubuntu-latest
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Checkout code
|
||||
@@ -93,7 +93,7 @@ jobs:
|
||||
e2e-tests:
|
||||
name: E2E Tests
|
||||
needs: ui-build
|
||||
runs-on: ubuntu-24.04-arm
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
|
||||
@@ -22,6 +22,8 @@ Pull requests (PRs):
|
||||
Commits:
|
||||
- On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
|
||||
- Do not explicitly set the git author in commits - rely on the default git config
|
||||
- Always use `--no-gpg-sign` when committing
|
||||
- Never `git push` without explicit confirmation from the user
|
||||
|
||||
Resources (read on demand):
|
||||
- [CONTRIBUTING.md](CONTRIBUTING.md)
|
||||
|
||||
@@ -108,20 +108,15 @@ option(LLAMA_BUILD_TESTS "llama: build tests"
|
||||
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||
# Deprecated: use LLAMA_BUILD_UI instead (kept for backward compat)
|
||||
option(LLAMA_BUILD_WEBUI "llama: build the embedded Web UI for server (deprecated: use LLAMA_BUILD_UI)" ON)
|
||||
option(LLAMA_USE_PREBUILT_WEBUI "llama: use prebuilt WebUI from HF Bucket when available (deprecated: use LLAMA_USE_PREBUILT_UI)" ON)
|
||||
|
||||
# New option names
|
||||
option(LLAMA_BUILD_UI "llama: build the embedded Web UI for server" ON)
|
||||
option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON)
|
||||
|
||||
# Backward compat: when old var is set but new one isn't, forward the value
|
||||
if(DEFINED LLAMA_BUILD_WEBUI AND NOT DEFINED LLAMA_BUILD_UI)
|
||||
if(DEFINED LLAMA_BUILD_WEBUI)
|
||||
set(LLAMA_BUILD_UI ${LLAMA_BUILD_WEBUI})
|
||||
message(DEPRECATION "LLAMA_BUILD_WEBUI is deprecated, use LLAMA_BUILD_UI instead")
|
||||
endif()
|
||||
if(DEFINED LLAMA_USE_PREBUILT_WEBUI AND NOT DEFINED LLAMA_USE_PREBUILT_UI)
|
||||
if(DEFINED LLAMA_USE_PREBUILT_WEBUI)
|
||||
set(LLAMA_USE_PREBUILT_UI ${LLAMA_USE_PREBUILT_WEBUI})
|
||||
message(DEPRECATION "LLAMA_USE_PREBUILT_WEBUI is deprecated, use LLAMA_USE_PREBUILT_UI instead")
|
||||
endif()
|
||||
@@ -286,18 +281,6 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)
|
||||
|
||||
install(
|
||||
FILES convert_hf_to_gguf.py
|
||||
PERMISSIONS
|
||||
OWNER_READ
|
||||
OWNER_WRITE
|
||||
OWNER_EXECUTE
|
||||
GROUP_READ
|
||||
GROUP_EXECUTE
|
||||
WORLD_READ
|
||||
WORLD_EXECUTE
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
|
||||
configure_file(cmake/llama.pc.in
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
|
||||
@ONLY)
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
/common/fit.* @JohannesGaessler
|
||||
/common/jinja/ @CISC
|
||||
/common/ngram-map.* @srogmann
|
||||
/conversion/ @CISC
|
||||
/convert_*.py @CISC
|
||||
/docs/backend/snapdragon/ @ggml-org/ggml-hexagon
|
||||
/examples/batched.swift/ @ggerganov
|
||||
|
||||
@@ -280,7 +280,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
| [Metal](docs/build.md#metal-build) | Apple Silicon |
|
||||
| [BLAS](docs/build.md#blas-build) | All |
|
||||
| [BLIS](docs/backend/BLIS.md) | All |
|
||||
| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
|
||||
| [SYCL](docs/backend/SYCL.md) | Intel GPU |
|
||||
| [OpenVINO [In Progress]](docs/backend/OPENVINO.md) | Intel CPUs, GPUs, and NPUs |
|
||||
| [MUSA](docs/build.md#musa) | Moore Threads GPU |
|
||||
| [CUDA](docs/build.md#cuda) | Nvidia GPU |
|
||||
|
||||
@@ -117,6 +117,12 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||
# if on Mac, disable METAL
|
||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
|
||||
|
||||
MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION="/usr/local/lib/cmake/vulkan"
|
||||
MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION="${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers/SPIRV-HeadersConfig.cmake"
|
||||
if [[ -f "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" || -h "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" ]]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DSPIRV-Headers_DIR=${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Build shared libs on Windows
|
||||
|
||||
@@ -7,7 +7,7 @@ set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
|
||||
|
||||
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
|
||||
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
|
||||
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
|
||||
set(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
|
||||
|
||||
find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)
|
||||
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
#include "chat.h"
|
||||
#include "common.h"
|
||||
#include "download.h"
|
||||
#include "hf-cache.h"
|
||||
#include "json-schema-to-grammar.h"
|
||||
#include "log.h"
|
||||
#include "sampling.h"
|
||||
@@ -537,7 +536,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
||||
}
|
||||
if (!seen_args.insert(arg).second) {
|
||||
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
|
||||
const bool skip = (arg == "--spec-type");
|
||||
|
||||
if (!skip) {
|
||||
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
|
||||
}
|
||||
}
|
||||
auto & tmp = arg_to_options[arg];
|
||||
auto opt = *tmp.first;
|
||||
@@ -586,12 +589,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
// parse the first time to get -hf option (used for remote preset)
|
||||
parse_cli_args();
|
||||
|
||||
// TODO: Remove later
|
||||
try {
|
||||
hf_cache::migrate_old_cache_to_hf_cache(params.hf_token, params.offline);
|
||||
} catch (const std::exception & e) {
|
||||
LOG_WRN("HF cache migration failed: %s\n", e.what());
|
||||
}
|
||||
// export_graph_ops loads only metadata
|
||||
const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
|
||||
|
||||
@@ -900,7 +897,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
|
||||
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
||||
}
|
||||
if (!seen_args.insert(arg).second) {
|
||||
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
|
||||
const bool skip = (arg == "--spec-type");
|
||||
|
||||
if (!skip) {
|
||||
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
|
||||
}
|
||||
}
|
||||
auto opt = *arg_to_options[arg];
|
||||
std::string val;
|
||||
@@ -2808,7 +2809,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, int value) {
|
||||
params.embd_normalize = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
|
||||
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}));
|
||||
add_opt(common_arg(
|
||||
{"--embd-output-format"}, "FORMAT",
|
||||
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
|
||||
@@ -4124,6 +4125,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.speculative.ngram_mod.n_match = 24;
|
||||
params.speculative.ngram_mod.n_min = 48;
|
||||
params.speculative.ngram_mod.n_max = 64;
|
||||
|
||||
// TODO: not sure if this is a good config - explore more settings and potentially enable it
|
||||
//params.speculative.types.push_back(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V);
|
||||
//params.speculative.ngram_map_k4v.size_n = 8;
|
||||
//params.speculative.ngram_map_k4v.size_m = 24;
|
||||
//params.speculative.ngram_map_k4v.min_hits = 2;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
|
||||
|
||||
@@ -43,11 +43,33 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
|
||||
const autoparser & autoparser) {
|
||||
// Create the result structure
|
||||
common_chat_params data;
|
||||
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.preserved_tokens = autoparser.preserved_tokens;
|
||||
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
|
||||
data.generation_prompt = common_chat_template_generation_prompt(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.preserved_tokens = autoparser.preserved_tokens;
|
||||
|
||||
auto parser = autoparser.build_parser(inputs);
|
||||
std::string parser_generation_prompt = data.generation_prompt;
|
||||
|
||||
if (inputs.continue_final_message != COMMON_CHAT_CONTINUATION_NONE && !inputs.continue_msg.empty()) {
|
||||
// Build up generation prompt manually
|
||||
const auto & msg = inputs.continue_msg;
|
||||
|
||||
if (!autoparser.reasoning.start.empty()) {
|
||||
data.generation_prompt = data.generation_prompt.substr(0, data.generation_prompt.find(autoparser.reasoning.start));
|
||||
data.generation_prompt += autoparser.reasoning.start + msg.reasoning_content;
|
||||
if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
|
||||
data.generation_prompt += autoparser.reasoning.end;
|
||||
}
|
||||
}
|
||||
|
||||
if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
|
||||
data.generation_prompt += msg.render_content();
|
||||
}
|
||||
|
||||
data.prompt += data.generation_prompt;
|
||||
}
|
||||
|
||||
auto parser = autoparser.build_parser(inputs, parser_generation_prompt);
|
||||
data.parser = parser.save();
|
||||
|
||||
// Build grammar if tools are present
|
||||
@@ -87,7 +109,7 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
|
||||
return data;
|
||||
}
|
||||
|
||||
common_peg_arena autoparser::build_parser(const generation_params & inputs) const {
|
||||
common_peg_arena autoparser::build_parser(const generation_params & inputs, const std::string & generation_prompt) const {
|
||||
if (!analysis_complete) {
|
||||
throw std::invalid_argument("Cannot call build_parser on autoparser without performing analysis first, call analyze_template(...)");
|
||||
}
|
||||
@@ -121,7 +143,7 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs) cons
|
||||
} else {
|
||||
parser = content.build_parser(ctx);
|
||||
}
|
||||
return pure_content ? p.prefix(inputs.generation_prompt, reasoning.start) + parser : p.prefix(inputs.generation_prompt, reasoning.start) << parser;
|
||||
return pure_content ? p.prefix(generation_prompt, reasoning.start) + parser : p.prefix(generation_prompt, reasoning.start) << parser;
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -60,16 +60,21 @@ struct generation_params {
|
||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||
bool stream = true;
|
||||
std::string grammar;
|
||||
bool add_generation_prompt = false;
|
||||
bool enable_thinking = true;
|
||||
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
||||
std::string generation_prompt;
|
||||
bool add_generation_prompt = false;
|
||||
common_chat_continuation continue_final_message = COMMON_CHAT_CONTINUATION_NONE;
|
||||
common_chat_msg continue_msg;
|
||||
bool enable_thinking = true;
|
||||
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
||||
json extra_context;
|
||||
bool add_bos = false;
|
||||
bool add_eos = false;
|
||||
bool is_inference = true;
|
||||
bool add_inference = false;
|
||||
bool mark_input = true; // whether to mark input strings in the jinja context
|
||||
|
||||
bool has_continuation() const {
|
||||
return continue_final_message != COMMON_CHAT_CONTINUATION_NONE && !continue_msg.empty();
|
||||
}
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
@@ -386,7 +391,7 @@ struct autoparser {
|
||||
void analyze_template(const common_chat_template & tmpl);
|
||||
|
||||
// Build the PEG parser for this template
|
||||
common_peg_arena build_parser(const generation_params & inputs) const;
|
||||
common_peg_arena build_parser(const generation_params & inputs, const std::string & generation_prompt) const;
|
||||
|
||||
private:
|
||||
// Collect tokens from entire analysis to preserve
|
||||
|
||||
@@ -358,35 +358,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
|
||||
if (is_potential_container) {
|
||||
value_content = normalize_container_value(value_content);
|
||||
}
|
||||
|
||||
// Try to parse as JSON value (number, bool, null, object, array)
|
||||
try {
|
||||
ordered_json parsed = ordered_json::parse(value_content);
|
||||
if (parsed.is_string()) {
|
||||
// Don't add closing quote yet (added by arg_close) for monotonic streaming
|
||||
std::string escaped = parsed.dump();
|
||||
if (!escaped.empty() && escaped.back() == '"') {
|
||||
escaped.pop_back();
|
||||
}
|
||||
value_to_add = escaped;
|
||||
closing_quote_pending = true;
|
||||
} else {
|
||||
// Non-string values: use raw content to preserve whitespace for monotonicity
|
||||
value_to_add = value_content;
|
||||
}
|
||||
} catch (...) {
|
||||
if (node.is_partial && is_potential_container) {
|
||||
// Partial container: pass through the already-normalized content
|
||||
value_to_add = value_content;
|
||||
} else {
|
||||
// Not valid JSON - treat as string value
|
||||
if (!closing_quote_pending) {
|
||||
value_to_add = "\"";
|
||||
closing_quote_pending = true;
|
||||
}
|
||||
value_to_add += escape_json_string_inner(value_content);
|
||||
}
|
||||
}
|
||||
value_to_add += value_content;
|
||||
}
|
||||
|
||||
args_target() += value_to_add;
|
||||
@@ -813,7 +785,7 @@ common_peg_parser common_chat_peg_builder::prefix(const std::string & s, const s
|
||||
if (delimiter.empty()) {
|
||||
return literal(s);
|
||||
}
|
||||
return literal(s.substr(0, s.rfind(delimiter)));
|
||||
return literal(s.substr(0, s.find(delimiter)));
|
||||
}
|
||||
|
||||
common_peg_parser common_chat_peg_builder::optspace(const std::string & tag) {
|
||||
|
||||
@@ -90,7 +90,7 @@ class common_chat_peg_builder : public common_peg_parser_builder {
|
||||
|
||||
// Use for schema-declared string types - won't be treated as potential JSON container
|
||||
common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
|
||||
common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_VALUE, p)); }
|
||||
common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return tag(TOOL_ARG_VALUE, p); }
|
||||
|
||||
|
||||
// Return a parser that parses the prefix of a string, up to a given delimiter.
|
||||
|
||||
255
common/chat.cpp
255
common/chat.cpp
@@ -70,6 +70,26 @@ static bool has_content_or_tool_calls(const common_chat_msg & msg) {
|
||||
return !msg.content.empty() || !msg.tool_calls.empty();
|
||||
}
|
||||
|
||||
std::string common_chat_msg::render_content(const std::string & delimiter) const {
|
||||
if (!content.empty() && !content_parts.empty()) {
|
||||
throw std::runtime_error("Cannot specify both content and content_parts");
|
||||
}
|
||||
if (!content.empty()) {
|
||||
return content;
|
||||
}
|
||||
|
||||
std::string text;
|
||||
for (const auto & part : content_parts) {
|
||||
if (part.type == "text") {
|
||||
if (!text.empty()) {
|
||||
text += delimiter;
|
||||
}
|
||||
text += part.text;
|
||||
}
|
||||
}
|
||||
return text;
|
||||
}
|
||||
|
||||
json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
|
||||
if (!content.empty() && !content_parts.empty()) {
|
||||
throw std::runtime_error("Cannot specify both content and content_parts");
|
||||
@@ -451,6 +471,22 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
|
||||
return result;
|
||||
}
|
||||
|
||||
common_chat_continuation common_chat_continuation_parse(const nlohmann::ordered_json & value) {
|
||||
if (value.is_boolean() && value.get<bool>()) {
|
||||
return COMMON_CHAT_CONTINUATION_AUTO;
|
||||
}
|
||||
if (value.is_string()) {
|
||||
auto value_str = value.get<std::string>();
|
||||
if (value_str == "reasoning_content") {
|
||||
return COMMON_CHAT_CONTINUATION_REASONING;
|
||||
}
|
||||
if (value_str == "content") {
|
||||
return COMMON_CHAT_CONTINUATION_CONTENT;
|
||||
}
|
||||
}
|
||||
return COMMON_CHAT_CONTINUATION_NONE;
|
||||
}
|
||||
|
||||
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
|
||||
if (use_jinja) {
|
||||
try {
|
||||
@@ -811,6 +847,36 @@ std::string common_chat_template_direct_apply(
|
||||
return common_chat_template_direct_apply_impl(tmpl, inputs, std::nullopt, std::nullopt, std::nullopt);
|
||||
}
|
||||
|
||||
static std::string common_chat_template_generation_prompt_impl(
|
||||
const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs,
|
||||
const std::optional<json> & messages_override = std::nullopt,
|
||||
const std::optional<json> & tools_override = std::nullopt,
|
||||
const std::optional<json> & additional_context = std::nullopt) {
|
||||
|
||||
auto adjusted_messages = messages_override ? *messages_override : inputs.messages;
|
||||
|
||||
autoparser::generation_params params = inputs;
|
||||
params.add_generation_prompt = false;
|
||||
params.continue_final_message = COMMON_CHAT_CONTINUATION_NONE;
|
||||
std::string no_gen_prompt = common_chat_template_direct_apply_impl(tmpl, params, adjusted_messages, tools_override, additional_context);
|
||||
params.add_generation_prompt = true;
|
||||
std::string gen_prompt = common_chat_template_direct_apply_impl(tmpl, params, adjusted_messages, tools_override, additional_context);
|
||||
|
||||
size_t prefix_len = 0;
|
||||
size_t min_size = std::min(no_gen_prompt.size(), gen_prompt.size());
|
||||
while (prefix_len < min_size && no_gen_prompt[prefix_len] == gen_prompt[prefix_len]) {
|
||||
prefix_len++;
|
||||
}
|
||||
return gen_prompt.substr(prefix_len);
|
||||
}
|
||||
|
||||
std::string common_chat_template_generation_prompt(
|
||||
const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs) {
|
||||
return common_chat_template_generation_prompt_impl(tmpl, inputs, std::nullopt, std::nullopt, std::nullopt);
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_ministral_3(const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs) {
|
||||
common_chat_params data;
|
||||
@@ -863,6 +929,7 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
|
||||
data.thinking_start_tag = "[THINK]";
|
||||
data.thinking_end_tag = "[/THINK]";
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs, /* messages_override = */ adjusted_messages);
|
||||
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override = */ adjusted_messages);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.preserved_tokens = {
|
||||
"[THINK]",
|
||||
@@ -871,8 +938,19 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
|
||||
"[ARGS]",
|
||||
};
|
||||
|
||||
if (inputs.has_continuation()) {
|
||||
const auto & msg = inputs.continue_msg;
|
||||
|
||||
data.generation_prompt = "[THINK]" + msg.reasoning_content;
|
||||
if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
|
||||
data.generation_prompt += "[/THINK]" + msg.render_content();
|
||||
}
|
||||
|
||||
data.prompt += data.generation_prompt;
|
||||
}
|
||||
|
||||
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
|
||||
auto generation_prompt = p.prefix(inputs.generation_prompt, "[THINK]");
|
||||
auto generation_prompt = p.eps();
|
||||
auto reasoning =
|
||||
extract_reasoning ? p.optional("[THINK]" + p.reasoning(p.until("[/THINK]")) + "[/THINK]") : p.eps();
|
||||
|
||||
@@ -963,6 +1041,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
}
|
||||
|
||||
data.prompt = prompt;
|
||||
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
|
||||
@@ -972,6 +1051,18 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
"<|channel|>", "<|constrain|>", "<|message|>", "<|start|>", "<|end|>",
|
||||
};
|
||||
|
||||
// Adjust prompt for continuation
|
||||
if (inputs.has_continuation()) {
|
||||
const auto & msg = inputs.continue_msg;
|
||||
|
||||
data.generation_prompt = "<|start|>assistant<|channel|>analysis<|message|>" + msg.reasoning_content;
|
||||
if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
|
||||
data.generation_prompt += "<|end|><|start|>assistant<|channel|>final<|message|>" + msg.render_content();
|
||||
}
|
||||
|
||||
data.prompt += data.generation_prompt;
|
||||
}
|
||||
|
||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
|
||||
auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
|
||||
@@ -1080,12 +1171,14 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
|
||||
|
||||
if (inputs.add_generation_prompt && string_ends_with(data.prompt, "<turn|>\n")) {
|
||||
// This may happen if the model generates content + tool_call, the
|
||||
// template does not add the model's next turn and confuses the model
|
||||
// from emitting its proper reasoning token sequence.
|
||||
data.prompt += "<|turn>model\n";
|
||||
data.generation_prompt = "<|turn>model\n";
|
||||
data.prompt += data.generation_prompt;
|
||||
}
|
||||
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_GEMMA4;
|
||||
@@ -1101,13 +1194,25 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
|
||||
"<|turn>",
|
||||
};
|
||||
|
||||
if (inputs.has_continuation()) {
|
||||
const auto & msg = inputs.continue_msg;
|
||||
|
||||
data.generation_prompt = string_ends_with(data.prompt, "<turn|>\n") ? "<|turn>model\n" : "";
|
||||
data.generation_prompt += "<|channel>thought\n" + msg.reasoning_content;
|
||||
if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
|
||||
data.generation_prompt += "<channel|>" + msg.render_content();
|
||||
}
|
||||
|
||||
data.prompt += data.generation_prompt;
|
||||
}
|
||||
|
||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
|
||||
auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
|
||||
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||
|
||||
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
|
||||
auto start = p.rule("start", p.prefix(inputs.generation_prompt, "<|channel>"));
|
||||
auto start = p.rule("start", p.optional(p.literal("<|turn>model\n")));
|
||||
|
||||
if (extract_reasoning) {
|
||||
p.rule("thought", p.literal("<|channel>thought") + p.space() + p.reasoning(p.until("<channel|>")) + p.literal("<channel|>"));
|
||||
@@ -1224,15 +1329,22 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
|
||||
const autoparser::generation_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.preserved_tokens = {
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.preserved_tokens = {
|
||||
">>>all",
|
||||
};
|
||||
|
||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
|
||||
|
||||
if (inputs.has_continuation()) {
|
||||
const auto & msg = inputs.continue_msg;
|
||||
data.generation_prompt = "<|start_header_id|>assistant<|end_header_id|>\n\n>>>all\n" + msg.render_content();
|
||||
data.prompt += data.generation_prompt;
|
||||
}
|
||||
|
||||
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
|
||||
// Functionary v3.2 format:
|
||||
// - Normal content: >>>all\n{content}
|
||||
@@ -1244,7 +1356,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
|
||||
// When no tools, content goes until end
|
||||
auto content_until_tool = p.literal("all\n") + p.content(p.until(">>>"));
|
||||
auto content_until_end = p.literal("all\n") + p.content(p.rest());
|
||||
auto generation_prompt = p.literal(inputs.generation_prompt);
|
||||
auto generation_prompt = p.literal("<|start_header_id|>assistant<|end_header_id|>\n\n>>>");
|
||||
|
||||
// If no tools or tool_choice is NONE, just parse content
|
||||
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
@@ -1318,9 +1430,10 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
|
||||
const autoparser::generation_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
data.preserved_tokens = {
|
||||
"<|tool_calls_section_begin|>",
|
||||
"<|tool_calls_section_end|>",
|
||||
@@ -1343,10 +1456,22 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
|
||||
|
||||
const std::string THINK_START = "<think>";
|
||||
const std::string THINK_END = "</think>";
|
||||
const std::string GEN_PROMPT = "<|im_assistant|>assistant<|im_middle|>";
|
||||
|
||||
data.thinking_start_tag = THINK_START;
|
||||
data.thinking_end_tag = THINK_END;
|
||||
|
||||
if (inputs.has_continuation()) {
|
||||
const auto & msg = inputs.continue_msg;
|
||||
|
||||
data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content;
|
||||
if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
|
||||
data.generation_prompt += THINK_END + msg.render_content();
|
||||
}
|
||||
|
||||
data.prompt += data.generation_prompt;
|
||||
}
|
||||
|
||||
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
|
||||
// Kimi K2 Thinking format:
|
||||
// - Reasoning: <think>{reasoning}</think>
|
||||
@@ -1366,7 +1491,7 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
|
||||
auto reasoning = extract_reasoning ? p.optional(THINK_START + p.reasoning(
|
||||
p.until_one_of({ THINK_END, "<|tool_calls_section_begin|>", "<|tool_call_begin|>" })) +
|
||||
p.optional(p.literal(THINK_END))) : p.eps();
|
||||
auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
|
||||
auto generation_prompt = p.literal(GEN_PROMPT);
|
||||
|
||||
|
||||
// Content only parser (no tools)
|
||||
@@ -1442,6 +1567,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
data.preserved_tokens = {
|
||||
@@ -1461,12 +1587,24 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
|
||||
const std::string TOOL_CALL_END = "<|tool_call_end|>";
|
||||
const std::string THINK_START = "<think>";
|
||||
const std::string THINK_END = "</think>";
|
||||
const std::string GEN_PROMPT = "<|im_start|>assistant\n";
|
||||
|
||||
data.thinking_start_tag = THINK_START;
|
||||
data.thinking_end_tag = THINK_END;
|
||||
|
||||
if (inputs.has_continuation()) {
|
||||
const auto & msg = inputs.continue_msg;
|
||||
|
||||
data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content;
|
||||
if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
|
||||
data.generation_prompt += THINK_END + msg.render_content();
|
||||
}
|
||||
|
||||
data.prompt += data.generation_prompt;
|
||||
}
|
||||
|
||||
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
|
||||
auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
|
||||
auto generation_prompt = p.literal(GEN_PROMPT);
|
||||
auto end = p.end();
|
||||
|
||||
auto reasoning = p.eps();
|
||||
@@ -1521,6 +1659,7 @@ static common_chat_params common_chat_params_init_lfm2_5(const common_chat_templ
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
data.preserved_tokens = {
|
||||
@@ -1536,12 +1675,24 @@ static common_chat_params common_chat_params_init_lfm2_5(const common_chat_templ
|
||||
|
||||
const std::string THINK_START = "<think>";
|
||||
const std::string THINK_END = "</think>";
|
||||
const std::string GEN_PROMPT = "<|im_start|>assistant\n";
|
||||
|
||||
data.thinking_start_tag = THINK_START;
|
||||
data.thinking_end_tag = THINK_END;
|
||||
|
||||
if (inputs.has_continuation()) {
|
||||
const auto & msg = inputs.continue_msg;
|
||||
|
||||
data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content;
|
||||
if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
|
||||
data.generation_prompt += THINK_END + msg.render_content();
|
||||
}
|
||||
|
||||
data.prompt += data.generation_prompt;
|
||||
}
|
||||
|
||||
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
|
||||
auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
|
||||
auto generation_prompt = p.literal(GEN_PROMPT);
|
||||
auto end = p.end();
|
||||
|
||||
auto reasoning = p.eps();
|
||||
@@ -1592,6 +1743,7 @@ static common_chat_params common_chat_params_init_gigachat_v3(
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = false;
|
||||
data.preserved_tokens = {
|
||||
@@ -1599,6 +1751,12 @@ static common_chat_params common_chat_params_init_gigachat_v3(
|
||||
"<|role_sep|>\n",
|
||||
};
|
||||
|
||||
if (inputs.has_continuation()) {
|
||||
const auto & msg = inputs.continue_msg;
|
||||
data.generation_prompt = "assistant<|role_sep|>\n" + msg.render_content();
|
||||
data.prompt += data.generation_prompt;
|
||||
}
|
||||
|
||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
|
||||
const auto *tool_call_start_prefix = "<|message_sep|>\n\nfunction call<|role_sep|>\n";
|
||||
@@ -1634,7 +1792,7 @@ static common_chat_params common_chat_params_init_gigachat_v3(
|
||||
ret = p.content(p.rest());
|
||||
}
|
||||
|
||||
return p.literal(inputs.generation_prompt) + ret;
|
||||
return p.literal("assistant<|role_sep|>\n") + ret;
|
||||
});
|
||||
|
||||
data.parser = parser.save();
|
||||
@@ -1662,12 +1820,13 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha
|
||||
const autoparser::generation_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
data.thinking_start_tag = "<think>";
|
||||
data.thinking_end_tag = "</think>";
|
||||
data.preserved_tokens = {
|
||||
data.preserved_tokens = {
|
||||
"|DSML|",
|
||||
"<think>",
|
||||
"</think>",
|
||||
@@ -1687,9 +1846,21 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha
|
||||
const std::string INVOKE_END = "</" + DSML + "invoke>";
|
||||
const std::string PARAM_START = "<" + DSML + "parameter";
|
||||
const std::string PARAM_END = "</" + DSML + "parameter>";
|
||||
const std::string GEN_PROMPT = "<|Assistant|>";
|
||||
|
||||
if (inputs.has_continuation()) {
|
||||
const auto & msg = inputs.continue_msg;
|
||||
|
||||
data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content;
|
||||
if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
|
||||
data.generation_prompt += THINK_END + msg.render_content();
|
||||
}
|
||||
|
||||
data.prompt += data.generation_prompt;
|
||||
}
|
||||
|
||||
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
|
||||
auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
|
||||
auto generation_prompt = p.literal(GEN_PROMPT);
|
||||
auto end = p.end();
|
||||
|
||||
auto reasoning = p.eps();
|
||||
@@ -2116,21 +2287,6 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
static std::string common_chat_templates_generation_prompt(const common_chat_template & tmpl, const autoparser::generation_params & inputs) {
|
||||
autoparser::generation_params params = inputs;
|
||||
params.add_generation_prompt = false;
|
||||
std::string no_gen_prompt = common_chat_template_direct_apply_impl(tmpl, params);
|
||||
params.add_generation_prompt = true;
|
||||
std::string gen_prompt = common_chat_template_direct_apply_impl(tmpl, params);
|
||||
|
||||
size_t prefix_len = 0;
|
||||
size_t min_size = std::min(no_gen_prompt.size(), gen_prompt.size());
|
||||
while (prefix_len < min_size && no_gen_prompt[prefix_len] == gen_prompt[prefix_len]) {
|
||||
prefix_len++;
|
||||
}
|
||||
return gen_prompt.substr(prefix_len);
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_templates_apply_jinja(const struct common_chat_templates * tmpls,
|
||||
const struct common_chat_templates_inputs & inputs) {
|
||||
autoparser::generation_params params;
|
||||
@@ -2149,6 +2305,27 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||
params.add_bos = tmpls->add_bos;
|
||||
params.add_eos = tmpls->add_eos;
|
||||
|
||||
params.continue_final_message = inputs.continue_final_message;
|
||||
if (params.continue_final_message != COMMON_CHAT_CONTINUATION_NONE) {
|
||||
params.add_generation_prompt = false;
|
||||
|
||||
if (!inputs.messages.empty()) {
|
||||
// Render messages[:-1] and store continuation message separately
|
||||
params.continue_msg = inputs.messages.back();
|
||||
params.messages.erase(params.messages.size() - 1);
|
||||
}
|
||||
|
||||
if (params.continue_final_message == COMMON_CHAT_CONTINUATION_AUTO && !inputs.messages.empty()) {
|
||||
// Resolve based on message content
|
||||
params.continue_final_message = COMMON_CHAT_CONTINUATION_CONTENT;
|
||||
if (!params.continue_msg.reasoning_content.empty() &&
|
||||
params.continue_msg.content.empty() &&
|
||||
params.continue_msg.content_parts.empty()) {
|
||||
params.continue_final_message = COMMON_CHAT_CONTINUATION_REASONING;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (src.find("<|channel|>") == std::string::npos) {
|
||||
// map developer to system for all models except for GPT-OSS
|
||||
workaround::map_developer_role_to_system(params.messages);
|
||||
@@ -2169,8 +2346,6 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||
workaround::func_args_not_string(params.messages);
|
||||
}
|
||||
|
||||
params.generation_prompt = common_chat_templates_generation_prompt(tmpl, params);
|
||||
|
||||
params.extra_context = common_chat_extra_context();
|
||||
for (auto el : inputs.chat_template_kwargs) {
|
||||
params.extra_context[el.first] = json::parse(el.second);
|
||||
@@ -2200,17 +2375,16 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||
auto params_copy = params;
|
||||
params_copy.reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, params_copy);
|
||||
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, params);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.generation_prompt = params.generation_prompt;
|
||||
auto parser = build_chat_peg_parser([¶ms](common_chat_peg_builder &p) {
|
||||
return p.prefix(params.generation_prompt) << p.content(p.rest());
|
||||
auto parser = build_chat_peg_parser([&data](common_chat_peg_builder &p) {
|
||||
return p.literal(data.generation_prompt) << p.content(p.rest());
|
||||
});
|
||||
data.parser = parser.save();
|
||||
return data;
|
||||
}
|
||||
|
||||
if (auto result = common_chat_try_specialized_template(tmpl, src, params)) {
|
||||
result->generation_prompt = params.generation_prompt;
|
||||
return *result;
|
||||
}
|
||||
|
||||
@@ -2224,7 +2398,6 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||
auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start);
|
||||
auto_params.thinking_end_tag = trim_whitespace(autoparser.reasoning.end);
|
||||
}
|
||||
auto_params.generation_prompt = params.generation_prompt;
|
||||
common_peg_arena arena;
|
||||
arena.load(auto_params.parser);
|
||||
LOG_DBG("%s: generated parser:\n%s\n\nparser generation prompt: %s\n", __func__, arena.dump(arena.root()).c_str(), auto_params.generation_prompt.c_str());
|
||||
|
||||
@@ -89,6 +89,8 @@ struct common_chat_msg {
|
||||
|
||||
nlohmann::ordered_json to_json_oaicompat(bool concat_typed_text = false) const;
|
||||
|
||||
std::string render_content(const std::string & delimiter = "\n\n") const;
|
||||
|
||||
bool empty() const {
|
||||
return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() &&
|
||||
tool_name.empty() && tool_call_id.empty();
|
||||
@@ -164,12 +166,22 @@ enum common_chat_format {
|
||||
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
|
||||
};
|
||||
|
||||
|
||||
// Continuation method provided via `continue_final_message`
|
||||
enum common_chat_continuation {
|
||||
COMMON_CHAT_CONTINUATION_NONE,
|
||||
COMMON_CHAT_CONTINUATION_AUTO,
|
||||
COMMON_CHAT_CONTINUATION_REASONING,
|
||||
COMMON_CHAT_CONTINUATION_CONTENT,
|
||||
};
|
||||
|
||||
struct common_chat_templates_inputs {
|
||||
std::vector<common_chat_msg> messages;
|
||||
std::string grammar;
|
||||
std::string json_schema;
|
||||
bool add_generation_prompt = true;
|
||||
bool use_jinja = true;
|
||||
bool add_generation_prompt = true;
|
||||
common_chat_continuation continue_final_message = COMMON_CHAT_CONTINUATION_NONE;
|
||||
bool use_jinja = true;
|
||||
// Parameters below only supported when use_jinja is true
|
||||
std::vector<common_chat_tool> tools;
|
||||
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
@@ -207,6 +219,7 @@ struct common_chat_parser_params {
|
||||
bool reasoning_in_content = false;
|
||||
std::string generation_prompt;
|
||||
bool parse_tool_calls = true;
|
||||
bool echo = false; // Include assistant prefilled msg in output
|
||||
bool debug = false; // Enable debug output for PEG parser
|
||||
common_peg_arena parser = {};
|
||||
common_chat_parser_params() = default;
|
||||
@@ -267,6 +280,8 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::or
|
||||
|
||||
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
|
||||
|
||||
common_chat_continuation common_chat_continuation_parse(const nlohmann::ordered_json & value);
|
||||
|
||||
// DEPRECATED: only used in tests
|
||||
nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
|
||||
|
||||
@@ -279,6 +294,10 @@ std::string common_chat_template_direct_apply(
|
||||
const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs);
|
||||
|
||||
std::string common_chat_template_generation_prompt(
|
||||
const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs);
|
||||
|
||||
std::optional<common_chat_params> common_chat_try_specialized_template(
|
||||
const common_chat_template & tmpl,
|
||||
const std::string & src,
|
||||
|
||||
@@ -373,7 +373,7 @@ void common_init() {
|
||||
llama_log_set(common_log_default_callback, NULL);
|
||||
}
|
||||
|
||||
void common_params_print_info(const common_params & params) {
|
||||
void common_params_print_info(const common_params & params, bool print_devices) {
|
||||
#ifdef NDEBUG
|
||||
const char * build_type = "";
|
||||
#else
|
||||
@@ -382,12 +382,16 @@ void common_params_print_info(const common_params & params) {
|
||||
LOG_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
|
||||
|
||||
LOG_INF("log_info: verbosity = %d (adjust with the `-lv N` CLI arg)\n", common_log_get_verbosity_thold());
|
||||
LOG_INF("device_info:\n");
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
auto * dev = ggml_backend_dev_get(i);
|
||||
size_t free, total;
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
LOG_INF(" - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
|
||||
|
||||
// device enumeration creates a primary context on CUDA backends, skip it when the caller does not own any device
|
||||
if (print_devices) {
|
||||
LOG_INF("device_info:\n");
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
auto * dev = ggml_backend_dev_get(i);
|
||||
size_t free, total;
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
LOG_INF(" - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
|
||||
}
|
||||
}
|
||||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
||||
}
|
||||
@@ -1156,7 +1160,7 @@ struct common_init_result::impl {
|
||||
std::vector<llama_sampler_seq_config> samplers_seq_config;
|
||||
};
|
||||
|
||||
common_init_result::common_init_result(common_params & params) :
|
||||
common_init_result::common_init_result(common_params & params, bool model_only) :
|
||||
pimpl(new impl{}) {
|
||||
auto mparams = common_model_params_to_llama(params);
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
@@ -1179,6 +1183,10 @@ common_init_result::common_init_result(common_params & params) :
|
||||
|
||||
pimpl->model.reset(model);
|
||||
|
||||
if (model_only) {
|
||||
return;
|
||||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
// load and optionally apply lora adapters
|
||||
@@ -1248,29 +1256,6 @@ common_init_result::common_init_result(common_params & params) :
|
||||
cparams.n_samplers = pimpl->samplers_seq_config.size();
|
||||
}
|
||||
|
||||
// [TAG_RS_STATE_ROLLBACK_SUPPORT]
|
||||
// TODO: ngram speculative methods require checkpointing in addition to partial RS rollback
|
||||
// currently this is not supported. so we disable the partial rollback
|
||||
if (cparams.n_rs_seq > 0 && (llama_model_is_recurrent(model) || llama_model_is_hybrid(model))) {
|
||||
auto & types = params.speculative.types;
|
||||
|
||||
for (int i = 0; i < (int) types.size(); i++) {
|
||||
if (types[i] == COMMON_SPECULATIVE_TYPE_NONE) {
|
||||
continue;
|
||||
}
|
||||
if (types[i] == COMMON_SPECULATIVE_TYPE_DRAFT_MTP) {
|
||||
continue;
|
||||
}
|
||||
|
||||
cparams.n_rs_seq = 0;
|
||||
|
||||
LOG_WRN("%s: recurrent state rollback is not compatible with '%s' - disabling rollback support\n", __func__,
|
||||
common_speculative_type_to_str(types[i]).c_str());
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
llama_context * lctx = llama_init_from_model(model, cparams);
|
||||
if (lctx == NULL) {
|
||||
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
|
||||
@@ -1305,8 +1290,8 @@ std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
|
||||
return pimpl->lora;
|
||||
}
|
||||
|
||||
common_init_result_ptr common_init_from_params(common_params & params) {
|
||||
common_init_result_ptr res(new common_init_result(params));
|
||||
common_init_result_ptr common_init_from_params(common_params & params, bool model_only) {
|
||||
common_init_result_ptr res(new common_init_result(params, model_only));
|
||||
|
||||
llama_model * model = res->model();
|
||||
if (model == NULL) {
|
||||
@@ -1314,6 +1299,10 @@ common_init_result_ptr common_init_from_params(common_params & params) {
|
||||
return res;
|
||||
}
|
||||
|
||||
if (model_only) {
|
||||
return res;
|
||||
}
|
||||
|
||||
llama_context * lctx = res->context();
|
||||
if (lctx == NULL) {
|
||||
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
|
||||
|
||||
@@ -299,11 +299,11 @@ struct common_params_model {
|
||||
|
||||
// draft-model-based speculative decoding parameters
|
||||
struct common_params_speculative_draft {
|
||||
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
|
||||
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
|
||||
int32_t n_max = 3; // maximum number of tokens to draft during speculative decoding
|
||||
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
|
||||
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
float p_min = 0.75f; // minimum speculative decoding probability (greedy) // TODO: change default to 0.0f
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
float p_min = 0.0f; // minimum speculative decoding probability (greedy)
|
||||
|
||||
common_params_model mparams;
|
||||
|
||||
@@ -617,8 +617,6 @@ struct common_params {
|
||||
// UI configs
|
||||
#ifdef LLAMA_UI_DEFAULT_ENABLED
|
||||
bool ui = LLAMA_UI_DEFAULT_ENABLED != 0;
|
||||
#elif defined(LLAMA_WEBUI_DEFAULT_ENABLED)
|
||||
bool ui = LLAMA_WEBUI_DEFAULT_ENABLED != 0;
|
||||
#else
|
||||
bool ui = true; // default to enabled when not set
|
||||
#endif
|
||||
@@ -708,7 +706,7 @@ struct common_params {
|
||||
// initializes the logging system and prints info about the build
|
||||
void common_init();
|
||||
|
||||
void common_params_print_info(const common_params & params);
|
||||
void common_params_print_info(const common_params & params, bool print_devices = true);
|
||||
std::string common_params_get_system_info(const common_params & params);
|
||||
|
||||
bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||
@@ -859,7 +857,7 @@ struct common_sampler;
|
||||
|
||||
// note: defines the model, context, samplers, ets. lifetimes
|
||||
struct common_init_result {
|
||||
common_init_result(common_params & params);
|
||||
common_init_result(common_params & params, bool model_only = false);
|
||||
~common_init_result();
|
||||
|
||||
llama_model * model();
|
||||
@@ -877,7 +875,7 @@ private:
|
||||
|
||||
using common_init_result_ptr = std::unique_ptr<common_init_result>;
|
||||
|
||||
common_init_result_ptr common_init_from_params(common_params & params);
|
||||
common_init_result_ptr common_init_from_params(common_params & params, bool model_only = false);
|
||||
|
||||
struct llama_model_params common_model_params_to_llama ( common_params & params);
|
||||
struct llama_context_params common_context_params_to_llama(const common_params & params);
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <atomic>
|
||||
#include <regex> // migration only
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <stdexcept>
|
||||
@@ -336,15 +335,9 @@ hf_files get_repo_files(const std::string & repo_id,
|
||||
if (item["lfs"].contains("oid") && item["lfs"]["oid"].is_string()) {
|
||||
file.oid = item["lfs"]["oid"].get<std::string>();
|
||||
}
|
||||
if (item["lfs"].contains("size") && item["lfs"]["size"].is_number()) {
|
||||
file.size = item["lfs"]["size"].get<size_t>();
|
||||
}
|
||||
} else if (item.contains("oid") && item["oid"].is_string()) {
|
||||
file.oid = item["oid"].get<std::string>();
|
||||
}
|
||||
if (file.size == 0 && item.contains("size") && item["size"].is_number()) {
|
||||
file.size = item["size"].get<size_t>();
|
||||
}
|
||||
|
||||
if (!file.oid.empty() && !is_valid_oid(file.oid)) {
|
||||
LOG_WRN("%s: skip invalid oid: %s\n", __func__, file.oid.c_str());
|
||||
@@ -502,271 +495,4 @@ std::string finalize_file(const hf_file & file) {
|
||||
return file.final_path;
|
||||
}
|
||||
|
||||
// delete everything after this line, one day
|
||||
|
||||
// copied from download.cpp without the tag part
|
||||
struct gguf_split_info {
|
||||
std::string prefix; // tag included
|
||||
int index;
|
||||
int count;
|
||||
};
|
||||
|
||||
static gguf_split_info get_gguf_split_info(const std::string & path) {
|
||||
static const std::regex re_split("^(.+)-([0-9]{5})-of-([0-9]{5})$", std::regex::icase);
|
||||
std::smatch m;
|
||||
|
||||
std::string prefix = path;
|
||||
if (!string_remove_suffix(prefix, ".gguf")) {
|
||||
return {};
|
||||
}
|
||||
|
||||
int index = 1;
|
||||
int count = 1;
|
||||
|
||||
if (std::regex_match(prefix, m, re_split)) {
|
||||
index = std::stoi(m[2].str());
|
||||
count = std::stoi(m[3].str());
|
||||
prefix = m[1].str();
|
||||
}
|
||||
|
||||
return {std::move(prefix), index, count};
|
||||
}
|
||||
|
||||
static std::pair<std::string, std::string> parse_manifest_name(std::string & filename) {
|
||||
static const std::regex re(R"(^manifest=([^=]+)=([^=]+)=.*\.json$)");
|
||||
std::smatch match;
|
||||
if (std::regex_match(filename, match, re)) {
|
||||
return {match[1].str(), match[2].str()};
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
static std::string make_old_cache_filename(const std::string & owner,
|
||||
const std::string & repo,
|
||||
const std::string & filename) {
|
||||
auto result = owner + "_" + repo + "_" + filename;
|
||||
string_replace_all(result, "/", "_");
|
||||
return result;
|
||||
}
|
||||
|
||||
struct migrate_file {
|
||||
std::string path;
|
||||
std::string sha256;
|
||||
size_t size;
|
||||
fs::path old_path;
|
||||
fs::path etag_path;
|
||||
const hf_file * file;
|
||||
};
|
||||
|
||||
using migrate_files = std::vector<migrate_file>;
|
||||
|
||||
static bool collect_file(const fs::path & old_cache,
|
||||
const std::string & owner,
|
||||
const std::string & repo,
|
||||
const std::string & path,
|
||||
const std::string & sha256,
|
||||
const hf_files & files,
|
||||
migrate_files & to_migrate) {
|
||||
|
||||
const hf_file * file = nullptr;
|
||||
|
||||
for (const auto & f : files) {
|
||||
if (f.path == path) {
|
||||
file = &f;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
std::string old_filename = make_old_cache_filename(owner, repo, path);
|
||||
fs::path old_path = old_cache / old_filename;
|
||||
fs::path etag_path = old_path.string() + ".etag";
|
||||
|
||||
if (!fs::exists(old_path)) {
|
||||
if (file && fs::exists(file->final_path)) {
|
||||
return true;
|
||||
}
|
||||
LOG_WRN("%s: %s not found in old cache or HF cache\n", __func__, old_filename.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!file) {
|
||||
LOG_WRN("%s: %s not found in current repo\n", __func__, old_filename.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!sha256.empty() && !file->oid.empty() && sha256 != file->oid) {
|
||||
LOG_WRN("%s: %s is not up to date (sha256 mismatch)\n", __func__, old_filename.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (file->size > 0) {
|
||||
size_t size = fs::file_size(old_path);
|
||||
if (size != file->size) {
|
||||
LOG_WRN("%s: %s has wrong size %zu (expected %zu)\n", __func__, old_filename.c_str(), size, file->size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
to_migrate.push_back({path, sha256, file->size, old_path, etag_path, file});
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool collect_files(const fs::path & old_cache,
|
||||
const std::string & owner,
|
||||
const std::string & repo,
|
||||
const nl::json & node,
|
||||
const hf_files & files,
|
||||
migrate_files & to_migrate) {
|
||||
|
||||
if (!node.contains("rfilename") ||
|
||||
!node.contains("lfs") ||
|
||||
!node["lfs"].contains("sha256")) {
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string path = node["rfilename"];
|
||||
std::string sha256 = node["lfs"]["sha256"];
|
||||
|
||||
auto split = get_gguf_split_info(path);
|
||||
|
||||
if (split.count <= 1) {
|
||||
return collect_file(old_cache, owner, repo, path, sha256, files, to_migrate);
|
||||
}
|
||||
|
||||
std::vector<std::pair<std::string, std::string>> splits;
|
||||
|
||||
for (const auto & f : files) {
|
||||
auto split_f = get_gguf_split_info(f.path);
|
||||
if (split_f.count == split.count && split_f.prefix == split.prefix) {
|
||||
// sadly the manifest only provides the sha256 of the first file (index == 1)
|
||||
// the rest will be verified using the size...
|
||||
std::string f_sha256 = (split_f.index == 1) ? sha256 : "";
|
||||
splits.emplace_back(f.path, f_sha256);
|
||||
}
|
||||
}
|
||||
|
||||
if ((int)splits.size() != split.count) {
|
||||
LOG_WRN("%s: expected %d split files but found %d in repo\n", __func__, split.count, (int)splits.size());
|
||||
return false;
|
||||
}
|
||||
|
||||
for (const auto & [f_path, f_sha256] : splits) {
|
||||
if (!collect_file(old_cache, owner, repo, f_path, f_sha256, files, to_migrate)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool migrate_file(const migrate_file & file) {
|
||||
std::error_code ec;
|
||||
|
||||
fs::path new_path(file.file->local_path);
|
||||
fs::create_directories(new_path.parent_path(), ec);
|
||||
|
||||
if (!fs::exists(new_path, ec)) {
|
||||
fs::rename(file.old_path, new_path, ec);
|
||||
if (ec) {
|
||||
fs::copy_file(file.old_path, new_path, ec);
|
||||
if (ec) {
|
||||
LOG_ERR("%s: failed to move/copy %s: %s\n", __func__, file.old_path.string().c_str(), ec.message().c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
fs::remove(file.old_path, ec);
|
||||
}
|
||||
fs::remove(file.etag_path, ec);
|
||||
|
||||
std::string filename = finalize_file(*file.file);
|
||||
LOG_INF("%s: migrated %s -> %s\n", __func__, file.old_path.filename().string().c_str(), filename.c_str());
|
||||
return true;
|
||||
}
|
||||
|
||||
void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) {
|
||||
fs::path old_cache = fs_get_cache_directory();
|
||||
if (!fs::exists(old_cache)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (offline) {
|
||||
LOG_WRN("%s: skipping migration in offline mode (will run when online)\n", __func__);
|
||||
return; // -hf is not going to work
|
||||
}
|
||||
|
||||
bool warned = false;
|
||||
|
||||
for (const auto & entry : fs::directory_iterator(old_cache)) {
|
||||
if (!entry.is_regular_file()) {
|
||||
continue;
|
||||
}
|
||||
auto filename = entry.path().filename().string();
|
||||
auto [owner, repo] = parse_manifest_name(filename);
|
||||
|
||||
if (owner.empty() || repo.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!warned) {
|
||||
warned = true;
|
||||
LOG_WRN("================================================================================\n"
|
||||
"WARNING: Migrating cache to HuggingFace cache directory\n"
|
||||
" Old cache: %s\n"
|
||||
" New cache: %s\n"
|
||||
"This one-time migration moves models previously downloaded with -hf\n"
|
||||
"from the legacy llama.cpp cache to the standard HuggingFace cache.\n"
|
||||
"Models downloaded with --model-url are not affected.\n"
|
||||
"================================================================================\n",
|
||||
old_cache.string().c_str(), get_cache_directory().string().c_str());
|
||||
}
|
||||
|
||||
auto repo_id = owner + "/" + repo;
|
||||
auto files = get_repo_files(repo_id, token);
|
||||
|
||||
if (files.empty()) {
|
||||
LOG_WRN("%s: could not get repo files for %s, skipping\n", __func__, repo_id.c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
migrate_files to_migrate;
|
||||
bool ok = true;
|
||||
|
||||
try {
|
||||
std::ifstream manifest(entry.path());
|
||||
auto json = nl::json::parse(manifest);
|
||||
for (const char * key : {"ggufFile", "mmprojFile"}) {
|
||||
if (json.contains(key)) {
|
||||
if (!collect_files(old_cache, owner, repo, json[key], files, to_migrate)) {
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (const std::exception & e) {
|
||||
LOG_WRN("%s: failed to parse manifest %s: %s\n", __func__, filename.c_str(), e.what());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
LOG_WRN("%s: migration skipped: one or more files failed validation\n", __func__);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (const auto & file : to_migrate) {
|
||||
if (!migrate_file(file)) {
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
LOG_WRN("%s: migration failed: could not migrate all files\n", __func__);
|
||||
continue;
|
||||
}
|
||||
|
||||
LOG_INF("%s: migration complete, deleting manifest: %s\n", __func__, entry.path().string().c_str());
|
||||
fs::remove(entry.path());
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace hf_cache
|
||||
|
||||
@@ -14,7 +14,6 @@ struct hf_file {
|
||||
std::string final_path;
|
||||
std::string oid;
|
||||
std::string repo_id;
|
||||
size_t size = 0; // only for the migration
|
||||
};
|
||||
|
||||
using hf_files = std::vector<hf_file>;
|
||||
@@ -30,7 +29,4 @@ hf_files get_cached_files(const std::string & repo_id = {});
|
||||
// Create snapshot path (link or move/copy) and return it
|
||||
std::string finalize_file(const hf_file & file);
|
||||
|
||||
// TODO: Remove later
|
||||
void migrate_old_cache_to_hf_cache(const std::string & token, bool offline = false);
|
||||
|
||||
} // namespace hf_cache
|
||||
|
||||
@@ -471,7 +471,7 @@ void common_ngram_map_draft(common_ngram_map & map,
|
||||
sum_occur += curr_occur;
|
||||
}
|
||||
|
||||
LOG_INF("%s: key_offset = %zu, max_occur = %d, sum_occur = %d, slot_max = %d [%zu/%d, %zu/%d, %zu/%d, %zu/%d]\n", __func__,
|
||||
LOG_DBG("%s: key_offset = %zu, max_occur = %d, sum_occur = %d, slot_max = %d [%zu/%d, %zu/%d, %zu/%d, %zu/%d]\n", __func__,
|
||||
key_offset,
|
||||
max_occur, sum_occur, slot_max,
|
||||
curr_key.values[0].value_idx, curr_key.values[0].value_num,
|
||||
@@ -482,7 +482,7 @@ void common_ngram_map_draft(common_ngram_map & map,
|
||||
// Print the tokens of the four values (if idx != 0), use LOG_INF
|
||||
for (int v = 0; v < COMMON_NGRAM_MAX_VALUES; ++v) {
|
||||
if (curr_key.values[v].value_idx != 0) {
|
||||
LOG_INF("%s: value[%d] = %s\n", __func__, v, common_tokens_to_str(inp, curr_key.values[v].value_idx, m).c_str());
|
||||
LOG_DBG("%s: value[%d] = %s\n", __func__, v, common_tokens_to_str(inp, curr_key.values[v].value_idx, m).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -500,7 +500,7 @@ void common_ngram_map_draft(common_ngram_map & map,
|
||||
draft.push_back(inp[match_pos + n + i]);
|
||||
}
|
||||
|
||||
LOG_INF("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__,
|
||||
LOG_DBG("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__,
|
||||
key_offset, slot_max,
|
||||
curr_key.key_num, draft.size());
|
||||
|
||||
|
||||
@@ -32,6 +32,19 @@ const std::map<std::string, common_speculative_type> common_speculative_type_fro
|
||||
{"ngram-cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE}
|
||||
};
|
||||
|
||||
static std::string common_speculative_get_devices_str(const std::vector<ggml_backend_dev_t> & devices) {
|
||||
if (devices.empty()) {
|
||||
return "default";
|
||||
}
|
||||
|
||||
std::string result;
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
if (i > 0) result += ", ";
|
||||
result += ggml_backend_dev_name(devices[i]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
struct common_speculative_config {
|
||||
common_speculative_type type;
|
||||
common_params_speculative params;
|
||||
@@ -144,10 +157,13 @@ struct common_speculative_impl {
|
||||
|
||||
virtual void draft(common_speculative_draft_params_vec & dparams) = 0;
|
||||
|
||||
virtual void accept(llama_seq_id seq_id, uint16_t n_accepted) = 0;
|
||||
virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0;
|
||||
|
||||
// true if this implementation requires the target context to extract embeddings
|
||||
// true if this implementation requires the target context to extract post-norm embeddings
|
||||
virtual bool need_embd() const = 0;
|
||||
|
||||
// true if this implementation requires the target context to extract pre-norm embeddings
|
||||
virtual bool need_embd_pre_norm() const { return false; }
|
||||
};
|
||||
|
||||
struct common_speculative_impl_draft_simple : public common_speculative_impl {
|
||||
@@ -164,6 +180,16 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
auto * ctx_tgt = this->params.ctx_tgt;
|
||||
|
||||
LOG_INF("%s: adding speculative implementation 'draft-simple'\n", __func__);
|
||||
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min);
|
||||
LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__,
|
||||
this->params.n_gpu_layers,
|
||||
ggml_type_name(this->params.cache_type_k),
|
||||
ggml_type_name(this->params.cache_type_v),
|
||||
ctx_tgt ? "yes" : "no",
|
||||
ctx_dft ? "yes" : "no",
|
||||
common_speculative_get_devices_str(this->params.devices).c_str());
|
||||
|
||||
batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
|
||||
|
||||
// TODO: optimize or pass from outside?
|
||||
@@ -340,7 +366,7 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
|
||||
}
|
||||
}
|
||||
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override {
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
|
||||
// noop
|
||||
}
|
||||
|
||||
@@ -352,8 +378,12 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
|
||||
struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
|
||||
//common_params_speculative_eagle3 params;
|
||||
|
||||
common_speculative_impl_draft_eagle3(const common_params_speculative & /*params*/, uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) {}
|
||||
common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq)
|
||||
{
|
||||
LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__);
|
||||
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min);
|
||||
}
|
||||
|
||||
void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override {
|
||||
// noop
|
||||
@@ -368,7 +398,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
|
||||
// TODO: implement
|
||||
}
|
||||
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override {
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
|
||||
// noop
|
||||
}
|
||||
|
||||
@@ -377,7 +407,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
|
||||
}
|
||||
};
|
||||
|
||||
struct common_speculative_state_draft_mtp : public common_speculative_impl {
|
||||
struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
common_params_speculative_draft params; // reuses the draft-model params slot (ctx_tgt/ctx_dft)
|
||||
|
||||
llama_batch batch;
|
||||
@@ -404,7 +434,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl {
|
||||
// pre-advancement before process() mirrored the verify batch.
|
||||
std::vector<uint16_t> last_n_drafted;
|
||||
|
||||
common_speculative_state_draft_mtp(const common_params_speculative & params, uint32_t n_seq)
|
||||
common_speculative_impl_draft_mtp(const common_params_speculative & params, uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, n_seq)
|
||||
, params(params.draft)
|
||||
{
|
||||
@@ -414,6 +444,16 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl {
|
||||
|
||||
n_embd = llama_model_n_embd(llama_get_model(ctx_dft));
|
||||
|
||||
LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__);
|
||||
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd);
|
||||
LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__,
|
||||
this->params.n_gpu_layers,
|
||||
ggml_type_name(this->params.cache_type_k),
|
||||
ggml_type_name(this->params.cache_type_v),
|
||||
ctx_tgt ? "yes" : "no",
|
||||
ctx_dft ? "yes" : "no",
|
||||
common_speculative_get_devices_str(this->params.devices).c_str());
|
||||
|
||||
const int32_t n_b = (int32_t) llama_n_batch(ctx_dft);
|
||||
batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd, /*n_seq_max=*/ 1);
|
||||
// llama_batch_init allocates only one of token/embd; MTP needs both.
|
||||
@@ -424,13 +464,13 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl {
|
||||
for (auto & s : smpls) {
|
||||
common_params_sampling sparams;
|
||||
sparams.no_perf = false;
|
||||
sparams.top_k = 1; // TODO: re-enable top_k == 10 and utilize `p_min` spec param
|
||||
sparams.top_k = 10;
|
||||
sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K };
|
||||
s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
|
||||
}
|
||||
|
||||
llama_set_embeddings_pre_norm(ctx_tgt, true);
|
||||
llama_set_embeddings_pre_norm(ctx_dft, true);
|
||||
llama_set_embeddings_pre_norm(ctx_tgt, true, /*masked*/ false);
|
||||
llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true);
|
||||
|
||||
pending_h.assign(n_seq, std::vector<float>(n_embd, 0.0f));
|
||||
|
||||
@@ -443,7 +483,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl {
|
||||
last_n_drafted.assign(n_seq, 0);
|
||||
}
|
||||
|
||||
~common_speculative_state_draft_mtp() override {
|
||||
~common_speculative_impl_draft_mtp() override {
|
||||
if (batch.token != nullptr) {
|
||||
free(batch.token);
|
||||
batch.token = nullptr;
|
||||
@@ -459,7 +499,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl {
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
|
||||
if (pos_max < N - 1) {
|
||||
LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d — "
|
||||
LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d - "
|
||||
"process() hook may not have run on every prefill ubatch "
|
||||
"(need_embd / logits=1 on every prompt position?). "
|
||||
"Drafts may degrade.\n",
|
||||
@@ -630,6 +670,14 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl {
|
||||
// add drafted token for each sequence
|
||||
const llama_token id = cur_p->data[0].id;
|
||||
|
||||
// only collect very high-confidence draft tokens
|
||||
if (cur_p->data[0].p < params.p_min) {
|
||||
drafting[seq_id] = false;
|
||||
n_drafting--;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
common_sampler_accept(smpl, id, true);
|
||||
|
||||
auto & dp = dparams.at(seq_id);
|
||||
@@ -675,7 +723,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl {
|
||||
}
|
||||
}
|
||||
|
||||
void accept(llama_seq_id seq_id, uint16_t n_accepted) override {
|
||||
void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override {
|
||||
if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
|
||||
return;
|
||||
}
|
||||
@@ -691,6 +739,10 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl {
|
||||
}
|
||||
|
||||
bool need_embd() const override {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool need_embd_pre_norm() const override {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
@@ -707,7 +759,12 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl {
|
||||
common_ngram_simple_config config)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, n_seq)
|
||||
, params(params.ngram_simple)
|
||||
, config(config) {}
|
||||
, config(config)
|
||||
{
|
||||
LOG_INF("%s: adding speculative implementation 'ngram-simple'\n", __func__);
|
||||
LOG_INF("%s: - size_n=%d, size_m=%d, min_hits=%d\n", __func__,
|
||||
this->params.size_n, this->params.size_m, this->params.min_hits);
|
||||
}
|
||||
|
||||
void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override {
|
||||
// noop
|
||||
@@ -731,7 +788,7 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl {
|
||||
}
|
||||
}
|
||||
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override {
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
|
||||
// noop
|
||||
}
|
||||
|
||||
@@ -741,20 +798,21 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl {
|
||||
};
|
||||
|
||||
struct common_speculative_impl_ngram_map_k : public common_speculative_impl {
|
||||
common_params_speculative_ngram_map params;
|
||||
|
||||
// n_seq configs
|
||||
std::vector<common_ngram_map> config;
|
||||
|
||||
common_speculative_impl_ngram_map_k(
|
||||
const common_params_speculative & params,
|
||||
const common_ngram_map & config,
|
||||
uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, n_seq)
|
||||
, params(params.ngram_map_k) {
|
||||
{
|
||||
for (uint32_t i = 0; i < n_seq; i++) {
|
||||
this->config.push_back(config);
|
||||
}
|
||||
|
||||
LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(this->type).c_str());
|
||||
LOG_INF("%s: - size_key=%d, size_value=%d, key_only=%d, min_hits=%d\n", __func__,
|
||||
config.size_key, config.size_value, config.key_only, config.min_hits);
|
||||
}
|
||||
|
||||
void begin(llama_seq_id seq_id, const llama_tokens & prompt) override {
|
||||
@@ -781,9 +839,13 @@ struct common_speculative_impl_ngram_map_k : public common_speculative_impl {
|
||||
}
|
||||
}
|
||||
|
||||
void accept(llama_seq_id seq_id, uint16_t n_accepted) override {
|
||||
void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) override {
|
||||
GGML_ASSERT((seq_id < (llama_seq_id) config.size()));
|
||||
|
||||
if (is_other) {
|
||||
return;
|
||||
}
|
||||
|
||||
common_ngram_map_accept(config[seq_id], n_accepted);
|
||||
}
|
||||
|
||||
@@ -805,7 +867,7 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl {
|
||||
// the last position in the prompt that was added to the ngram container
|
||||
size_t i_last = 0;
|
||||
|
||||
// length of the last drafted n‑gram (number of tokens returned by draft)
|
||||
// length of the last drafted n-gram (number of tokens returned by draft)
|
||||
size_t n_draft_last = 0;
|
||||
|
||||
// consecutive accept rounds with low acceptance fraction (< 0.5)
|
||||
@@ -823,8 +885,11 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl {
|
||||
, verbose(std::getenv("LLAMA_TRACE") != nullptr) {
|
||||
static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t));
|
||||
|
||||
LOG_INF("%s: initialized ngram_mod with n_match=%d, size=%zu (%.3f MB)\n", __func__,
|
||||
this->params.n_match, mod.size(), (float)(mod.size_bytes())/1024/1024);
|
||||
LOG_INF("%s: adding speculative implementation 'ngram-mod'\n", __func__);
|
||||
LOG_INF("%s: - n_match=%d, n_max=%d, n_min=%d\n", __func__,
|
||||
this->params.n_match, this->params.n_max, this->params.n_min);
|
||||
LOG_INF("%s: - mod size=%zu (%.3f MB)\n", __func__,
|
||||
mod.size(), (float)(mod.size_bytes())/1024/1024);
|
||||
|
||||
if (this->params.n_match < 16) {
|
||||
LOG_WRN("%s: ngram_mod n_match=%d is too small - poor quality is possible, "
|
||||
@@ -914,7 +979,7 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl {
|
||||
}
|
||||
result.resize(result.size() - n);
|
||||
|
||||
// store length of drafted n‑gram for later acceptance analysis
|
||||
// store length of drafted n-gram for later acceptance analysis
|
||||
sinfo.n_draft_last = result.size();
|
||||
}
|
||||
|
||||
@@ -936,17 +1001,21 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl {
|
||||
}
|
||||
}
|
||||
|
||||
void accept(llama_seq_id seq_id, uint16_t n_accepted) override {
|
||||
void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) override {
|
||||
if (is_other) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto & sinfo = sinfos[seq_id];
|
||||
|
||||
// compute acceptance fraction if we have a recorded draft length
|
||||
if (sinfo.n_draft_last > 0) {
|
||||
const double f_acc = (double)n_accepted / (double)sinfo.n_draft_last;
|
||||
if (f_acc < 0.5) {
|
||||
if (f_acc < 0.25) {
|
||||
sinfo.n_low++;
|
||||
if (sinfo.n_low >= 3) {
|
||||
if (sinfo.n_low >= 5) {
|
||||
if (verbose) {
|
||||
LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, sinfo.n_low);
|
||||
LOG_WRN("%s: low acceptance streak (%d) - resetting ngram_mod\n", __func__, sinfo.n_low);
|
||||
}
|
||||
|
||||
mod.reset();
|
||||
@@ -996,6 +1065,12 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl {
|
||||
, save_dynamic(save_dynamic)
|
||||
, save_static(save_static)
|
||||
{
|
||||
LOG_INF("%s: adding speculative implementation 'ngram-cache'\n", __func__);
|
||||
LOG_INF("%s: - n_draft=%d, cache_static=%s, cache_dynamic=%s\n", __func__,
|
||||
n_draft,
|
||||
path_static.empty() ? "none" : path_static.c_str(),
|
||||
path_dynamic.empty() ? "none" : path_dynamic.c_str());
|
||||
|
||||
sinfos.resize(n_seq);
|
||||
|
||||
if (!path_static.empty()) {
|
||||
@@ -1092,7 +1167,7 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl {
|
||||
}
|
||||
}
|
||||
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override {
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
|
||||
// noop
|
||||
}
|
||||
|
||||
@@ -1278,7 +1353,6 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
std::vector<std::unique_ptr<common_speculative_impl>> impls = {};
|
||||
|
||||
for (const common_speculative_config & config : configs) {
|
||||
LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(config.type).c_str());
|
||||
switch (config.type) {
|
||||
case COMMON_SPECULATIVE_TYPE_NONE:
|
||||
break;
|
||||
@@ -1291,7 +1365,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_DRAFT_MTP: {
|
||||
impls.push_back(std::make_unique<common_speculative_state_draft_mtp>(config.params, n_seq));
|
||||
impls.push_back(std::make_unique<common_speculative_impl_draft_mtp>(config.params, n_seq));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: {
|
||||
@@ -1312,11 +1386,16 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
impls.push_back(std::move(state));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K:
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: {
|
||||
impls.push_back(
|
||||
std::make_unique<common_speculative_impl_ngram_map_k>(
|
||||
get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: {
|
||||
impls.push_back(
|
||||
std::make_unique<common_speculative_impl_ngram_map_k>(
|
||||
config.params, get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq));
|
||||
get_common_ngram_map(config.type, config.params.ngram_map_k4v), n_seq));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: {
|
||||
@@ -1408,6 +1487,20 @@ bool common_speculative_need_embd(common_speculative * spec) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool common_speculative_need_embd_pre_norm(common_speculative * spec) {
|
||||
if (spec == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (auto & impl : spec->impls) {
|
||||
if (impl->need_embd_pre_norm()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void common_speculative_draft(common_speculative * spec) {
|
||||
if (spec == nullptr) {
|
||||
return;
|
||||
@@ -1494,11 +1587,6 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
|
||||
|
||||
GGML_ASSERT(impl);
|
||||
|
||||
// TODO: currently only the implementation that generated the draft is used to accept it
|
||||
// however, some implementations (such as MTP) need to also "see" the accepted tokens
|
||||
// extend `common_speculative_impl::accept()` with an extra argument `bool is_other` to
|
||||
// inform the implementation if the accepted tokens are from another implementation and
|
||||
// pass the accepted tokens to all remaining implementations using `is_other == true`
|
||||
{
|
||||
common_time_meas tm(impl->t_accept_us, !impl->gen_perf);
|
||||
if (n_accepted > 0) {
|
||||
@@ -1506,9 +1594,16 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
|
||||
impl->n_acc_tokens += n_accepted;
|
||||
}
|
||||
|
||||
impl->accept(seq_id, n_accepted);
|
||||
impl->accept(seq_id, n_accepted, false);
|
||||
impl->n_call_accept++;
|
||||
}
|
||||
|
||||
// accept with the rest of the implementations, using is_other == true
|
||||
for (auto & impl_other : spec->impls) {
|
||||
if (impl_other.get() != impl) {
|
||||
impl_other->accept(seq_id, n_accepted, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void common_speculative_print_stats(const common_speculative * spec) {
|
||||
@@ -1528,7 +1623,7 @@ void common_speculative_print_stats(const common_speculative * spec) {
|
||||
str_perf = "";
|
||||
}
|
||||
|
||||
LOG_INF("statistics %s: #calls(b,g,a) = %zu %zu %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
|
||||
LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n",
|
||||
common_speculative_type_to_str(impl->type).c_str(),
|
||||
impl->n_call_begin, impl->n_call_draft, impl->n_call_accept,
|
||||
impl->n_gen_drafts,
|
||||
|
||||
@@ -53,9 +53,12 @@ void common_speculative_begin(common_speculative * spec, llama_seq_id seq_id, co
|
||||
// process the batch and update the internal state of the speculative context
|
||||
bool common_speculative_process(common_speculative * spec, const llama_batch & batch);
|
||||
|
||||
// true if any implementation requires target embeddings to be extracted
|
||||
// true if any implementation requires target post-norm embeddings to be extracted
|
||||
bool common_speculative_need_embd(common_speculative * spec);
|
||||
|
||||
// true if any implementation requires target pre-norm embeddings to be extracted
|
||||
bool common_speculative_need_embd_pre_norm(common_speculative * spec);
|
||||
|
||||
// generate drafts for the sequences specified with `common_speculative_get_draft_params`
|
||||
void common_speculative_draft(common_speculative * spec);
|
||||
|
||||
|
||||
@@ -600,6 +600,7 @@ class _Qwen35MtpMixin:
|
||||
if name.find("layers.") != -1:
|
||||
assert bid is not None
|
||||
name = name.replace(f"mtp.layers.{bid}", f"model.layers.{bid + n_layer}")
|
||||
bid = bid + n_layer
|
||||
else:
|
||||
remapper = {
|
||||
"mtp.fc": "model.layers.{bid}.eh_proj",
|
||||
|
||||
@@ -445,6 +445,11 @@ if __name__ == '__main__':
|
||||
if self.lazy:
|
||||
tensor = LazyTorchTensor.from_eager(tensor)
|
||||
base_name = get_base_tensor_name(name)
|
||||
# filter base name, ignore tensor transformations for now
|
||||
data_gen = lambda g=tensor: g # noqa: E731
|
||||
if (titem := self.filter_tensors((base_name, data_gen))) is None:
|
||||
continue
|
||||
base_name, _ = titem
|
||||
# note: mergekit-extract-lora also adds token embeddings to the adapter
|
||||
is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
|
||||
is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
- [News](#news)
|
||||
- [OS](#os)
|
||||
- [Hardware](#hardware)
|
||||
- [Performance Reference](#performance-reference)
|
||||
- [Docker](#docker)
|
||||
- [Linux](#linux)
|
||||
- [Windows](#windows)
|
||||
@@ -51,9 +52,8 @@ The packages for FP32 and FP16 would have different accuracy and performance on
|
||||
|
||||
## News
|
||||
|
||||
- 2026.04
|
||||
|
||||
- Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q_K, Q8_0.
|
||||
- 2026.04-05
|
||||
- Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q6_K, Q8_0.
|
||||
- Fused MoE.
|
||||
- Upgrate CI and built package for oneAPI 2025.3.3, support Ubuntu 24.04 built package.
|
||||
|
||||
@@ -150,6 +150,13 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
|
||||
|
||||
NA
|
||||
|
||||
## Performance Reference
|
||||
|
||||
|
||||
To get the supported LLMs, GPUs, and performance reference, please check [Performance of llama.cpp on Intel GPU with SYCL backend](https://github.com/ggml-org/llama.cpp/discussions/23313).
|
||||
|
||||
You could update your test result in it directly.
|
||||
|
||||
## Docker
|
||||
|
||||
The docker build option is currently limited to *Intel GPU* targets.
|
||||
|
||||
@@ -108,11 +108,12 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
|
||||
### General Speculative Parameters
|
||||
|
||||
```
|
||||
--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
|
||||
type of speculative decoding to use when no draft model is provided
|
||||
--spec-type [none|draft-simple|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
|
||||
comma-separated list of types of speculative decoding to use
|
||||
(default: none)
|
||||
(env: LLAMA_ARG_SPEC_TYPE)
|
||||
--spec-default use default speculative decoding
|
||||
--spec-default use default speculative decoding config
|
||||
(enables ngram-mod)
|
||||
```
|
||||
|
||||
### Draft Model Parameters
|
||||
@@ -123,8 +124,9 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_MODEL)
|
||||
--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]
|
||||
HuggingFace repository for the draft model
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO)
|
||||
--spec-draft-n-max N
|
||||
number of tokens to draft for speculative decoding (default: 16)
|
||||
number of tokens to draft for speculative decoding (default: 3)
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX)
|
||||
--spec-draft-n-min N
|
||||
minimum number of draft tokens to use for speculative decoding (default: 0)
|
||||
@@ -133,18 +135,64 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
|
||||
speculative decoding split probability (default: 0.10)
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT)
|
||||
--spec-draft-p-min, --draft-p-min P
|
||||
minimum speculative decoding probability (greedy) (default: 0.75)
|
||||
minimum speculative decoding probability (greedy) (default: 0.00)
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN)
|
||||
--spec-draft-ctx-size, -cd, --ctx-size-draft N
|
||||
size of the prompt context for the draft model (default: 0, 0 = loaded from model)
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_CTX_SIZE)
|
||||
--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N
|
||||
max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
|
||||
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT)
|
||||
--spec-draft-device, -devd, --device-draft <dev1,dev2,..>
|
||||
comma-separated list of devices to use for offloading the draft model
|
||||
--spec-draft-replace, --spec-replace TARGET DRAFT
|
||||
translate the string in TARGET into DRAFT if the draft model and main model are not compatible
|
||||
(use --list-devices to see available devices)
|
||||
```
|
||||
|
||||
### Draft Model CPU Scheduling Parameters
|
||||
|
||||
```
|
||||
--spec-draft-threads, -td, --threads-draft N
|
||||
number of CPU threads to use during generation
|
||||
--spec-draft-threads-batch, -tbd, --threads-batch-draft N
|
||||
number of threads to use during batch and prompt processing (default: same as --threads-draft)
|
||||
--spec-draft-cpu-mask, -Cd, --cpu-mask-draft M
|
||||
Draft model CPU affinity mask. Complements cpu-range-draft
|
||||
--spec-draft-cpu-range, -Crd, --cpu-range-draft lo-hi
|
||||
Ranges of CPUs for affinity. Complements --cpu-mask-draft
|
||||
--spec-draft-cpu-strict, --cpu-strict-draft <0|1>
|
||||
Use strict CPU placement for draft model (default: same as --cpu-strict)
|
||||
--spec-draft-prio, --prio-draft N
|
||||
set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime
|
||||
--spec-draft-poll, --poll-draft <0|1>
|
||||
Use polling to wait for draft model work (default: same as --poll)
|
||||
--spec-draft-cpu-mask-batch, -Cbd, --cpu-mask-batch-draft M
|
||||
Draft model CPU affinity mask for batch. Complements cpu-range-batch-draft
|
||||
--spec-draft-cpu-range-batch, -Crbd, --cpu-range-batch-draft lo-hi
|
||||
Ranges of CPUs for affinity for batch. Complements --cpu-mask-batch-draft
|
||||
--spec-draft-cpu-strict-batch, --cpu-strict-batch-draft <0|1>
|
||||
Use strict CPU placement for draft model batch (default: --cpu-strict-draft)
|
||||
--spec-draft-prio-batch, --prio-batch-draft N
|
||||
set draft process/thread priority for batch : 0-normal, 1-medium, 2-high, 3-realtime
|
||||
--spec-draft-poll-batch, --poll-batch-draft <0|1>
|
||||
Use polling to wait for draft model work for batch (default: --poll-draft)
|
||||
```
|
||||
|
||||
### Draft Model KV Cache and Tensor Override Parameters
|
||||
|
||||
```
|
||||
--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE
|
||||
KV cache data type for K for the draft model
|
||||
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K)
|
||||
--spec-draft-type-v, -ctvd, --cache-type-v-draft TYPE
|
||||
KV cache data type for V for the draft model
|
||||
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_V)
|
||||
--spec-draft-override-tensor, -otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...
|
||||
override tensor buffer type for draft model
|
||||
--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft
|
||||
keep all Mixture of Experts (MoE) weights in the CPU for the draft model
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE)
|
||||
--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N
|
||||
keep the MoE weights of the first N layers in the CPU for the draft model
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE)
|
||||
```
|
||||
|
||||
### n-gram Mod Parameters
|
||||
@@ -193,11 +241,13 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
|
||||
|
||||
### `--spec-type TYPE`
|
||||
|
||||
Specifies a type of speculative decoding without draft model.
|
||||
Specifies a comma-separated list of speculative decoding types to use.
|
||||
|
||||
| Type | Description |
|
||||
|------|-------------|
|
||||
| `none` | No speculative decoding (default) |
|
||||
| `draft-simple` | Use a simple draft model for speculation |
|
||||
| `draft-mtp` | Use Masked Token Prediction (MTP) heads from the main model |
|
||||
| `ngram-cache` | Use n-gram cache lookup |
|
||||
| `ngram-simple` | Use simple n-gram pattern matching |
|
||||
| `ngram-map-k` | Use n-gram pattern matching with n-gram-keys |
|
||||
@@ -209,6 +259,11 @@ Specifies a type of speculative decoding without draft model.
|
||||
./llama-server [...] --spec-type ngram-simple
|
||||
```
|
||||
|
||||
**Example:** Multiple speculative implementations.
|
||||
```bash
|
||||
./llama-server [...] --spec-type ngram-mod,ngram-map-k4v
|
||||
```
|
||||
|
||||
### `--spec-ngram-*-size-n N`
|
||||
|
||||
Sets the size N of the lookup n-gram for n-gram map based speculative decoding.
|
||||
|
||||
@@ -149,6 +149,8 @@ class TaskState:
|
||||
t_gen_ms: Optional[float] = None
|
||||
reasoning_content: Optional[str] = None
|
||||
server_name: Optional[str] = None
|
||||
chunk_idx: int = 0
|
||||
problem_idx: int = 0
|
||||
|
||||
|
||||
class EvalState:
|
||||
@@ -233,7 +235,9 @@ class EvalState:
|
||||
tps_gen: Optional[float] = None,
|
||||
t_gen_ms: Optional[float] = None,
|
||||
reasoning_content: Optional[str] = None,
|
||||
server_name: Optional[str] = None
|
||||
server_name: Optional[str] = None,
|
||||
chunk_idx: int = 0,
|
||||
problem_idx: int = 0,
|
||||
):
|
||||
with self._lock:
|
||||
if "cases" not in self.task_states:
|
||||
@@ -252,7 +256,9 @@ class EvalState:
|
||||
"tps_gen": tps_gen,
|
||||
"t_gen_ms": t_gen_ms,
|
||||
"reasoning_content": reasoning_content,
|
||||
"server_name": server_name
|
||||
"server_name": server_name,
|
||||
"chunk_idx": chunk_idx,
|
||||
"problem_idx": problem_idx,
|
||||
}
|
||||
|
||||
self.correct = sum(1 for c in self.task_states.get("cases", {}).values() if c.get("correct", False))
|
||||
@@ -289,6 +295,9 @@ class EvalState:
|
||||
all_cases = {}
|
||||
for i, task_id in tasks_to_save:
|
||||
question_text, prompt, expected = self.get_case(i)
|
||||
# Extract chunk_idx from task_id for pending cases
|
||||
_parts = task_id.rsplit("_", 2)
|
||||
_chunk_idx = int(_parts[-2]) if len(_parts) >= 3 else 0
|
||||
if task_id in self.task_states.get("cases", {}):
|
||||
all_cases[task_id] = self.task_states["cases"][task_id]
|
||||
else:
|
||||
@@ -306,7 +315,9 @@ class EvalState:
|
||||
"tps_gen": None,
|
||||
"t_gen_ms": None,
|
||||
"reasoning_content": None,
|
||||
"server_name": None
|
||||
"server_name": None,
|
||||
"chunk_idx": _chunk_idx,
|
||||
"problem_idx": i,
|
||||
}
|
||||
|
||||
ci_lower, ci_upper = self.accuracy_ci()
|
||||
@@ -382,11 +393,12 @@ class EvalState:
|
||||
grader_log_str = self._escape_html(json.dumps(grader_log, indent=2))
|
||||
escaped_server = self._escape_html(server_name)
|
||||
|
||||
answer_class = status_class if status == "ok" else ""
|
||||
rows.append(f"""<tr class="task-row" onclick="toggleDetails('{task_id}')">
|
||||
<td>{task_id}</td>
|
||||
<td class="{status_class}">{status_text}</td>
|
||||
<td>{self._escape_html(expected)}</td>
|
||||
<td>{self._escape_html(answer)}</td>
|
||||
<td class="{answer_class}">{self._escape_html(answer)}</td>
|
||||
<td>{tokens_str}</td>
|
||||
<td>{tps_str}</td>
|
||||
<td>{t_gen_str}</td>
|
||||
@@ -405,6 +417,53 @@ class EvalState:
|
||||
|
||||
rows_html = "\n".join(rows)
|
||||
|
||||
# ---- per-problem summary table ----
|
||||
problem_groups: Dict[int, List[Dict[str, Any]]] = {}
|
||||
for _tid, _case in cases.items():
|
||||
if _case.get("status") != "ok":
|
||||
continue
|
||||
_pidx = _case.get("problem_idx")
|
||||
if _pidx is None:
|
||||
_p_parts = _tid.rsplit("_", 2)
|
||||
_pidx = int(_p_parts[-1]) if len(_p_parts) >= 3 else 0
|
||||
problem_groups.setdefault(_pidx, []).append(_case)
|
||||
|
||||
summary_rows_html = ""
|
||||
if problem_groups:
|
||||
def _stat(v, fmt=".1f", avg_fmt=None):
|
||||
if not v:
|
||||
return ("–", "–", "–")
|
||||
af = fmt if avg_fmt is None else avg_fmt
|
||||
return (f"{min(v):{fmt}}", f"{sum(v)/len(v):{af}}", f"{max(v):{fmt}}")
|
||||
|
||||
summary_data = []
|
||||
for pidx, g in problem_groups.items():
|
||||
runs = len(g)
|
||||
n_ok = sum(1 for c in g if c.get("correct", False))
|
||||
toks = [c["tokens"] for c in g if c.get("tokens") is not None]
|
||||
tps = [c["tps_gen"] for c in g if c.get("tps_gen") is not None]
|
||||
tg = [c["t_gen_ms"] / 1000 for c in g if c.get("t_gen_ms") is not None]
|
||||
summary_data.append((
|
||||
pidx, runs, n_ok,
|
||||
_stat(toks, "d", ".0f"),
|
||||
_stat(tps),
|
||||
_stat(tg),
|
||||
))
|
||||
|
||||
summary_data.sort(key=lambda r: r[0]) # sort by problem index ascending
|
||||
|
||||
summary_rows_html = "\n".join(
|
||||
f"""<tr class="summary-row">
|
||||
<td>{p:03d}</td>
|
||||
<td>{r}</td>
|
||||
<td>{n}/{r}</td>
|
||||
<td>{tk[0]}</td><td>{tk[1]}</td><td>{tk[2]}</td>
|
||||
<td>{tp[0]}</td><td>{tp[1]}</td><td>{tp[2]}</td>
|
||||
<td>{tg[0]}</td><td>{tg[1]}</td><td>{tg[2]}</td>
|
||||
</tr>"""
|
||||
for p, r, n, tk, tp, tg in summary_data
|
||||
)
|
||||
|
||||
html_content = f"""<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
@@ -412,10 +471,10 @@ class EvalState:
|
||||
<title>{self.dataset_type.upper()} Eval</title>
|
||||
<style>
|
||||
body {{ font-family: system-ui, sans-serif; margin: 0; padding: 16px; background: #fff; color: #222; }}
|
||||
.bar {{ padding: 8px 0; font-size: 14px; color: #555; }}
|
||||
.bar span {{ margin-right: 20px; }}
|
||||
.bar b {{ color: #222; }}
|
||||
table {{ width: 100%; border-collapse: collapse; font-size: 13px; }}
|
||||
.bar {{ padding: 8px 0; font-size: 13px; color: #555; font-family: 'SF Mono', 'Menlo', 'Consolas', monospace; display: grid; grid-template-columns: auto 1fr auto 1fr; gap: 2px 12px; align-items: baseline; }}
|
||||
.bar .label {{ color: #888; }}
|
||||
.bar .value {{ color: #222; }}
|
||||
table {{ width: 100%; border-collapse: collapse; font-size: 13px; font-family: 'SF Mono', 'Menlo', 'Consolas', monospace; }}
|
||||
th {{ text-align: left; padding: 6px 8px; border-bottom: 2px solid #ccc; font-weight: 600; }}
|
||||
td {{ padding: 4px 8px; border-bottom: 1px solid #eee; vertical-align: top; }}
|
||||
.task-row {{ cursor: pointer; }}
|
||||
@@ -429,37 +488,88 @@ class EvalState:
|
||||
.details-content {{ padding: 8px 16px; background: #f6f8fa; font-size: 12px; }}
|
||||
.details-content b {{ color: #555; }}
|
||||
.details-content pre {{ background: #fff; border: 1px solid #e1e4e8; padding: 8px; overflow-x: auto; white-space: pre-wrap; word-wrap: break-word; margin: 4px 0 8px; }}
|
||||
.summary-table {{ margin-bottom: 16px; font-size: 13px; width: 100%; }}
|
||||
.summary-row {{ background: #fafbfc; }}
|
||||
.summary-row:hover {{ background: #f5f5f5; }}
|
||||
.summary-table th {{ text-align: right; font-weight: 600; }}
|
||||
.summary-table th:first-child {{ text-align: left; }}
|
||||
.summary-table th[colspan] {{ text-align: center; }}
|
||||
.summary-table td {{ text-align: right; }}
|
||||
.summary-table td:first-child {{ text-align: left; }}
|
||||
.tabs {{ display: flex; border-bottom: 2px solid #ddd; margin: 12px 0 0; }}
|
||||
.tab-btn {{ padding: 6px 16px; border: none; background: none; font-size: 13px; cursor: pointer; color: #555; border-bottom: 2px solid transparent; margin-bottom: -2px; font-weight: 500; }}
|
||||
.tab-btn:hover {{ color: #222; }}
|
||||
.tab-btn.active {{ color: #222; border-bottom-color: #222; font-weight: 600; }}
|
||||
.tab-content {{ display: none; }}
|
||||
.tab-content.active {{ display: block; }}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="bar">
|
||||
<span><b>{self.dataset_type.upper()}</b></span>
|
||||
<span>Model: {self.model_name or 'N/A'}</span>
|
||||
<span>Accuracy: <b>{accuracy:.1f}%</b> [{ci_lower*100:.1f}%, {ci_upper*100:.1f}%]</span>
|
||||
<span>Correct: <span class="correct">{n_correct}</span> / {len(completed)}</span>
|
||||
<span>Pending: {n_pending}</span>
|
||||
<span>Time: {self.total_time:.1f}s</span>
|
||||
<span>Sampling: {sampling_str}</span>
|
||||
<div class="label">Dataset</div><div class="value"><b>{self.dataset_type.upper()}</b></div>
|
||||
<div class="label">Model</div><div class="value"><b>{self.model_name or 'N/A'}</b></div>
|
||||
<div class="label">Accuracy</div><div class="value"><b>{accuracy:.1f}%</b> [{ci_lower*100:.1f}%, {ci_upper*100:.1f}%]</div>
|
||||
<div class="label">Correct</div><div class="value"><span class="correct">{n_correct}</span> / {len(completed)}</div>
|
||||
<div class="label">Pending</div><div class="value">{n_pending}</div>
|
||||
<div class="label">Time</div><div class="value">{self.total_time:.1f}s</div>
|
||||
<div class="label">Sampling</div><div class="value">{sampling_str}</div>
|
||||
</div>
|
||||
<div class="tabs">
|
||||
<button class="tab-btn active" data-tab="detailed" onclick="switchTab(this)">Detailed</button>
|
||||
<button class="tab-btn" data-tab="summary" onclick="switchTab(this)">Summary</button>
|
||||
</div>
|
||||
<div id="tab-detailed" class="tab-content active">
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>ID</th>
|
||||
<th></th>
|
||||
<th>Gold</th>
|
||||
<th>Answer</th>
|
||||
<th>Tokens</th>
|
||||
<th>T/s</th>
|
||||
<th>Gen s</th>
|
||||
<th>Server</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{rows_html}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
<div id="tab-summary" class="tab-content">
|
||||
<table class="summary-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Problem</th>
|
||||
<th>Runs</th>
|
||||
<th>Correct</th>
|
||||
<th colspan="3">Tokens</th>
|
||||
<th colspan="3">T/s</th>
|
||||
<th colspan="3">Gen s</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th></th>
|
||||
<th></th>
|
||||
<th></th>
|
||||
<th>min</th><th>avg</th><th>max</th>
|
||||
<th>min</th><th>avg</th><th>max</th>
|
||||
<th>min</th><th>avg</th><th>max</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{summary_rows_html}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>ID</th>
|
||||
<th></th>
|
||||
<th>Gold</th>
|
||||
<th>Answer</th>
|
||||
<th>Tokens</th>
|
||||
<th>T/s</th>
|
||||
<th>Gen s</th>
|
||||
<th>Server</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{rows_html}
|
||||
</tbody>
|
||||
</table>
|
||||
<script>
|
||||
function toggleDetails(id) {{ document.getElementById('details-'+id).classList.toggle('open'); }}
|
||||
function switchTab(btn) {{
|
||||
document.querySelectorAll('.tab-btn').forEach(b => b.classList.remove('active'));
|
||||
document.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active'));
|
||||
btn.classList.add('active');
|
||||
document.getElementById('tab-'+btn.dataset.tab).classList.add('active');
|
||||
}}
|
||||
</script>
|
||||
</body>
|
||||
</html>"""
|
||||
@@ -1062,12 +1172,19 @@ class Processor:
|
||||
) -> TaskState:
|
||||
question_text, prompt, expected = eval_state.get_case(i)
|
||||
|
||||
# Extract chunk_idx from task_id: "{dataset_type}_{chunk_idx:03d}_{index:03d}"
|
||||
_parts = task_id.rsplit("_", 2)
|
||||
chunk_idx = int(_parts[-2]) if len(_parts) >= 3 else 0
|
||||
problem_idx = i
|
||||
|
||||
task_state = TaskState(
|
||||
task_id=task_id,
|
||||
prompt=prompt,
|
||||
expected=expected,
|
||||
question_text=question_text,
|
||||
server_name=server_config.name
|
||||
server_name=server_config.name,
|
||||
chunk_idx=chunk_idx,
|
||||
problem_idx=problem_idx,
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -1085,7 +1202,8 @@ class Processor:
|
||||
eval_state.add_result(
|
||||
task_id, prompt, expected, result, None,
|
||||
{"finish_reason": finish_reason}, False, task_state.status,
|
||||
tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name
|
||||
tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name,
|
||||
chunk_idx, problem_idx,
|
||||
)
|
||||
eval_state.dump()
|
||||
return task_state
|
||||
@@ -1108,7 +1226,8 @@ class Processor:
|
||||
eval_state.add_result(
|
||||
task_id, prompt, expected, result, answer,
|
||||
grader_log, is_correct, "ok",
|
||||
tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name
|
||||
tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name,
|
||||
chunk_idx, problem_idx,
|
||||
)
|
||||
|
||||
eval_state.dump()
|
||||
|
||||
@@ -65,34 +65,70 @@ def normalize_number(s: str) -> Optional[int]:
|
||||
return int(match.group(0))
|
||||
|
||||
class AimeDataset:
|
||||
def __init__(self, split: str = "train"):
|
||||
def __init__(self, split: str = "train", dataset_type: str = "aime"):
|
||||
self.split = split
|
||||
self.dataset_type = dataset_type
|
||||
self.questions: List[Dict] = []
|
||||
self._load_dataset()
|
||||
|
||||
def _load_dataset(self):
|
||||
print(f"Loading AIME dataset (split: {self.split})...")
|
||||
def _get_question_text(self, question: Dict) -> str:
|
||||
"""Get question text, handling different dataset field names."""
|
||||
return question.get("problem", question.get("question", ""))
|
||||
|
||||
cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
|
||||
if cache_path.exists():
|
||||
print(f"Using cached dataset from {cache_path}")
|
||||
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
|
||||
def _load_dataset(self):
|
||||
if self.dataset_type == "aime":
|
||||
print(f"Loading AIME dataset (split: {self.split})...")
|
||||
cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
|
||||
if cache_path.exists():
|
||||
print(f"Using cached dataset from {cache_path}")
|
||||
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
|
||||
else:
|
||||
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
|
||||
elif self.dataset_type == "aime2025":
|
||||
print(f"Loading AIME2025 dataset...")
|
||||
ds_list = []
|
||||
for config_name in ["AIME2025-I", "AIME2025-II"]:
|
||||
cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "opencompass___AIME2025" / "default" / "0.0.0"
|
||||
if cache_path.exists():
|
||||
print(f"Using cached dataset from {cache_path}")
|
||||
ds = datasets.load_dataset("opencompass/AIME2025", config_name, split="test", cache_dir=str(cache_path))
|
||||
else:
|
||||
ds = datasets.load_dataset("opencompass/AIME2025", config_name, split="test")
|
||||
ds_list.extend(ds)
|
||||
ds = ds_list
|
||||
else:
|
||||
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
|
||||
raise ValueError(f"Unknown dataset type: {self.dataset_type}")
|
||||
|
||||
self.questions = list(ds)
|
||||
print(f"AIME dataset loaded: {len(self.questions)} questions")
|
||||
print(f"{self.dataset_type} dataset loaded: {len(self.questions)} questions")
|
||||
|
||||
def find_question(self, request_text: str) -> Optional[Dict]:
|
||||
# Strip common template prefixes to get the actual question text
|
||||
# Templates include things like "Solve the following math problem step by step..."
|
||||
# The actual question usually follows a blank line or after the template instruction
|
||||
cleaned = request_text
|
||||
# Split on double newline and take the part that looks like the problem
|
||||
parts = cleaned.split('\n\n')
|
||||
if len(parts) > 1:
|
||||
# Find the part that's longest (likely the actual problem text)
|
||||
problem_parts = [p for p in parts if len(p.strip()) > 100]
|
||||
if problem_parts:
|
||||
cleaned = max(problem_parts, key=lambda x: len(x))
|
||||
|
||||
best_match = None
|
||||
best_distance = -1
|
||||
best_index = -1
|
||||
|
||||
for i, question in enumerate(self.questions):
|
||||
question_text = question["problem"]
|
||||
request_lower = request_text.lower()
|
||||
question_text = self._get_question_text(question)
|
||||
request_lower = cleaned.lower()
|
||||
question_lower = question_text.lower()
|
||||
|
||||
# Check if question text is contained in the cleaned request
|
||||
if question_lower in request_lower or request_lower in question_lower:
|
||||
debug_log(f"DEBUG: Found substring match at index {i}")
|
||||
return question
|
||||
|
||||
# Exact match
|
||||
if question_lower == request_lower:
|
||||
debug_log(f"DEBUG: Found exact match at index {i}")
|
||||
@@ -118,7 +154,7 @@ class AimeDataset:
|
||||
debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}")
|
||||
return best_match
|
||||
|
||||
debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...")
|
||||
debug_log(f"DEBUG: No matching question found for cleaned: {cleaned[:100]}...")
|
||||
return None
|
||||
|
||||
def get_answer(self, question: Dict) -> str:
|
||||
@@ -134,15 +170,16 @@ class Simulator:
|
||||
port: int = 8033,
|
||||
host: str = "localhost",
|
||||
success_rate: float = 0.8,
|
||||
dataset_split: str = "train"
|
||||
dataset_split: str = "train",
|
||||
dataset_type: str = "aime"
|
||||
):
|
||||
self.port = port
|
||||
self.host = host
|
||||
self.success_rate = success_rate
|
||||
self.dataset = AimeDataset(dataset_split)
|
||||
self.dataset = AimeDataset(dataset_split, dataset_type)
|
||||
self.eval_state = EvalState(
|
||||
id="aime-2025",
|
||||
tasks=["aime"],
|
||||
id=dataset_type,
|
||||
tasks=[dataset_type],
|
||||
task_states={},
|
||||
sampling_config={"temperature": 0, "max_tokens": 2048}
|
||||
)
|
||||
@@ -159,6 +196,10 @@ class Simulator:
|
||||
else:
|
||||
response_text = self._generate_wrong_answer(question)
|
||||
|
||||
comp_tokens = random.randint(10000, 60000)
|
||||
tps_gen = random.uniform(90.0, 110.0)
|
||||
t_gen_ms = comp_tokens / tps_gen * 1000
|
||||
|
||||
return {
|
||||
"id": f"chatcmpl-{int(time.time())}",
|
||||
"object": "chat.completion",
|
||||
@@ -176,8 +217,12 @@ class Simulator:
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 100,
|
||||
"completion_tokens": 50,
|
||||
"total_tokens": 150
|
||||
"completion_tokens": comp_tokens,
|
||||
"total_tokens": 100 + comp_tokens
|
||||
},
|
||||
"timings": {
|
||||
"predicted_ms": t_gen_ms,
|
||||
"predicted_per_second": tps_gen
|
||||
}
|
||||
}
|
||||
|
||||
@@ -218,6 +263,12 @@ class Simulator:
|
||||
return response
|
||||
|
||||
class RequestHandler(BaseHTTPRequestHandler):
|
||||
def do_GET(self):
|
||||
if self.path == "/v1/models":
|
||||
self._send_json({"data": [{"id": "llama", "object": "model"}]}, 200)
|
||||
return
|
||||
self._send_json({"error": "Not found"}, 404)
|
||||
|
||||
def do_POST(self):
|
||||
if self.path != "/v1/chat/completions":
|
||||
self._send_json({"error": "Not found"}, 404)
|
||||
@@ -280,6 +331,13 @@ def main():
|
||||
default=0.8,
|
||||
help="Success rate 0-1 (default: 0.8)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset",
|
||||
type=str,
|
||||
default="aime",
|
||||
choices=["aime", "aime2025"],
|
||||
help="Dataset type (default: aime)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset-split",
|
||||
type=str,
|
||||
@@ -294,7 +352,8 @@ def main():
|
||||
port=args.port,
|
||||
host=args.host,
|
||||
success_rate=args.success_rate,
|
||||
dataset_split=args.dataset_split
|
||||
dataset_split=args.dataset_split,
|
||||
dataset_type=args.dataset
|
||||
)
|
||||
|
||||
server = HTTPServer((args.host, args.port), RequestHandler)
|
||||
@@ -304,7 +363,7 @@ def main():
|
||||
print("\n=== llama-server-simulator ===")
|
||||
print(f"Server running on http://{args.host}:{args.port}")
|
||||
print(f"Success rate: {args.success_rate}")
|
||||
print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions")
|
||||
print(f"{args.dataset} dataset loaded: {len(simulator.dataset.questions)} questions")
|
||||
print("\nPress Ctrl+C to stop\n")
|
||||
|
||||
try:
|
||||
|
||||
@@ -1,22 +1,296 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "log.h"
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#include <clocale>
|
||||
#include <vector>
|
||||
#include <cstdio>
|
||||
|
||||
struct llama_batch_ptr {
|
||||
llama_batch batch;
|
||||
|
||||
llama_batch_ptr(int32_t n_tokens, int32_t embd, int32_t n_seq_max)
|
||||
: batch{llama_batch_init(n_tokens, embd, n_seq_max)} {}
|
||||
|
||||
~llama_batch_ptr() { llama_batch_free(batch); }
|
||||
|
||||
llama_batch_ptr(const llama_batch_ptr &) = delete;
|
||||
llama_batch_ptr & operator=(const llama_batch_ptr &) = delete;
|
||||
llama_batch_ptr(llama_batch_ptr &&) = default;
|
||||
llama_batch_ptr & operator=(llama_batch_ptr &&) = default;
|
||||
|
||||
llama_batch & get() { return batch; }
|
||||
const llama_batch & get() const { return batch; }
|
||||
};
|
||||
|
||||
static std::string generate_tokens(llama_context * ctx, llama_sampler * smpl, int & n_past, int32_t n_predict, llama_seq_id seq_id) {
|
||||
std::string result;
|
||||
llama_batch_ptr batch(1, 0, 1);
|
||||
|
||||
for (int i = 0; i < n_predict; i++) {
|
||||
auto next_token = llama_sampler_sample(smpl, ctx, -1);
|
||||
auto next_token_str = common_token_to_piece(ctx, next_token);
|
||||
|
||||
LOG("%s", next_token_str.c_str());
|
||||
result += next_token_str;
|
||||
|
||||
common_batch_clear(batch.get());
|
||||
common_batch_add(batch.get(), next_token, n_past, {seq_id}, true);
|
||||
|
||||
if (llama_decode(ctx, batch.get())) {
|
||||
LOG_ERR("\n%s: failed to evaluate\n", __func__);
|
||||
return {};
|
||||
}
|
||||
n_past++;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Test 1: baseline
|
||||
// - tokenize the prompt
|
||||
// - decode all but the last token
|
||||
// - save state to disk
|
||||
// - decode the last token
|
||||
// - generate n_predict tokens
|
||||
static std::string test_baseline(struct llama_model * model, const struct common_params & params) {
|
||||
auto ctx = llama_context_ptr{llama_init_from_model(model, common_context_params_to_llama(params))};
|
||||
|
||||
auto sparams = llama_sampler_chain_default_params();
|
||||
auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)};
|
||||
llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
auto tokens = common_tokenize(ctx.get(), params.prompt, true);
|
||||
|
||||
auto n_past = 0;
|
||||
if (!common_prompt_batch_decode(ctx.get(), tokens, n_past, params.n_batch, params.out_file, true)) {
|
||||
LOG_ERR("%s: failed to decode prompt\n", __func__);
|
||||
return {};
|
||||
}
|
||||
|
||||
LOG("\n=== Test 1: baseline ===\n");
|
||||
LOG("%s", params.prompt.c_str());
|
||||
|
||||
auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 0);
|
||||
if (result.empty()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
LOG("\n");
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
// Test 2: state load
|
||||
// - create a new context
|
||||
// - load state from file
|
||||
// - replay the last prompt token
|
||||
// - generate n_predict tokens and compare against expected result
|
||||
static bool test_state_load(struct llama_model * model, const struct common_params & params, const std::string & expected_result) {
|
||||
auto ctx = llama_context_ptr{llama_init_from_model(model, common_context_params_to_llama(params))};
|
||||
|
||||
auto sparams = llama_sampler_chain_default_params();
|
||||
auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)};
|
||||
llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
auto tokens = common_tokenize(ctx.get(), params.prompt, true);
|
||||
|
||||
LOG("\n=== Test 2: state load ===\n");
|
||||
LOG("%s", params.prompt.c_str());
|
||||
|
||||
// Load state from file
|
||||
std::vector<llama_token> unused_sts(tokens.size());
|
||||
size_t n_token_count_out = 0;
|
||||
|
||||
if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
|
||||
LOG_ERR("\n%s: failed to load state\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out);
|
||||
|
||||
// Replay last token
|
||||
int n_past = (int) n_token_count_out;
|
||||
if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) {
|
||||
return false;
|
||||
}
|
||||
n_past++;
|
||||
|
||||
// Generate tokens
|
||||
auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 0);
|
||||
if (result.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (result != expected_result) {
|
||||
LOG_ERR("\n%s: error: generation differs from expected\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG("\nPASS\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Test 3: seq copy (host)
|
||||
// - create a multi-seq context
|
||||
// - load state from file
|
||||
// - replay the last prompt token
|
||||
// - migrate KV cache from seq 0 to seq 1 via the CPU path
|
||||
// - generate n_predict tokens on seq 1 and compare against expected result
|
||||
static bool test_seq_cp_host(struct llama_model * model, const struct common_params & params, const std::string & expected_result) {
|
||||
auto params_ctx = common_context_params_to_llama(params);
|
||||
params_ctx.n_seq_max = 2;
|
||||
auto ctx = llama_context_ptr{llama_init_from_model(model, params_ctx)};
|
||||
|
||||
auto sparams = llama_sampler_chain_default_params();
|
||||
auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)};
|
||||
llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
auto tokens = common_tokenize(ctx.get(), params.prompt, true);
|
||||
|
||||
LOG("\n=== Test 3: seq copy (host) ===\n");
|
||||
LOG("%s", params.prompt.c_str());
|
||||
|
||||
// Load state from file
|
||||
std::vector<llama_token> unused_sts(tokens.size());
|
||||
size_t n_token_count_out = 0;
|
||||
|
||||
if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
|
||||
LOG_ERR("\n%s: failed to load state\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out);
|
||||
|
||||
// Replay last token
|
||||
int n_past = (int) n_token_count_out;
|
||||
if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) {
|
||||
return false;
|
||||
}
|
||||
n_past++;
|
||||
|
||||
// Migrate KV cache from seq 0 to seq 1 (CPU path)
|
||||
{
|
||||
std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx.get(), 0));
|
||||
const size_t ncopy = llama_state_seq_get_data(ctx.get(), seq_store.data(), seq_store.size(), 0);
|
||||
if (ncopy != seq_store.size()) {
|
||||
LOG_ERR("\n%s: seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
|
||||
return false;
|
||||
}
|
||||
LOG_TRC("%s: seq 0 copied, %zd bytes\n", __func__, ncopy);
|
||||
|
||||
llama_memory_clear(llama_get_memory(ctx.get()), true);
|
||||
LOG_TRC("%s: kv cache cleared\n", __func__);
|
||||
|
||||
const size_t nset = llama_state_seq_set_data(ctx.get(), seq_store.data(), seq_store.size(), 1);
|
||||
if (nset != seq_store.size()) {
|
||||
LOG_ERR("\n%s: seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
|
||||
return false;
|
||||
}
|
||||
LOG_TRC("%s: seq 1 restored, %zd bytes\n", __func__, nset);
|
||||
}
|
||||
|
||||
// Generate tokens on seq 1
|
||||
auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 1);
|
||||
if (result.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (result != expected_result) {
|
||||
LOG_ERR("\n%s: error: generation differs from expected\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG("\nPASS\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Test 4: seq copy (device)
|
||||
// - create a multi-seq context
|
||||
// - load state from file
|
||||
// - replay the last prompt token
|
||||
// - migrate KV cache from seq 0 to seq 1 via the on-device path
|
||||
// - generate n_predict tokens on seq 1 and compare against expected result
|
||||
static bool test_seq_cp_device(struct llama_model * model, const struct common_params & params, const std::string & expected_result) {
|
||||
auto params_ctx = common_context_params_to_llama(params);
|
||||
params_ctx.n_seq_max = 2;
|
||||
auto ctx = llama_context_ptr{llama_init_from_model(model, params_ctx)};
|
||||
|
||||
auto sparams = llama_sampler_chain_default_params();
|
||||
auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)};
|
||||
llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
auto tokens = common_tokenize(ctx.get(), params.prompt, true);
|
||||
|
||||
LOG("\n=== Test 4: seq copy (device) ===\n");
|
||||
LOG("%s", params.prompt.c_str());
|
||||
|
||||
// Load state from file
|
||||
std::vector<llama_token> unused_sts(tokens.size());
|
||||
size_t n_token_count_out = 0;
|
||||
|
||||
if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
|
||||
LOG_ERR("\n%s: failed to load state\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out);
|
||||
|
||||
// Replay last token
|
||||
int n_past = (int) n_token_count_out;
|
||||
if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) {
|
||||
return false;
|
||||
}
|
||||
n_past++;
|
||||
|
||||
// Migrate KV cache from seq 0 to seq 1 (on-device path)
|
||||
{
|
||||
std::vector<uint8_t> seq_store(llama_state_seq_get_size_ext(ctx.get(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE));
|
||||
const size_t ncopy = llama_state_seq_get_data_ext(ctx.get(), seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
if (ncopy != seq_store.size()) {
|
||||
LOG_ERR("\n%s: seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
|
||||
return false;
|
||||
}
|
||||
LOG_TRC("%s: seq 0 copied, %zd bytes\n", __func__, ncopy);
|
||||
|
||||
llama_memory_clear(llama_get_memory(ctx.get()), true);
|
||||
LOG_TRC("%s: kv cache cleared\n", __func__);
|
||||
|
||||
const size_t nset = llama_state_seq_set_data_ext(ctx.get(), seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
if (nset != seq_store.size()) {
|
||||
LOG_ERR("\n%s: seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
|
||||
return false;
|
||||
}
|
||||
LOG_TRC("%s: seq 1 restored, %zd bytes\n", __func__, nset);
|
||||
}
|
||||
|
||||
// Generate tokens on seq 1
|
||||
auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 1);
|
||||
if (result.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (result != expected_result) {
|
||||
LOG_ERR("\n%s: error: generation differs from expected\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG("\nPASS\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
common_params params;
|
||||
|
||||
params.prompt = "The quick brown fox";
|
||||
params.out_file = "dump_state.bin";
|
||||
params.sampling.seed = 1234;
|
||||
|
||||
const std::string_view state_file = "dump_state.bin";
|
||||
|
||||
common_init();
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||
@@ -24,8 +298,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (params.n_parallel == 1) {
|
||||
// the example uses 2 sequences, so when n_parallel == 1, we need to enable unified kv cache
|
||||
printf("%s: n_parallel == 1, enabling unified kv cache\n", __func__);
|
||||
LOG_TRC("%s: n_parallel == 1, enabling unified kv cache\n", __func__);
|
||||
params.kv_unified = true;
|
||||
}
|
||||
|
||||
@@ -33,288 +306,40 @@ int main(int argc, char ** argv) {
|
||||
params.n_predict = 16;
|
||||
}
|
||||
|
||||
auto n_past = 0;
|
||||
|
||||
std::string result0;
|
||||
std::string result1;
|
||||
std::string result2;
|
||||
std::string result3;
|
||||
|
||||
// init
|
||||
|
||||
ggml_backend_load_all();
|
||||
|
||||
auto llama_init = common_init_from_params(params);
|
||||
|
||||
auto llama_init = common_init_from_params(params, true);
|
||||
auto * model = llama_init->model();
|
||||
auto * ctx = llama_init->context();
|
||||
|
||||
if (model == nullptr || ctx == nullptr) {
|
||||
fprintf(stderr, "%s : failed to init\n", __func__);
|
||||
if (model == nullptr) {
|
||||
LOG_ERR("%s: failed to init\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto sparams = llama_sampler_chain_default_params();
|
||||
GGML_ASSERT(llama_init->context() == nullptr);
|
||||
|
||||
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
// tokenize prompt
|
||||
auto tokens = common_tokenize(ctx, params.prompt, true);
|
||||
|
||||
const bool save_state = true;
|
||||
if (!common_prompt_batch_decode(ctx, tokens, n_past, params.n_batch, state_file, save_state)) {
|
||||
// Test 1: baseline (saves state to disk)
|
||||
auto result_baseline = test_baseline(model, params);
|
||||
if (result_baseline.empty()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// first run
|
||||
printf("\nfirst run: %s", params.prompt.c_str());
|
||||
|
||||
llama_batch batch = llama_batch_init(1, 0, 1);
|
||||
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto next_token = llama_sampler_sample(smpl, ctx, -1);
|
||||
auto next_token_str = common_token_to_piece(ctx, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result0 += next_token_str;
|
||||
|
||||
common_batch_clear(batch);
|
||||
common_batch_add(batch, next_token, n_past, {0}, true);
|
||||
|
||||
if (llama_decode(ctx, batch)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_batch_free(batch);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
}
|
||||
|
||||
printf("\n\n");
|
||||
|
||||
// make new context
|
||||
llama_context * ctx2 = llama_init_from_model(model, common_context_params_to_llama(params));
|
||||
|
||||
llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
printf("\nsecond run: %s", params.prompt.c_str());
|
||||
|
||||
// load state from file
|
||||
std::vector<llama_token> unused_sts(tokens.size()); // unused session tokens.
|
||||
size_t n_token_count_out = 0;
|
||||
|
||||
if (!llama_state_load_file(ctx2, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
|
||||
fprintf(stderr, "\n%s : failed to load state\n", __func__);
|
||||
// Test 2: state load
|
||||
if (!test_state_load(model, params, result_baseline)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
|
||||
|
||||
// restore state (last tokens)
|
||||
n_past = n_token_count_out;
|
||||
if (!common_replay_last_token(ctx2, tokens.back(), n_past)) {
|
||||
return 1;
|
||||
}
|
||||
++n_past;
|
||||
|
||||
// second run
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto next_token = llama_sampler_sample(smpl2, ctx2, -1);
|
||||
auto next_token_str = common_token_to_piece(ctx2, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result1 += next_token_str;
|
||||
|
||||
common_batch_clear(batch);
|
||||
common_batch_add(batch, next_token, n_past, {0}, true);
|
||||
|
||||
if (llama_decode(ctx2, batch)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_batch_free(batch);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
}
|
||||
|
||||
printf("\n\n");
|
||||
|
||||
if (result0 != result1) {
|
||||
fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
|
||||
// Test 3: seq copy (host)
|
||||
if (!test_seq_cp_host(model, params, result_baseline)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// make new context
|
||||
auto params_ctx3 = common_context_params_to_llama(params);
|
||||
params_ctx3.n_seq_max = 2;
|
||||
llama_context * ctx3 = llama_init_from_model(model, params_ctx3);
|
||||
|
||||
llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
printf("\nsingle seq run: %s", params.prompt.c_str());
|
||||
|
||||
// load state (rng, logits, embedding and kv_cache) from file
|
||||
n_token_count_out = 0;
|
||||
|
||||
if (!llama_state_load_file(ctx3, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
|
||||
fprintf(stderr, "\n%s : failed to load state\n", __func__);
|
||||
// Test 4: seq copy (device)
|
||||
if (!test_seq_cp_device(model, params, result_baseline)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
|
||||
|
||||
// restore state (last tokens)
|
||||
n_past = n_token_count_out;
|
||||
if (!common_replay_last_token(ctx3, tokens.back(), n_past)) {
|
||||
return 1;
|
||||
}
|
||||
++n_past;
|
||||
|
||||
// save seq 0 and load into seq 1
|
||||
{
|
||||
// save kv of seq 0
|
||||
std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
|
||||
const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0);
|
||||
if (ncopy != seq_store.size()) {
|
||||
fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
|
||||
return 1;
|
||||
}
|
||||
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
|
||||
|
||||
// erase whole kv
|
||||
llama_memory_clear(llama_get_memory(ctx3), true);
|
||||
fprintf(stderr, "%s : kv cache cleared\n", __func__);
|
||||
|
||||
// restore kv into seq 1
|
||||
const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1);
|
||||
if (nset != seq_store.size()) {
|
||||
fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
|
||||
return 1;
|
||||
}
|
||||
fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
|
||||
}
|
||||
|
||||
// third run with seq 1 instead of 0
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto next_token = llama_sampler_sample(smpl3, ctx3, -1);
|
||||
auto next_token_str = common_token_to_piece(ctx3, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result2 += next_token_str;
|
||||
|
||||
common_batch_clear(batch);
|
||||
common_batch_add(batch, next_token, n_past, {1}, true);
|
||||
|
||||
if (llama_decode(ctx3, batch)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_batch_free(batch);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
}
|
||||
|
||||
// test on-device state save/load
|
||||
auto params_ctx4 = common_context_params_to_llama(params);
|
||||
params_ctx4.n_seq_max = 2;
|
||||
llama_context * ctx4 = llama_init_from_model(model, params_ctx4);
|
||||
|
||||
llama_sampler * smpl4 = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl4, llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
printf("\nsingle seq run: %s", params.prompt.c_str());
|
||||
|
||||
// load state (rng, logits, embedding and kv_cache) from file
|
||||
n_token_count_out = 0;
|
||||
|
||||
if (!llama_state_load_file(ctx4, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
|
||||
fprintf(stderr, "\n%s : failed to load state\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
|
||||
|
||||
// restore state (last tokens)
|
||||
n_past = n_token_count_out;
|
||||
if (!common_replay_last_token(ctx4, tokens.back(), n_past)) {
|
||||
return 1;
|
||||
}
|
||||
++n_past;
|
||||
|
||||
// save seq 0 and load into seq 1
|
||||
{
|
||||
// save kv of seq 0
|
||||
std::vector<uint8_t> seq_store(llama_state_seq_get_size_ext(ctx4, 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE));
|
||||
const size_t ncopy = llama_state_seq_get_data_ext(ctx4, seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
if (ncopy != seq_store.size()) {
|
||||
fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
|
||||
return 1;
|
||||
}
|
||||
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
|
||||
|
||||
// erase whole kv
|
||||
llama_memory_clear(llama_get_memory(ctx4), true);
|
||||
fprintf(stderr, "%s : kv cache cleared\n", __func__);
|
||||
|
||||
// restore kv into seq 0
|
||||
const size_t nset = llama_state_seq_set_data_ext(ctx4, seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
if (nset != seq_store.size()) {
|
||||
fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
|
||||
return 1;
|
||||
}
|
||||
fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
|
||||
}
|
||||
|
||||
// forth run
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto next_token = llama_sampler_sample(smpl4, ctx4, -1);
|
||||
auto next_token_str = common_token_to_piece(ctx4, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result3 += next_token_str;
|
||||
|
||||
common_batch_clear(batch);
|
||||
common_batch_add(batch, next_token, n_past, {1}, true);
|
||||
|
||||
if (llama_decode(ctx4, batch)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_batch_free(batch);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
llama_sampler_free(smpl);
|
||||
llama_sampler_free(smpl2);
|
||||
llama_sampler_free(smpl3);
|
||||
llama_sampler_free(smpl4);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
// this one is managed by common_init_result
|
||||
//llama_free(ctx);
|
||||
|
||||
llama_free(ctx2);
|
||||
llama_free(ctx3);
|
||||
llama_free(ctx4);
|
||||
|
||||
if (result0 != result2) {
|
||||
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (result0 != result3) {
|
||||
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n%s : success\n", __func__);
|
||||
LOG("\nAll tests passed.\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -111,7 +111,6 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
|
||||
echo "Use $GGML_SYCL_DEVICE as main GPU"
|
||||
#use signle GPU only
|
||||
GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
|
||||
export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
|
||||
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
|
||||
else
|
||||
echo "Use all Intel GPUs, including iGPU & dGPU"
|
||||
|
||||
@@ -119,7 +119,6 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
|
||||
echo "Use $GGML_SYCL_DEVICE as main GPU"
|
||||
#use signle GPU only
|
||||
GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
|
||||
export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
|
||||
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
|
||||
else
|
||||
echo "Use all Intel GPUs, including iGPU & dGPU"
|
||||
|
||||
@@ -164,7 +164,6 @@ if not "%GGML_SYCL_DEVICE%"=="-1" (
|
||||
echo Use %GGML_SYCL_DEVICE% as main GPU
|
||||
REM Use single GPU only.
|
||||
set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
|
||||
set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
|
||||
echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR%
|
||||
) else (
|
||||
echo Use all Intel GPUs, including iGPU ^& dGPU
|
||||
|
||||
@@ -186,7 +186,6 @@ if not "%GGML_SYCL_DEVICE%"=="-1" (
|
||||
echo Use %GGML_SYCL_DEVICE% as main GPU
|
||||
REM Use single GPU only.
|
||||
set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
|
||||
set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
|
||||
echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR%
|
||||
) else (
|
||||
echo Use all Intel GPUs, including iGPU ^& dGPU
|
||||
|
||||
@@ -140,11 +140,12 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const floa
|
||||
};
|
||||
|
||||
switch (nc) {
|
||||
case 3: launch_kernel(std::integral_constant<int, 3>{}); break;
|
||||
case 4: launch_kernel(std::integral_constant<int, 4>{}); break;
|
||||
case 5: launch_kernel(std::integral_constant<int, 5>{}); break;
|
||||
case 9: launch_kernel(std::integral_constant<int, 9>{}); break;
|
||||
default: GGML_ABORT("Only support kernel sizes 3, 4, 5, 9 right now.");
|
||||
case 3: launch_kernel(std::integral_constant<int, 3 >{}); break;
|
||||
case 4: launch_kernel(std::integral_constant<int, 4 >{}); break;
|
||||
case 5: launch_kernel(std::integral_constant<int, 5 >{}); break;
|
||||
case 9: launch_kernel(std::integral_constant<int, 9 >{}); break;
|
||||
case 15: launch_kernel(std::integral_constant<int, 15>{}); break;
|
||||
default: GGML_ABORT("Only support kernel sizes 3, 4, 5, 9, 15 right now.");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
# include <cub/cub.cuh>
|
||||
# if (CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2)
|
||||
# define CUB_TOP_K_AVAILABLE
|
||||
# include <cuda/iterator>
|
||||
using namespace cub;
|
||||
# endif // CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
|
||||
@@ -2744,6 +2744,18 @@ static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session *
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_pad(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * src0 = op->src[0];
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_UNUSED(sess);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * src0 = op->src[0];
|
||||
const struct ggml_tensor * dst = op;
|
||||
@@ -2816,6 +2828,21 @@ static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session *
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
|
||||
const struct ggml_tensor * src0 = op->src[0];
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
if (src0->type != GGML_TYPE_F32) { return false; }
|
||||
if (dst->type != GGML_TYPE_F32) { return false; }
|
||||
if (!ggml_are_same_shape(src0, dst)) { return false; }
|
||||
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) { return false; }
|
||||
|
||||
return true;
|
||||
|
||||
GGML_UNUSED(sess);
|
||||
}
|
||||
|
||||
static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
|
||||
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
|
||||
return sess->c_name();
|
||||
@@ -2857,6 +2884,9 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
|
||||
case GGML_OP_FILL: return HTP_OP_FILL;
|
||||
case GGML_OP_DIAG: return HTP_OP_DIAG;
|
||||
case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI;
|
||||
case GGML_OP_TRI: return HTP_OP_TRI;
|
||||
case GGML_OP_PAD: return HTP_OP_PAD;
|
||||
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(t)) {
|
||||
case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU;
|
||||
@@ -3416,6 +3446,14 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
supp = ggml_hexagon_supported_solve_tri(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_TRI:
|
||||
supp = ggml_hexagon_supported_tri(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_PAD:
|
||||
supp = ggml_hexagon_supported_pad(sess, op);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -38,6 +38,7 @@ add_library(${HTP_LIB} SHARED
|
||||
diag-ops.c
|
||||
solve-tri-ops.c
|
||||
gated-delta-net-ops.c
|
||||
pad-ops.c
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE
|
||||
|
||||
@@ -107,5 +107,7 @@ int op_fill(struct htp_ops_context * octx);
|
||||
int op_diag(struct htp_ops_context * octx);
|
||||
int op_solve_tri(struct htp_ops_context * octx);
|
||||
int op_gated_delta_net(struct htp_ops_context * octx);
|
||||
int op_tri(struct htp_ops_context * octx);
|
||||
int op_pad(struct htp_ops_context * octx);
|
||||
|
||||
#endif /* HTP_CTX_H */
|
||||
|
||||
@@ -86,6 +86,8 @@ enum htp_op_code {
|
||||
HTP_OP_SOLVE_TRI,
|
||||
HTP_OP_L2_NORM,
|
||||
HTP_OP_GATED_DELTA_NET,
|
||||
HTP_OP_TRI,
|
||||
HTP_OP_PAD,
|
||||
|
||||
HTP_OP_INVALID
|
||||
};
|
||||
|
||||
@@ -595,9 +595,15 @@ static int execute_op(struct htp_ops_context * octx) {
|
||||
case HTP_OP_SOLVE_TRI:
|
||||
return op_solve_tri(octx);
|
||||
|
||||
case HTP_OP_PAD:
|
||||
return op_pad(octx);
|
||||
|
||||
case HTP_OP_GATED_DELTA_NET:
|
||||
return op_gated_delta_net(octx);
|
||||
|
||||
case HTP_OP_TRI:
|
||||
return op_tri(octx);
|
||||
|
||||
case HTP_OP_INVALID:
|
||||
break;
|
||||
|
||||
|
||||
545
ggml/src/ggml-hexagon/htp/pad-ops.c
Normal file
545
ggml/src/ggml-hexagon/htp/pad-ops.c
Normal file
@@ -0,0 +1,545 @@
|
||||
#pragma clang diagnostic ignored "-Wunused-variable"
|
||||
#pragma clang diagnostic ignored "-Wunused-function"
|
||||
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
||||
|
||||
#include <HAP_farf.h>
|
||||
#include <HAP_perf.h>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "hex-dma.h"
|
||||
#include "hvx-utils.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-ops.h"
|
||||
|
||||
/* Circular wrap: maps any integer x into [0, n) */
|
||||
static inline uint32_t wrap_around(int32_t x, uint32_t n) {
|
||||
return (uint32_t)(((x % (int32_t)n) + (int32_t)n) % (int32_t)n);
|
||||
}
|
||||
|
||||
/* Decompose a flat dst row index into (i1, i2, i3) */
|
||||
static inline void pad_decompose_row(uint32_t ir, uint32_t ne1, uint32_t ne2,
|
||||
uint32_t *i1, uint32_t *i2, uint32_t *i3) {
|
||||
*i1 = ir % ne1;
|
||||
*i2 = (ir / ne1) % ne2;
|
||||
*i3 = ir / (ne1 * ne2);
|
||||
}
|
||||
|
||||
/* Return non-zero if row (i1,i2,i3) falls in the non-padded interior */
|
||||
static inline int pad_is_interior(uint32_t i1, uint32_t i2, uint32_t i3,
|
||||
int32_t lp1, int32_t rp1, uint32_t ne1,
|
||||
int32_t lp2, int32_t rp2, uint32_t ne2,
|
||||
int32_t lp3, int32_t rp3, uint32_t ne3) {
|
||||
return ((int32_t)i1 >= lp1 && (int32_t)i1 < (int32_t)ne1 - rp1) &&
|
||||
((int32_t)i2 >= lp2 && (int32_t)i2 < (int32_t)ne2 - rp2) &&
|
||||
((int32_t)i3 >= lp3 && (int32_t)i3 < (int32_t)ne3 - rp3);
|
||||
}
|
||||
|
||||
/* Compute the DDR src row pointer for a zero-pad interior row */
|
||||
static inline const uint8_t * pad_src_row_ptr(const struct htp_tensor * src,
|
||||
uint32_t i1, uint32_t i2, uint32_t i3,
|
||||
int32_t lp1, int32_t lp2, int32_t lp3) {
|
||||
return (const uint8_t *) src->data
|
||||
+ (i1 - (uint32_t)lp1) * src->nb[1]
|
||||
+ (i2 - (uint32_t)lp2) * src->nb[2]
|
||||
+ (i3 - (uint32_t)lp3) * src->nb[3];
|
||||
}
|
||||
|
||||
/* Compute the DDR src row pointer for a circular row (wrap-around indexing) */
|
||||
static inline const uint8_t * pad_circ_src_row_ptr(const struct htp_tensor * src,
|
||||
uint32_t i1, uint32_t i2, uint32_t i3,
|
||||
int32_t lp1, int32_t lp2, int32_t lp3) {
|
||||
return (const uint8_t *) src->data
|
||||
+ wrap_around((int32_t)i1 - lp1, src->ne[1]) * src->nb[1]
|
||||
+ wrap_around((int32_t)i2 - lp2, src->ne[2]) * src->nb[2]
|
||||
+ wrap_around((int32_t)i3 - lp3, src->ne[3]) * src->nb[3];
|
||||
}
|
||||
|
||||
struct htp_pad_context {
|
||||
struct htp_ops_context * octx;
|
||||
|
||||
int32_t lp0, rp0;
|
||||
int32_t lp1, rp1;
|
||||
int32_t lp2, rp2;
|
||||
int32_t lp3, rp3;
|
||||
|
||||
uint32_t nrows_per_thread;
|
||||
uint32_t total_dst_rows;
|
||||
|
||||
size_t type_size;
|
||||
|
||||
// Row sizes for DMA kernel (populated when VTCM is available)
|
||||
size_t src_row_size;
|
||||
size_t src_row_size_aligned;
|
||||
size_t dst_row_size;
|
||||
size_t dst_row_size_aligned;
|
||||
};
|
||||
|
||||
#define htp_pad_preamble \
|
||||
const struct htp_tensor * src = octx->src[0]; \
|
||||
const struct htp_tensor * dst = octx->dst; \
|
||||
\
|
||||
const uint32_t ne00 = src->ne[0]; \
|
||||
const uint32_t nb00 = src->nb[0]; \
|
||||
\
|
||||
const uint32_t ne0 = dst->ne[0]; \
|
||||
const uint32_t ne1 = dst->ne[1]; \
|
||||
const uint32_t ne2 = dst->ne[2]; \
|
||||
const uint32_t ne3 = dst->ne[3]; \
|
||||
\
|
||||
const uint32_t nb1 = dst->nb[1]; \
|
||||
const uint32_t nb2 = dst->nb[2]; \
|
||||
const uint32_t nb3 = dst->nb[3]; \
|
||||
\
|
||||
const int32_t lp0 = pctx->lp0, rp0 = pctx->rp0; \
|
||||
const int32_t lp1 = pctx->lp1, rp1 = pctx->rp1; \
|
||||
const int32_t lp2 = pctx->lp2, rp2 = pctx->rp2; \
|
||||
const int32_t lp3 = pctx->lp3, rp3 = pctx->rp3; \
|
||||
\
|
||||
const size_t type_size = pctx->type_size; \
|
||||
\
|
||||
const uint32_t row_start = pctx->nrows_per_thread * ith; \
|
||||
const uint32_t row_end = MIN(row_start + pctx->nrows_per_thread, pctx->total_dst_rows);
|
||||
|
||||
|
||||
#define htp_pad_dma_preamble \
|
||||
const size_t src_row_size = pctx->src_row_size; \
|
||||
const size_t src_row_size_aligned = pctx->src_row_size_aligned; \
|
||||
const size_t dst_row_size = pctx->dst_row_size; \
|
||||
const size_t dst_row_size_aligned = pctx->dst_row_size_aligned; \
|
||||
\
|
||||
uint8_t * src_spad_base = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread; \
|
||||
uint8_t * dst_spad_base = octx->dst_spad.data + ith * octx->dst_spad.size_per_thread; \
|
||||
\
|
||||
dma_queue * dma = octx->ctx->dma[ith];
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HVX vectorized PAD kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static void pad_job_per_thread_hvx(unsigned int nth, unsigned int ith, void * data) {
|
||||
const struct htp_pad_context * pctx = (const struct htp_pad_context *) data;
|
||||
struct htp_ops_context * octx = pctx->octx;
|
||||
htp_pad_preamble;
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
for (uint32_t dst_row = row_start; dst_row < row_end; dst_row++) {
|
||||
uint32_t i1, i2, i3;
|
||||
pad_decompose_row(dst_row, ne1, ne2, &i1, &i2, &i3);
|
||||
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3;
|
||||
|
||||
const int interior = pad_is_interior(i1, i2, i3,
|
||||
lp1, rp1, ne1,
|
||||
lp2, rp2, ne2,
|
||||
lp3, rp3, ne3);
|
||||
|
||||
if (!interior) {
|
||||
hvx_splat_f32_u(dst_ptr, 0.0f, ne0);
|
||||
} else {
|
||||
const uint8_t * src_ptr = pad_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3);
|
||||
|
||||
if (lp0 > 0) {
|
||||
hvx_splat_f32_u(dst_ptr, 0.0f, (uint32_t)lp0);
|
||||
}
|
||||
|
||||
uint8_t * dst_row_start = dst_ptr + (size_t)lp0 * type_size;
|
||||
if (nb00 == type_size) {
|
||||
hvx_copy_f32_uu(dst_row_start, src_ptr, ne00);
|
||||
} else {
|
||||
for (uint32_t i = 0; i < ne00; i++) {
|
||||
memcpy(dst_row_start + i * type_size,
|
||||
src_ptr + (size_t)i * nb00,
|
||||
type_size);
|
||||
}
|
||||
}
|
||||
|
||||
if (rp0 > 0) {
|
||||
hvx_splat_f32_u(dst_ptr + ((size_t)lp0 + ne00) * type_size, 0.0f, (uint32_t)rp0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "pad-hvx %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n",
|
||||
ith, nth,
|
||||
src->ne[0], src->ne[1], src->ne[2], src->ne[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
row_start, row_end,
|
||||
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HVX + DMA PAD kernel — aligned, double-buffered
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static void pad_job_per_thread_hvx_dma(unsigned int nth, unsigned int ith, void * data) {
|
||||
const struct htp_pad_context * pctx = (const struct htp_pad_context *) data;
|
||||
struct htp_ops_context * octx = pctx->octx;
|
||||
htp_pad_preamble;
|
||||
htp_pad_dma_preamble;
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Priming phase: push 2 pairs of (dummy_dst_DMA, src_DMA) to seed the
|
||||
// double-buffer pipeline before the main loop begins.
|
||||
// -----------------------------------------------------------------------
|
||||
for (uint32_t ir = row_start, spad_idx = 0; ir < row_end && spad_idx < 2; ir++, spad_idx++) {
|
||||
uint8_t * src_spad_cur = src_spad_base + spad_idx * src_row_size_aligned;
|
||||
uint8_t * dst_spad_cur = dst_spad_base + spad_idx * dst_row_size_aligned;
|
||||
|
||||
dma_queue_push_vtcm_to_ddr(dma,
|
||||
dma_make_ptr((uint8_t *)dst->data, dst_spad_cur),
|
||||
dst_row_size, dst_row_size_aligned, 0);
|
||||
|
||||
uint32_t i1, i2, i3;
|
||||
pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3);
|
||||
const int interior = pad_is_interior(i1, i2, i3,
|
||||
lp1, rp1, ne1,
|
||||
lp2, rp2, ne2,
|
||||
lp3, rp3, ne3);
|
||||
|
||||
const uint8_t * src_ptr = interior
|
||||
? pad_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3) : NULL;
|
||||
|
||||
// Interior row: real DMA (1 row) from DDR to VTCM.
|
||||
// Border row: null DMA (nrows=0)
|
||||
dma_queue_push_ddr_to_vtcm(dma,
|
||||
dma_make_ptr(src_spad_cur,
|
||||
src_ptr ? src_ptr : (const uint8_t *)src_spad_cur),
|
||||
src_row_size_aligned, src_row_size, src_ptr ? 1 : 0);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Main loop: pop completed DMAs, compute in VTCM with aligned HVX ops,
|
||||
// push dst DMA and prefetch src for the next+1 row.
|
||||
// -----------------------------------------------------------------------
|
||||
for (uint32_t ir = row_start; ir < row_end; ir++) {
|
||||
uint8_t * dst_spad_cur = (uint8_t *) dma_queue_pop(dma).src;
|
||||
uint8_t * src_spad_cur = (uint8_t *) dma_queue_pop(dma).dst;
|
||||
|
||||
uint32_t i1, i2, i3;
|
||||
pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3);
|
||||
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3;
|
||||
|
||||
const int interior = pad_is_interior(i1, i2, i3,
|
||||
lp1, rp1, ne1,
|
||||
lp2, rp2, ne2,
|
||||
lp3, rp3, ne3);
|
||||
|
||||
if (!interior) {
|
||||
hvx_splat_f32_a(dst_spad_cur, 0.0f, ne0);
|
||||
} else {
|
||||
hvx_splat_f32_a(dst_spad_cur, 0.0f, ne0);
|
||||
|
||||
uint8_t * dst_interior = dst_spad_cur + (size_t)lp0 * type_size;
|
||||
|
||||
if ((uintptr_t)dst_interior % VLEN == 0) {
|
||||
hvx_copy_f32_aa(dst_interior, src_spad_cur, ne00);
|
||||
} else {
|
||||
hvx_copy_f32_ua(dst_interior, src_spad_cur, ne00);
|
||||
}
|
||||
}
|
||||
|
||||
dma_queue_push_vtcm_to_ddr(dma,
|
||||
dma_make_ptr(dst_ptr, dst_spad_cur),
|
||||
dst_row_size, dst_row_size_aligned, 1);
|
||||
|
||||
const uint32_t next_row = ir + 2;
|
||||
if (next_row < row_end) {
|
||||
uint32_t ni1, ni2, ni3;
|
||||
pad_decompose_row(next_row, ne1, ne2, &ni1, &ni2, &ni3);
|
||||
const int next_interior = pad_is_interior(ni1, ni2, ni3,
|
||||
lp1, rp1, ne1,
|
||||
lp2, rp2, ne2,
|
||||
lp3, rp3, ne3);
|
||||
const uint8_t * next_src_ptr = next_interior
|
||||
? pad_src_row_ptr(src, ni1, ni2, ni3, lp1, lp2, lp3) : NULL;
|
||||
|
||||
dma_queue_push_ddr_to_vtcm(dma,
|
||||
dma_make_ptr(src_spad_cur,
|
||||
next_src_ptr ? next_src_ptr : (const uint8_t *)src_spad_cur),
|
||||
src_row_size_aligned, src_row_size, next_src_ptr ? 1 : 0);
|
||||
}
|
||||
}
|
||||
|
||||
dma_queue_flush(dma);
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "pad-hvx-dma %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n",
|
||||
ith, nth,
|
||||
src->ne[0], src->ne[1], src->ne[2], src->ne[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
row_start, row_end,
|
||||
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HVX circular PAD kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static void pad_job_per_thread_hvx_circular(unsigned int nth, unsigned int ith, void * data) {
|
||||
const struct htp_pad_context * pctx = (const struct htp_pad_context *) data;
|
||||
struct htp_ops_context * octx = pctx->octx;
|
||||
htp_pad_preamble;
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
for (uint32_t dst_row = row_start; dst_row < row_end; dst_row++) {
|
||||
uint32_t i1, i2, i3;
|
||||
pad_decompose_row(dst_row, ne1, ne2, &i1, &i2, &i3);
|
||||
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3;
|
||||
const uint8_t * src_row = pad_circ_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3);
|
||||
|
||||
if (nb00 == type_size) {
|
||||
|
||||
if (lp0 > 0) {
|
||||
if ((uint32_t)lp0 < 32) {
|
||||
memcpy(dst_ptr,
|
||||
src_row + (size_t)(ne00 - (uint32_t)lp0) * type_size,
|
||||
(size_t)lp0 * type_size);
|
||||
} else {
|
||||
hvx_copy_f32_uu(dst_ptr,
|
||||
src_row + (size_t)(ne00 - (uint32_t)lp0) * type_size,
|
||||
(uint32_t)lp0);
|
||||
}
|
||||
}
|
||||
hvx_copy_f32_uu(dst_ptr + (size_t)lp0 * type_size, src_row, ne00);
|
||||
if (rp0 > 0) {
|
||||
if ((uint32_t)rp0 < 32) {
|
||||
memcpy(dst_ptr + ((size_t)lp0 + ne00) * type_size,
|
||||
src_row,
|
||||
(size_t)rp0 * type_size);
|
||||
} else {
|
||||
hvx_copy_f32_uu(dst_ptr + ((size_t)lp0 + ne00) * type_size,
|
||||
src_row,
|
||||
(uint32_t)rp0);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (uint32_t i = 0; i < (uint32_t)lp0; i++) {
|
||||
*(float *)(dst_ptr + i * type_size) =
|
||||
*(const float *)(src_row + (size_t)(ne00 - (uint32_t)lp0 + i) * nb00);
|
||||
}
|
||||
for (uint32_t i = 0; i < ne00; i++) {
|
||||
*(float *)(dst_ptr + ((size_t)lp0 + i) * type_size) =
|
||||
*(const float *)(src_row + (size_t)i * nb00);
|
||||
}
|
||||
for (uint32_t i = 0; i < (uint32_t)rp0; i++) {
|
||||
*(float *)(dst_ptr + ((size_t)lp0 + ne00 + i) * type_size) =
|
||||
*(const float *)(src_row + (size_t)i * nb00);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "pad-hvx-circ %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n",
|
||||
ith, nth,
|
||||
src->ne[0], src->ne[1], src->ne[2], src->ne[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
row_start, row_end,
|
||||
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HVX + DMA circular PAD kernel — aligned, double-buffered
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static void pad_job_per_thread_hvx_circular_dma(unsigned int nth, unsigned int ith, void * data) {
|
||||
const struct htp_pad_context * pctx = (const struct htp_pad_context *) data;
|
||||
struct htp_ops_context * octx = pctx->octx;
|
||||
htp_pad_preamble;
|
||||
htp_pad_dma_preamble;
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Priming phase: push 2 pairs of (dummy_dst_DMA, src_DMA) to seed the
|
||||
// double-buffer pipeline. Every row is a real src DMA (no null DMAs).
|
||||
// -----------------------------------------------------------------------
|
||||
for (uint32_t ir = row_start, spad_idx = 0; ir < row_end && spad_idx < 2; ir++, spad_idx++) {
|
||||
uint8_t * src_spad_cur = src_spad_base + spad_idx * src_row_size_aligned;
|
||||
uint8_t * dst_spad_cur = dst_spad_base + spad_idx * dst_row_size_aligned;
|
||||
|
||||
dma_queue_push_vtcm_to_ddr(dma,
|
||||
dma_make_ptr((uint8_t *)dst->data, dst_spad_cur),
|
||||
dst_row_size, dst_row_size_aligned, 0);
|
||||
|
||||
uint32_t pi1, pi2, pi3;
|
||||
pad_decompose_row(ir, ne1, ne2, &pi1, &pi2, &pi3);
|
||||
dma_queue_push_ddr_to_vtcm(dma,
|
||||
dma_make_ptr(src_spad_cur, pad_circ_src_row_ptr(src, pi1, pi2, pi3, lp1, lp2, lp3)),
|
||||
src_row_size_aligned, src_row_size, 1);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Main loop: pop completed DMAs, assemble circular row in VTCM with
|
||||
// aligned HVX ops, push dst DMA and prefetch src for the next+1 row.
|
||||
// -----------------------------------------------------------------------
|
||||
for (uint32_t ir = row_start; ir < row_end; ir++) {
|
||||
uint8_t * dst_spad_cur = (uint8_t *) dma_queue_pop(dma).src;
|
||||
uint8_t * src_spad_cur = (uint8_t *) dma_queue_pop(dma).dst;
|
||||
|
||||
uint32_t i1, i2, i3;
|
||||
pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3);
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3;
|
||||
|
||||
|
||||
if (lp0 > 0) {
|
||||
uint8_t * dst_left = dst_spad_cur;
|
||||
const uint8_t * src_left = src_spad_cur + (size_t)(ne00 - (uint32_t)lp0) * type_size;
|
||||
if ((uint32_t)lp0 < 32) {
|
||||
memcpy(dst_left, src_left, (size_t)lp0 * type_size);
|
||||
} else {
|
||||
hvx_copy_f32_uu(dst_left, src_left, (uint32_t)lp0);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
uint8_t * dst_mid = dst_spad_cur + (size_t)lp0 * type_size;
|
||||
if ((uintptr_t)dst_mid % VLEN == 0) {
|
||||
hvx_copy_f32_aa(dst_mid, src_spad_cur, ne00);
|
||||
} else {
|
||||
hvx_copy_f32_ua(dst_mid, src_spad_cur, ne00);
|
||||
}
|
||||
}
|
||||
|
||||
if (rp0 > 0) {
|
||||
uint8_t * dst_right = dst_spad_cur + ((size_t)lp0 + ne00) * type_size;
|
||||
if ((uint32_t)rp0 < 32) {
|
||||
memcpy(dst_right, src_spad_cur, (size_t)rp0 * type_size);
|
||||
} else {
|
||||
if ((uintptr_t)dst_right % VLEN == 0) {
|
||||
hvx_copy_f32_aa(dst_right, src_spad_cur, (uint32_t)rp0);
|
||||
} else {
|
||||
hvx_copy_f32_ua(dst_right, src_spad_cur, (uint32_t)rp0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dma_queue_push_vtcm_to_ddr(dma,
|
||||
dma_make_ptr(dst_ptr, dst_spad_cur),
|
||||
dst_row_size, dst_row_size_aligned, 1);
|
||||
|
||||
const uint32_t next_row = ir + 2;
|
||||
if (next_row < row_end) {
|
||||
uint32_t nri1, nri2, nri3;
|
||||
pad_decompose_row(next_row, ne1, ne2, &nri1, &nri2, &nri3);
|
||||
dma_queue_push_ddr_to_vtcm(dma,
|
||||
dma_make_ptr(src_spad_cur,
|
||||
pad_circ_src_row_ptr(src, nri1, nri2, nri3, lp1, lp2, lp3)),
|
||||
src_row_size_aligned, src_row_size, 1);
|
||||
}
|
||||
}
|
||||
|
||||
dma_queue_flush(dma);
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "pad-hvx-circ-dma %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n",
|
||||
ith, nth,
|
||||
src->ne[0], src->ne[1], src->ne[2], src->ne[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
row_start, row_end,
|
||||
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
int op_pad(struct htp_ops_context * octx) {
|
||||
const struct htp_tensor * src0 = octx->src[0];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
// Only F32 supported
|
||||
size_t type_size;
|
||||
switch (src0->type) {
|
||||
case HTP_TYPE_F32: type_size = 4; break;
|
||||
default:
|
||||
FARF(ERROR, "pad-hvx: unsupported type %u\n", src0->type);
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
const int32_t lp0 = octx->op_params[0];
|
||||
const int32_t rp0 = octx->op_params[1];
|
||||
const int32_t lp1 = octx->op_params[2];
|
||||
const int32_t rp1 = octx->op_params[3];
|
||||
const int32_t lp2 = octx->op_params[4];
|
||||
const int32_t rp2 = octx->op_params[5];
|
||||
const int32_t lp3 = octx->op_params[6];
|
||||
const int32_t rp3 = octx->op_params[7];
|
||||
const int32_t circular = octx->op_params[8];
|
||||
|
||||
const uint32_t ne0 = dst->ne[0];
|
||||
const uint32_t ne00 = src0->ne[0];
|
||||
|
||||
const uint32_t total_dst_rows = dst->ne[1] * dst->ne[2] * dst->ne[3];
|
||||
const uint32_t n_threads = MIN(octx->n_threads, total_dst_rows > 0 ? total_dst_rows : 1);
|
||||
|
||||
const size_t src_row_size = (size_t)ne00 * type_size;
|
||||
const size_t dst_row_size = (size_t)ne0 * type_size;
|
||||
const size_t src_row_size_aligned = hex_round_up(src_row_size, VLEN);
|
||||
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
|
||||
|
||||
// Total VTCM needed: 2 buffers (ping+pong) for src and dst, per thread
|
||||
const size_t vtcm_needed = (size_t)n_threads * 2 * (src_row_size_aligned + dst_row_size_aligned);
|
||||
|
||||
const int use_dma = (src0->nb[0] == (uint32_t)type_size) &&
|
||||
(ne00 >= 512) &&
|
||||
(octx->ctx->vtcm_base != NULL) &&
|
||||
(octx->ctx->vtcm_size >= vtcm_needed);
|
||||
|
||||
if (use_dma) {
|
||||
octx->src0_spad.size_per_thread = 2 * src_row_size_aligned;
|
||||
octx->dst_spad.size_per_thread = 2 * dst_row_size_aligned;
|
||||
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
|
||||
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
|
||||
octx->src0_spad.data = octx->ctx->vtcm_base;
|
||||
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
||||
}
|
||||
|
||||
struct htp_pad_context pctx = {
|
||||
.octx = octx,
|
||||
.lp0 = lp0, .rp0 = rp0,
|
||||
.lp1 = lp1, .rp1 = rp1,
|
||||
.lp2 = lp2, .rp2 = rp2,
|
||||
.lp3 = lp3, .rp3 = rp3,
|
||||
.nrows_per_thread = (total_dst_rows + n_threads - 1) / n_threads,
|
||||
.total_dst_rows = total_dst_rows,
|
||||
.type_size = type_size,
|
||||
.src_row_size = src_row_size,
|
||||
.src_row_size_aligned = src_row_size_aligned,
|
||||
.dst_row_size = dst_row_size,
|
||||
.dst_row_size_aligned = dst_row_size_aligned,
|
||||
};
|
||||
|
||||
FARF(HIGH, "pad-hvx%s%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) pads=(%d,%d,%d,%d,%d,%d,%d,%d)\n",
|
||||
circular ? "-circ" : "",
|
||||
use_dma ? "-dma" : "",
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
|
||||
|
||||
if (circular && use_dma) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_circular_dma, &pctx, n_threads); }
|
||||
else if (circular) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_circular, &pctx, n_threads); }
|
||||
else if (use_dma) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_dma, &pctx, n_threads); }
|
||||
else { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx, &pctx, n_threads); }
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
@@ -17,7 +17,6 @@
|
||||
#include "ggml-common.h"
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-ops.h"
|
||||
#include "htp-ops.h"
|
||||
|
||||
struct htp_unary_context {
|
||||
struct htp_ops_context * octx;
|
||||
@@ -277,6 +276,95 @@ static void sigmoid_f32(const float * restrict src,
|
||||
}
|
||||
}
|
||||
|
||||
static void tri_f32(const float * restrict src,
|
||||
float * restrict dst,
|
||||
uint8_t * restrict spad,
|
||||
const uint32_t num_rows,
|
||||
const uint32_t row_elems,
|
||||
const size_t row_size,
|
||||
int32_t * op_params,
|
||||
const uint32_t ir,
|
||||
const struct htp_unary_context * uctx) {
|
||||
|
||||
const int32_t ttype = op_params[0];
|
||||
const HVX_Vector zero = hvx_vec_splat_f32(0.0f);
|
||||
const uint32_t nvec = row_elems / VLEN_FP32;
|
||||
const uint32_t nloe = row_elems % VLEN_FP32;
|
||||
|
||||
const uint32_t ne01 = uctx->octx->src[0]->ne[1];
|
||||
|
||||
for (uint32_t b = 0; b < num_rows; b++) {
|
||||
const uint32_t abs_row = ir + b;
|
||||
const uint32_t i01 = abs_row % ne01;
|
||||
|
||||
const HVX_Vector * restrict v_src = (const HVX_Vector *) ((const uint8_t *) src + b * row_size);
|
||||
HVX_Vector * restrict v_dst = (HVX_Vector *) ((uint8_t *) dst + b * row_size);
|
||||
|
||||
uint32_t boundary;
|
||||
int keep_left;
|
||||
switch (ttype) {
|
||||
case 0: boundary = i01; keep_left = 0; break; // keep col >= row
|
||||
case 1: boundary = i01 + 1; keep_left = 0; break; // keep col > row
|
||||
case 2: boundary = i01 + 1; keep_left = 1; break; // keep col <= row
|
||||
case 3: boundary = i01; keep_left = 1; break; // keep col < row
|
||||
default: boundary = 0; keep_left = 0; break;
|
||||
}
|
||||
if (boundary > row_elems) boundary = row_elems;
|
||||
|
||||
// Full HVX vectors — each starts at a 128-byte aligned offset
|
||||
for (uint32_t i = 0; i < nvec; i++) {
|
||||
const uint32_t vec_start = i * VLEN_FP32;
|
||||
const uint32_t vec_end = vec_start + VLEN_FP32;
|
||||
if (keep_left) {
|
||||
if (vec_end <= boundary) {
|
||||
v_dst[i] = v_src[i];
|
||||
} else if (vec_start >= boundary) {
|
||||
v_dst[i] = zero;
|
||||
} else {
|
||||
HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float));
|
||||
v_dst[i] = Q6_V_vmux_QVV(mask, v_src[i], zero);
|
||||
}
|
||||
} else {
|
||||
if (vec_end <= boundary) {
|
||||
v_dst[i] = zero;
|
||||
} else if (vec_start >= boundary) {
|
||||
v_dst[i] = v_src[i];
|
||||
} else {
|
||||
HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float));
|
||||
v_dst[i] = Q6_V_vmux_QVV(mask, zero, v_src[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Tail elements (row_elems not a multiple of VLEN_FP32)
|
||||
if (nloe > 0) {
|
||||
const uint32_t vec_start = nvec * VLEN_FP32;
|
||||
const uint32_t vec_end = vec_start + nloe;
|
||||
HVX_Vector tail_val;
|
||||
if (keep_left) {
|
||||
if (vec_end <= boundary) {
|
||||
tail_val = v_src[nvec];
|
||||
} else if (vec_start >= boundary) {
|
||||
tail_val = zero;
|
||||
} else {
|
||||
HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float));
|
||||
tail_val = Q6_V_vmux_QVV(mask, v_src[nvec], zero);
|
||||
}
|
||||
} else {
|
||||
if (vec_end <= boundary) {
|
||||
tail_val = zero;
|
||||
} else if (vec_start >= boundary) {
|
||||
tail_val = v_src[nvec];
|
||||
} else {
|
||||
HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float));
|
||||
tail_val = Q6_V_vmux_QVV(mask, zero, v_src[nvec]);
|
||||
}
|
||||
}
|
||||
hvx_vec_store_a(&v_dst[nvec], nloe * sizeof(float), tail_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void softplus_f32(const float * restrict src,
|
||||
float * restrict dst,
|
||||
uint8_t * restrict spad,
|
||||
@@ -498,6 +586,9 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
|
||||
case HTP_OP_L2_NORM:
|
||||
l2_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
|
||||
break;
|
||||
case HTP_OP_TRI:
|
||||
tri_f32(src0_spad, dst_spad, NULL, block_size, ne00, src0_row_size_aligned, op_params, ir, uctx);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -571,6 +662,10 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
|
||||
case HTP_OP_L2_NORM:
|
||||
op_type = "l2norm-f32";
|
||||
break;
|
||||
case HTP_OP_TRI:
|
||||
op_type = "tri-f32";
|
||||
break;
|
||||
|
||||
default:
|
||||
FARF(ERROR, "Unsupported unary Op %u\n", octx->op);
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
@@ -640,6 +735,22 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
|
||||
return err;
|
||||
}
|
||||
|
||||
int op_tri(struct htp_ops_context * octx) {
|
||||
int err = HTP_STATUS_OK;
|
||||
|
||||
switch (octx->src[0]->type) {
|
||||
case HTP_TYPE_F32:
|
||||
err = execute_op_unary_f32(octx);
|
||||
break;
|
||||
|
||||
default:
|
||||
err = HTP_STATUS_NO_SUPPORT;
|
||||
break;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int op_unary(struct htp_ops_context * octx) {
|
||||
int err = HTP_STATUS_OK;
|
||||
|
||||
|
||||
@@ -199,6 +199,14 @@ static ggml_guid_t ggml_backend_rpc_guid() {
|
||||
return &guid;
|
||||
}
|
||||
|
||||
struct ggml_backend_rpc_device_context {
|
||||
std::string endpoint;
|
||||
uint32_t device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
uint64_t last_graph_uid;
|
||||
};
|
||||
|
||||
struct ggml_backend_rpc_buffer_type_context {
|
||||
std::string endpoint;
|
||||
uint32_t device;
|
||||
@@ -211,7 +219,6 @@ struct ggml_backend_rpc_context {
|
||||
std::string endpoint;
|
||||
uint32_t device;
|
||||
std::string name;
|
||||
uint64_t last_graph_uid;
|
||||
};
|
||||
|
||||
struct ggml_backend_rpc_buffer_context {
|
||||
@@ -691,9 +698,11 @@ static void serialize_graph(uint32_t device, const ggml_cgraph * cgraph, std::ve
|
||||
|
||||
static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
||||
ggml_backend_dev_t rpc_dev = ggml_backend_get_device(backend);
|
||||
ggml_backend_rpc_device_context * rpc_dev_ctx = (ggml_backend_rpc_device_context *)rpc_dev->context;
|
||||
|
||||
GGML_ASSERT(cgraph->n_nodes > 0);
|
||||
bool reuse = cgraph->uid != 0 && rpc_ctx->last_graph_uid == cgraph->uid;
|
||||
bool reuse = cgraph->uid != 0 && rpc_dev_ctx->last_graph_uid == cgraph->uid;
|
||||
if (reuse) {
|
||||
rpc_msg_graph_recompute_req request;
|
||||
request.device = rpc_ctx->device;
|
||||
@@ -701,7 +710,7 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_RECOMPUTE, &request, sizeof(request));
|
||||
RPC_STATUS_ASSERT(status);
|
||||
} else {
|
||||
rpc_ctx->last_graph_uid = cgraph->uid;
|
||||
rpc_dev_ctx->last_graph_uid = cgraph->uid;
|
||||
std::vector<uint8_t> input;
|
||||
serialize_graph(rpc_ctx->device, cgraph, input);
|
||||
auto sock = get_socket(rpc_ctx->endpoint);
|
||||
@@ -770,7 +779,6 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) {
|
||||
/* .endpoint = */ endpoint,
|
||||
/* .device = */ device,
|
||||
/* .name = */ dev_name,
|
||||
/* .last_graph_uid = */ 0,
|
||||
};
|
||||
auto reg = ggml_backend_rpc_add_server(endpoint);
|
||||
ggml_backend_t backend = new ggml_backend {
|
||||
@@ -1757,15 +1765,6 @@ void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir
|
||||
}
|
||||
}
|
||||
|
||||
// device interface
|
||||
|
||||
struct ggml_backend_rpc_device_context {
|
||||
std::string endpoint;
|
||||
uint32_t device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) {
|
||||
ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
|
||||
|
||||
@@ -1947,10 +1946,11 @@ ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint) {
|
||||
std::string dev_name = "RPC" + std::to_string(dev_id);
|
||||
std::string dev_desc = std::string(endpoint);
|
||||
ggml_backend_rpc_device_context * dev_ctx = new ggml_backend_rpc_device_context {
|
||||
/* .endpoint = */ endpoint,
|
||||
/* .device = */ ind,
|
||||
/* .name = */ dev_name,
|
||||
/* .description = */ dev_desc
|
||||
/* .endpoint = */ endpoint,
|
||||
/* .device = */ ind,
|
||||
/* .name = */ dev_name,
|
||||
/* .description = */ dev_desc,
|
||||
/* .last_graph_uid = */ 0,
|
||||
};
|
||||
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
|
||||
@@ -72,6 +72,7 @@ int g_ggml_sycl_disable_graph = 0;
|
||||
int g_ggml_sycl_disable_dnn = 0;
|
||||
int g_ggml_sycl_prioritize_dmmv = 0;
|
||||
int g_ggml_sycl_use_async_mem_op = 0;
|
||||
int g_ggml_sycl_use_async_mem_op_requested = 1;
|
||||
int g_ggml_sycl_enable_level_zero = 0;
|
||||
int g_ggml_sycl_enable_flash_attention = 1;
|
||||
|
||||
@@ -304,6 +305,8 @@ static void ggml_check_sycl() try {
|
||||
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
|
||||
#endif
|
||||
GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
|
||||
g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
|
||||
GGML_LOG_INFO(" GGML_SYCL_USE_ASYNC_MEM_OP: %d\n", g_ggml_sycl_use_async_mem_op_requested);
|
||||
|
||||
#ifdef SYCL_FLASH_ATTN
|
||||
GGML_LOG_INFO(" GGML_SYCL_ENABLE_FLASH_ATTN: %d\n", g_ggml_sycl_enable_flash_attention);
|
||||
@@ -319,11 +322,11 @@ static void ggml_check_sycl() try {
|
||||
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
|
||||
#endif
|
||||
*/
|
||||
// Currently, we only use async malloc / free when graphs are enabled as it is required for the calls to be
|
||||
// properly recorded. As this SYCL extension matures it may be beneficial to enable as the default path and in
|
||||
// other places.
|
||||
// Async USM allocation/free is also useful outside the graph path: it avoids the host waits in the reorder
|
||||
// staging path while preserving queue ordering semantics. Graph support still depends on the extension being
|
||||
// available, but it no longer needs to control the non-graph fast path.
|
||||
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
|
||||
g_ggml_sycl_use_async_mem_op = !g_ggml_sycl_disable_graph;
|
||||
g_ggml_sycl_use_async_mem_op = g_ggml_sycl_use_async_mem_op_requested || !g_ggml_sycl_disable_graph;
|
||||
if (g_ggml_sycl_use_async_mem_op) {
|
||||
for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); ++i) {
|
||||
if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) {
|
||||
@@ -2385,21 +2388,25 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
|
||||
const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
|
||||
|
||||
#if GGML_SYCL_DNNL
|
||||
if (!g_ggml_sycl_disable_dnn) {
|
||||
DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i,
|
||||
DnnlGemmWrapper::to_dt<float>(), src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
|
||||
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
|
||||
*stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff,
|
||||
src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
|
||||
dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
|
||||
const int64_t gemm_flops = (int64_t)row_diff * src1_ncols * ne10;
|
||||
const bool use_mkl_direct = gemm_flops < 256 * 256 * 256;
|
||||
#if GGML_SYCL_DNNL
|
||||
if (!g_ggml_sycl_disable_dnn && !use_mkl_direct) {
|
||||
DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i,
|
||||
DnnlGemmWrapper::to_dt<float>(), src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
|
||||
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
|
||||
*stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff,
|
||||
src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
|
||||
dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_UNUSED(dst);
|
||||
|
||||
@@ -85,6 +85,32 @@ static __dpct_inline__ int get_int_from_uint8_aligned(
|
||||
(const int*)(x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
|
||||
}
|
||||
|
||||
static __dpct_inline__ int byte_sub_4(const int a, const int b) {
|
||||
const uint32_t ua = static_cast<uint32_t>(a);
|
||||
const uint32_t ub = static_cast<uint32_t>(b);
|
||||
return static_cast<int>(((ua | 0x80808080u) - ub) ^ 0x80808080u);
|
||||
}
|
||||
|
||||
static __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq_scalar(
|
||||
const int vl, const int vh, const int u0, const int u1, const int8_t sc0,
|
||||
const int8_t sc1, const float d, const float d80, const float d81) {
|
||||
static_assert(QR6_K == 2, "q6_K MMVQ scalar fast path assumes QR6_K == 2");
|
||||
|
||||
const int vil0 = (vl >> 0) & 0x0F0F0F0F;
|
||||
const int vih0 = ((vh >> 0) << 4) & 0x30303030;
|
||||
const int vi0 = byte_sub_4(vil0 | vih0, 0x20202020);
|
||||
|
||||
const int vil1 = (vl >> 4) & 0x0F0F0F0F;
|
||||
const int vih1 = ((vh >> 4) << 4) & 0x30303030;
|
||||
const int vi1 = byte_sub_4(vil1 | vih1, 0x20202020);
|
||||
|
||||
const float sumf =
|
||||
d80 * (dpct::dp4a(vi0, u0, 0) * sc0) +
|
||||
d81 * (dpct::dp4a(vi1, u1, 0) * sc1);
|
||||
|
||||
return d * sumf;
|
||||
}
|
||||
|
||||
static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
|
||||
const uint8_t *values,
|
||||
int &val1, int &val2) {
|
||||
@@ -279,24 +305,8 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
|
||||
const int *__restrict__ u,
|
||||
const int8_t *__restrict__ scales, const float &d,
|
||||
const float *__restrict__ d8) {
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < QR6_K; ++i) {
|
||||
const int sc = scales[4*i];
|
||||
|
||||
const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
|
||||
|
||||
const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
|
||||
|
||||
const int vi = dpct::vectorized_binary<sycl::char4>(
|
||||
(vil | vih), 0x20202020, dpct::sub_sat()); // vi = (vil | vih) - 32
|
||||
|
||||
sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
|
||||
}
|
||||
|
||||
return d*sumf;
|
||||
return vec_dot_q6_K_q8_1_impl_mmvq_scalar(
|
||||
vl, vh, u[0], u[1], scales[0], scales[4], d, d8[0], d8[1]);
|
||||
}
|
||||
|
||||
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
|
||||
@@ -542,23 +552,8 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K> {
|
||||
__dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq(const int vl, const int vh, const int * __restrict__ u,
|
||||
const int8_t * __restrict__ scales, const float d,
|
||||
const float * __restrict__ d8) {
|
||||
float sumf = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < QR6_K; ++i) {
|
||||
const int sc = scales[4 * i];
|
||||
|
||||
const int vil = (vl >> (4 * i)) & 0x0F0F0F0F;
|
||||
|
||||
const int vih = ((vh >> (4 * i)) << 4) & 0x30303030;
|
||||
|
||||
const int vi = dpct::vectorized_binary<sycl::char4>((vil | vih), 0x20202020,
|
||||
dpct::sub_sat()); // vi = (vil | vih) - 32
|
||||
|
||||
sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
|
||||
}
|
||||
|
||||
return d * sumf;
|
||||
return vec_dot_q6_K_q8_1_impl_mmvq_scalar(
|
||||
vl, vh, u[0], u[1], scales[0], scales[4], d, d8[0], d8[1]);
|
||||
}
|
||||
|
||||
__dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
|
||||
@@ -579,16 +574,15 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K> {
|
||||
|
||||
const int8_t * scs = scales + scale_offset;
|
||||
|
||||
int u[QR6_K];
|
||||
float d8[QR6_K];
|
||||
const int u0 = get_int_from_int8_aligned(
|
||||
q8_1_quant_ptr + bq8_offset * QK8_1, iqs % QI8_1);
|
||||
const int u1 = get_int_from_int8_aligned(
|
||||
q8_1_quant_ptr + (bq8_offset + 2) * QK8_1, iqs % QI8_1);
|
||||
const float d80 = (*(q8_1_ds + bq8_offset + 0))[0];
|
||||
const float d81 = (*(q8_1_ds + bq8_offset + 2))[0];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < QR6_K; ++i) {
|
||||
u[i] = get_int_from_int8_aligned(q8_1_quant_ptr + (bq8_offset + 2 * i) * QK8_1, iqs % QI8_1);
|
||||
const sycl::half2 ds_values = *(q8_1_ds + bq8_offset + 2 * i);
|
||||
d8[i] = ds_values[0];
|
||||
}
|
||||
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scs, *d, d8);
|
||||
return vec_dot_q6_K_q8_1_impl_mmvq_scalar(
|
||||
vl, vh, u0, u1, scs[0], scs[4], *d, d80, d81);
|
||||
}
|
||||
};
|
||||
#define VDR_Q4_0_Q8_1_MMVQ 2
|
||||
@@ -1167,16 +1161,15 @@ vec_dot_q6_K_q8_1(const void *__restrict__ vbq,
|
||||
|
||||
const int8_t * scales = bq6_K->scales + scale_offset;
|
||||
|
||||
int u[QR6_K];
|
||||
float d8[QR6_K];
|
||||
const int u0 = get_int_from_int8_aligned(
|
||||
bq8_1[bq8_offset + 0].qs, iqs % QI8_1);
|
||||
const int u1 = get_int_from_int8_aligned(
|
||||
bq8_1[bq8_offset + 2].qs, iqs % QI8_1);
|
||||
const float d80 = bq8_1[bq8_offset + 0].ds[0];
|
||||
const float d81 = bq8_1[bq8_offset + 2].ds[0];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < QR6_K; ++i) {
|
||||
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
|
||||
d8[i] = bq8_1[bq8_offset + 2 * i].ds[0];
|
||||
}
|
||||
|
||||
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
|
||||
return vec_dot_q6_K_q8_1_impl_mmvq_scalar(
|
||||
vl, vh, u0, u1, scales[0], scales[4], bq6_K->d, d80, d81);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -8,6 +8,8 @@ endif()
|
||||
|
||||
find_package(Vulkan COMPONENTS glslc REQUIRED)
|
||||
|
||||
find_package(SPIRV-Headers REQUIRED)
|
||||
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
|
||||
# Parallel build object files
|
||||
add_definitions(/MP)
|
||||
|
||||
@@ -49,7 +49,6 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <future>
|
||||
#include <thread>
|
||||
@@ -760,8 +759,8 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_pad_f32;
|
||||
vk_pipeline pipeline_roll_f32;
|
||||
vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32;
|
||||
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16, pipeline_cpy_f32_i32, pipeline_cpy_i32_f32;
|
||||
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16, pipeline_contig_cpy_f32_i32, pipeline_contig_cpy_i32_f32;
|
||||
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16, pipeline_cpy_bf16_f32, pipeline_cpy_f32_i32, pipeline_cpy_i32_f32;
|
||||
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16, pipeline_contig_cpy_bf16_f32, pipeline_contig_cpy_f32_i32, pipeline_contig_cpy_i32_f32;
|
||||
vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_cpy_transpose_16, pipeline_cpy_transpose_32;
|
||||
@@ -855,6 +854,8 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_ssm_scan_f32_d128;
|
||||
vk_pipeline pipeline_ssm_scan_f32_d256;
|
||||
vk_pipeline pipeline_ssm_conv_f32;
|
||||
vk_pipeline pipeline_ssm_conv_silu_f32;
|
||||
vk_pipeline pipeline_ssm_conv_bias_silu_f32;
|
||||
vk_pipeline pipeline_opt_step_adamw_f32;
|
||||
vk_pipeline pipeline_opt_step_sgd_f32;
|
||||
std::map<vk_conv2d_pipeline_state, vk_pipeline> pipeline_conv2d_f32[CONV_SHAPE_COUNT];
|
||||
@@ -1353,6 +1354,8 @@ struct vk_op_rope_push_constants {
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
uint32_t a_offset;
|
||||
uint32_t d_offset;
|
||||
};
|
||||
static_assert(sizeof(vk_op_rope_push_constants) <= 128, "sizeof(vk_op_rope_push_constants) must be <= 128");
|
||||
|
||||
@@ -4569,6 +4572,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f32, "cpy_f16_f32", cpy_f16_f32_len, cpy_f16_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_bf16,"cpy_f32_bf16",cpy_f32_bf16_len,cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_bf16_f32,"cpy_bf16_f32",cpy_bf16_f32_len,cpy_bf16_f32_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_i32_f32, "cpy_i32_f32", cpy_i32_f32_len, cpy_i32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_i32, "cpy_f32_i32", cpy_f32_i32_len, cpy_f32_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
@@ -4577,6 +4581,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f32, "contig_cpy_f16_f32", contig_cpy_f16_f32_len, contig_cpy_f16_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_bf16,"contig_cpy_f32_bf16",contig_cpy_f32_bf16_len,contig_cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_bf16_f32,"contig_cpy_bf16_f32",contig_cpy_bf16_f32_len,contig_cpy_bf16_f32_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_i32_f32, "contig_cpy_i32_f32", contig_cpy_i32_f32_len, contig_cpy_i32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_i32, "contig_cpy_f32_i32", contig_cpy_f32_i32_len, contig_cpy_f32_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
@@ -4901,7 +4906,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size, 16}, 1, true, true);
|
||||
}
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ssm_conv_f32, "ssm_conv_f32", ssm_conv_f32_len, ssm_conv_f32_data, "main", 3, sizeof(vk_op_ssm_conv_push_constants), {32, 16, 1}, {32, 16}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ssm_conv_f32, "ssm_conv_f32", ssm_conv_f32_len, ssm_conv_f32_data, "main", 4, sizeof(vk_op_ssm_conv_push_constants), {32, 16, 1}, {32, 16, 0, 0}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ssm_conv_silu_f32, "ssm_conv_silu_f32", ssm_conv_f32_len, ssm_conv_f32_data, "main", 4, sizeof(vk_op_ssm_conv_push_constants), {32, 16, 1}, {32, 16, 0, 1}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ssm_conv_bias_silu_f32, "ssm_conv_bias_silu_f32", ssm_conv_f32_len, ssm_conv_f32_data, "main", 4, sizeof(vk_op_ssm_conv_push_constants), {32, 16, 1}, {32, 16, 1, 1}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
@@ -7539,6 +7546,13 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_cpy_f32_bf16;
|
||||
}
|
||||
}
|
||||
if (src->type == GGML_TYPE_BF16 && to == GGML_TYPE_F32) {
|
||||
if (contig) {
|
||||
return ctx->device->pipeline_contig_cpy_bf16_f32;
|
||||
} else {
|
||||
return ctx->device->pipeline_cpy_bf16_f32;
|
||||
}
|
||||
}
|
||||
if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_I32) {
|
||||
if (contig) {
|
||||
return ctx->device->pipeline_contig_cpy_f32_i32;
|
||||
@@ -9937,7 +9951,12 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return nullptr;
|
||||
case GGML_OP_SSM_CONV:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_ssm_conv_f32;
|
||||
switch (ctx->num_additional_fused_ops) {
|
||||
case 0: return ctx->device->pipeline_ssm_conv_f32;
|
||||
case 1: return ctx->device->pipeline_ssm_conv_silu_f32;
|
||||
case 2: return ctx->device->pipeline_ssm_conv_bias_silu_f32;
|
||||
default: return nullptr;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
@@ -10118,6 +10137,15 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk
|
||||
GGML_UNUSED(src3);
|
||||
}
|
||||
|
||||
template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_rope_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
||||
p.a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
p.d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(src2);
|
||||
GGML_UNUSED(src3);
|
||||
}
|
||||
|
||||
template<typename PC>
|
||||
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst, ggml_op op, PC&& pc) {
|
||||
VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
||||
@@ -10878,11 +10906,28 @@ static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx,
|
||||
pc, elements);
|
||||
}
|
||||
|
||||
static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) {
|
||||
ggml_tensor * conv = cgraph->nodes[node_idx];
|
||||
const ggml_tensor * src0 = conv->src[0];
|
||||
const ggml_tensor * src1 = conv->src[1];
|
||||
|
||||
ggml_vk_op_f32<vk_op_ssm_conv_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SSM_CONV, {
|
||||
// Pick the destination tensor (last node in the fused chain) and the optional bias.
|
||||
// Fusion modes: 0 = ssm_conv, 1 = ssm_conv+silu, 2 = ssm_conv+add(bias)+silu.
|
||||
ggml_tensor * dst = conv;
|
||||
const ggml_tensor * bias = nullptr;
|
||||
|
||||
if (ctx->num_additional_fused_ops == 1) {
|
||||
dst = cgraph->nodes[node_idx + 1]; // silu
|
||||
} else if (ctx->num_additional_fused_ops == 2) {
|
||||
ggml_tensor * add = cgraph->nodes[node_idx + 1];
|
||||
bias = (add->src[0] == conv) ? add->src[1] : add->src[0];
|
||||
dst = cgraph->nodes[node_idx + 2]; // silu
|
||||
}
|
||||
|
||||
// The shader always declares 4 bindings; bind src0 as a dummy when bias isn't fused.
|
||||
const ggml_tensor * src2 = bias ? bias : src0;
|
||||
|
||||
ggml_vk_op_f32<vk_op_ssm_conv_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SSM_CONV, {
|
||||
(uint32_t)src0->nb[1], (uint32_t)src0->nb[2],
|
||||
(uint32_t)src1->nb[1],
|
||||
(uint32_t)dst->nb[0], (uint32_t)dst->nb[1], (uint32_t)dst->nb[2],
|
||||
@@ -11245,6 +11290,7 @@ static vk_op_rope_push_constants ggml_vk_make_rope_constants(const ggml_tensor *
|
||||
(uint32_t)src0->ne[2],
|
||||
nb01, nb02, nb03,
|
||||
nb11, nb12, nb13,
|
||||
0, 0, // a_offset, d_offset filled in by init_pushconst_tensor_offsets
|
||||
};
|
||||
|
||||
return rope;
|
||||
@@ -11340,6 +11386,11 @@ static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx,
|
||||
GGML_ASSERT(buf[i] != nullptr);
|
||||
}
|
||||
|
||||
// a_offset is unused (the fused path reads from shared memory), but the rope/set_rows dst can be misaligned.
|
||||
// Round the binding offset down to the storage buffer alignment; the in-element shift goes in pc.rope.d_offset.
|
||||
pc.rope.d_offset = get_misalign_bytes(ctx, tensors[5]) / ggml_type_size(tensors[5]->type);
|
||||
offset[5] &= ~(size_t(ctx->device->properties.limits.minStorageBufferOffsetAlignment) - 1);
|
||||
|
||||
std::array<uint32_t, 3> elements;
|
||||
elements = { (uint32_t)rms->src[0]->ne[1], (uint32_t)rms->src[0]->ne[2], (uint32_t)rms->src[0]->ne[3] };
|
||||
|
||||
@@ -13557,7 +13608,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
break;
|
||||
|
||||
case GGML_OP_SSM_CONV:
|
||||
ggml_vk_ssm_conv(ctx, compute_ctx, node);
|
||||
ggml_vk_ssm_conv(ctx, compute_ctx, cgraph, node_idx);
|
||||
|
||||
break;
|
||||
|
||||
@@ -14454,6 +14505,62 @@ static bool ggml_vk_can_fuse(const ggml_backend_vk_context * ctx, const struct g
|
||||
return true;
|
||||
}
|
||||
|
||||
// Match SSM_CONV + UNARY(SILU) or SSM_CONV + ADD + UNARY(SILU). num_extra is 1 or 2.
|
||||
static bool ggml_vk_can_fuse_ssm_conv(const ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph,
|
||||
int node_idx, int num_extra) {
|
||||
const ggml_tensor * conv = cgraph->nodes[node_idx];
|
||||
if (conv->op != GGML_OP_SSM_CONV) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const ggml_tensor * silu = nullptr;
|
||||
const ggml_tensor * bias = nullptr;
|
||||
|
||||
if (num_extra == 1) {
|
||||
if (!ggml_can_fuse(cgraph, node_idx, { GGML_OP_SSM_CONV, GGML_OP_UNARY })) {
|
||||
return false;
|
||||
}
|
||||
silu = cgraph->nodes[node_idx + 1];
|
||||
} else if (num_extra == 2) {
|
||||
if (!ggml_can_fuse(cgraph, node_idx, { GGML_OP_SSM_CONV, GGML_OP_ADD, GGML_OP_UNARY })) {
|
||||
return false;
|
||||
}
|
||||
const ggml_tensor * add = cgraph->nodes[node_idx + 1];
|
||||
silu = cgraph->nodes[node_idx + 2];
|
||||
bias = (add->src[0] == conv) ? add->src[1] : add->src[0];
|
||||
|
||||
if (bias->type != GGML_TYPE_F32 || !ggml_is_contiguous(bias)) {
|
||||
return false;
|
||||
}
|
||||
// bias must be channel-wise (one element per channel of the conv output)
|
||||
if (ggml_nelements(bias) != conv->ne[0] || bias->ne[0] != conv->ne[0]) {
|
||||
return false;
|
||||
}
|
||||
if (add->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
// The shader doesn't apply per-tensor offsets, so reject misaligned bias.
|
||||
if (get_misalign_bytes(ctx, bias) != 0) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (ggml_get_unary_op(silu) != GGML_UNARY_OP_SILU) {
|
||||
return false;
|
||||
}
|
||||
if (conv->type != GGML_TYPE_F32 || silu->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
// The shader writes to the fused dst using its own strides, but the push constants don't
|
||||
// carry a per-tensor offset, so the binding must be naturally aligned.
|
||||
if (get_misalign_bytes(ctx, silu) != 0) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph,
|
||||
int node_idx, topk_moe_mode mode) {
|
||||
|
||||
@@ -14870,6 +14977,19 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
// they are overwritten, and one workgroup per row. So close enough.
|
||||
op_srcs_fused_elementwise[0] = true;
|
||||
op_srcs_fused_elementwise[1] = true;
|
||||
} else if (ggml_vk_can_fuse_ssm_conv(ctx, cgraph, i, 2)) {
|
||||
ctx->num_additional_fused_ops = 2;
|
||||
fusion_string = "SSM_CONV_BIAS_SILU";
|
||||
// ssm_conv reads multiple input tokens per output, so it's not elementwise w.r.t. its srcs.
|
||||
// The downstream add and silu are elementwise on the conv output.
|
||||
op_srcs_fused_elementwise[0] = false;
|
||||
op_srcs_fused_elementwise[1] = true;
|
||||
op_srcs_fused_elementwise[2] = true;
|
||||
} else if (ggml_vk_can_fuse_ssm_conv(ctx, cgraph, i, 1)) {
|
||||
ctx->num_additional_fused_ops = 1;
|
||||
fusion_string = "SSM_CONV_SILU";
|
||||
op_srcs_fused_elementwise[0] = false;
|
||||
op_srcs_fused_elementwise[1] = true;
|
||||
} else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) &&
|
||||
ggml_check_edges(cgraph, i, rope_view_set_rows_edges) &&
|
||||
ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) {
|
||||
@@ -15201,7 +15321,9 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
|
||||
!(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT && graph->nodes[j]->op == GGML_OP_ADD) &&
|
||||
!(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT_ID && graph->nodes[j]->op == GGML_OP_ADD_ID) &&
|
||||
!(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_MUL_MAT_ID && graph->nodes[j]->op == GGML_OP_MUL) &&
|
||||
!(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_ADD && graph->nodes[j]->op == GGML_OP_ADD)) {
|
||||
!(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_ADD && graph->nodes[j]->op == GGML_OP_ADD) &&
|
||||
!(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_SSM_CONV && graph->nodes[j]->op == GGML_OP_ADD) &&
|
||||
!(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_SSM_CONV && graph->nodes[j]->op == GGML_OP_UNARY)) {
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
@@ -15284,6 +15406,19 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
|
||||
}
|
||||
}
|
||||
}
|
||||
// SSM_CONV + ADD + UNARY: pull the consuming UNARY forward
|
||||
if (j > 0 &&
|
||||
graph->nodes[j]->op == GGML_OP_ADD &&
|
||||
graph->nodes[j-1]->op == GGML_OP_SSM_CONV) {
|
||||
for (int k = j + 1; k < std::min(j + 15, graph->n_nodes); ++k) {
|
||||
if (graph->nodes[k]->op == GGML_OP_UNARY &&
|
||||
graph->nodes[k]->src[0] == graph->nodes[j]) {
|
||||
current_set.push_back(k);
|
||||
used[k] = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Second pass grabs view nodes.
|
||||
@@ -15848,6 +15983,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
if (src1_type == GGML_TYPE_F32) {
|
||||
switch (src0_type) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_BF16:
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
|
||||
@@ -19,7 +19,9 @@ void main() {
|
||||
if (idx + (num_iter-1)*num_threads < p.ne) {
|
||||
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
||||
|
||||
#if defined(DATA_D_BF16)
|
||||
#if defined(DATA_A_BF16)
|
||||
data_d[get_doffset() + idx] = D_TYPE(bf16_to_fp32(uint32_t(data_a[get_aoffset() + idx])));
|
||||
#elif defined(DATA_D_BF16)
|
||||
float f = float(data_a[get_aoffset() + idx]);
|
||||
data_d[get_doffset() + idx] = D_TYPE(fp32_to_bf16(f));
|
||||
#elif !defined(OPTIMIZATION_ERROR_WORKAROUND)
|
||||
@@ -35,7 +37,9 @@ void main() {
|
||||
continue;
|
||||
}
|
||||
|
||||
#if defined(DATA_D_BF16)
|
||||
#if defined(DATA_A_BF16)
|
||||
data_d[get_doffset() + idx] = D_TYPE(bf16_to_fp32(uint32_t(data_a[get_aoffset() + idx])));
|
||||
#elif defined(DATA_D_BF16)
|
||||
float f = float(data_a[get_aoffset() + idx]);
|
||||
data_d[get_doffset() + idx] = D_TYPE(fp32_to_bf16(f));
|
||||
#elif !defined(OPTIMIZATION_ERROR_WORKAROUND)
|
||||
|
||||
@@ -12,7 +12,9 @@ void main() {
|
||||
return;
|
||||
}
|
||||
|
||||
#if defined(DATA_D_BF16)
|
||||
#if defined(DATA_A_BF16)
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(bf16_to_fp32(uint32_t(data_a[get_aoffset() + src0_idx(idx)])));
|
||||
#elif defined(DATA_D_BF16)
|
||||
float f = float(data_a[get_aoffset() + src0_idx(idx)]);
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(fp32_to_bf16(f));
|
||||
#elif !defined(OPTIMIZATION_ERROR_WORKAROUND)
|
||||
|
||||
@@ -9,7 +9,7 @@ uint rope_a_coord(const uint i0, const uint i01, const uint i02, const uint i03,
|
||||
// Per-row offset in shared memory
|
||||
const uint ix = i0;
|
||||
#else
|
||||
const uint ix = i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i0;
|
||||
const uint ix = p.a_offset + i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i0;
|
||||
#endif
|
||||
return ix;
|
||||
}
|
||||
@@ -48,6 +48,7 @@ void rope_norm(const uint i0, const uint i1, const uint i2, const uint i3, rope_
|
||||
idst = i1*p.nb11 + i0;
|
||||
idst += rope_data_i[i2].x * p.set_rows_stride;
|
||||
}
|
||||
idst += p.d_offset;
|
||||
|
||||
if (i0 >= p.n_dims) {
|
||||
rope_data_d[idst + 0] = ROPE_D_TYPE(rope_data_a[ix + 0]);
|
||||
@@ -84,6 +85,7 @@ void rope_neox(const uint i0, const uint i1, const uint i2, const uint i3, rope_
|
||||
idst = i1*p.nb11 + i0/2;
|
||||
idst += rope_data_i[i2].x * p.set_rows_stride;
|
||||
}
|
||||
idst += p.d_offset;
|
||||
|
||||
if (i0 >= p.n_dims) {
|
||||
rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]);
|
||||
@@ -121,6 +123,7 @@ void rope_multi(const uint i0, const uint i1, const uint i2, const uint i3, rope
|
||||
idst = i1*p.nb11 + i0/2;
|
||||
idst += rope_data_i[i2].x * p.set_rows_stride;
|
||||
}
|
||||
idst += p.d_offset;
|
||||
|
||||
if (i0 >= p.n_dims) {
|
||||
rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]);
|
||||
@@ -176,7 +179,7 @@ void rope_vision(const uint i0, const uint i1, const uint i2, const uint i3, rop
|
||||
return;
|
||||
}
|
||||
|
||||
const uint idst = i0/2 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13;
|
||||
const uint idst = p.d_offset + i0/2 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13;
|
||||
const uint ix = rope_a_coord(i0/2, i1, i2, i3, p);
|
||||
|
||||
const int sect_dims = p.sections[0] + p.sections[1];
|
||||
|
||||
@@ -26,6 +26,9 @@ struct rope_params {
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb13;
|
||||
|
||||
uint a_offset;
|
||||
uint d_offset;
|
||||
};
|
||||
|
||||
#endif // !defined(GGML_ROPE_PARAMS)
|
||||
|
||||
@@ -6,12 +6,15 @@
|
||||
|
||||
layout(constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
layout(constant_id = 1) const uint TOKENS_PER_WG = 16;
|
||||
layout(constant_id = 2) const bool APPLY_BIAS = false;
|
||||
layout(constant_id = 3) const bool APPLY_SILU = false;
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z = 1) in;
|
||||
|
||||
layout(binding = 0) readonly buffer Src0 { float src0[]; };
|
||||
layout(binding = 1) readonly buffer Src1 { float src1[]; };
|
||||
layout(binding = 2) buffer Dst { float dst[]; };
|
||||
layout(binding = 2) readonly buffer Bias { float bias[]; };
|
||||
layout(binding = 3) buffer Dst { float dst[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint nb01; uint nb02;
|
||||
@@ -45,6 +48,13 @@ void main() {
|
||||
}
|
||||
}
|
||||
|
||||
if (APPLY_BIAS) {
|
||||
sum += bias[i1];
|
||||
}
|
||||
if (APPLY_SILU) {
|
||||
sum = sum / (1.0f + exp(-sum));
|
||||
}
|
||||
|
||||
const uint dst_idx = i3 * (dst_nb2 / 4) + i2 * (dst_nb1 / 4) + i1;
|
||||
dst[dst_idx] = sum;
|
||||
}
|
||||
|
||||
@@ -731,6 +731,7 @@ void process_shaders() {
|
||||
string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
string_to_spv("cpy_f16_f32", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
string_to_spv("cpy_f32_bf16","copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "uint16_t"}, {"DATA_D_BF16", "1"}});
|
||||
string_to_spv("cpy_bf16_f32","copy.comp", {{"A_TYPE", "uint16_t"}, {"D_TYPE", "float"}, {"DATA_A_BF16", "1"}});
|
||||
string_to_spv("contig_cpy_f32_f32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("contig_cpy_f32_i32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "int"}});
|
||||
string_to_spv("contig_cpy_i32_f32", "contig_copy.comp", {{"A_TYPE", "int"}, {"D_TYPE", "float"}});
|
||||
@@ -738,6 +739,7 @@ void process_shaders() {
|
||||
string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
string_to_spv("contig_cpy_f16_f32", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
string_to_spv("contig_cpy_f32_bf16","contig_copy.comp",{{"A_TYPE", "float"}, {"D_TYPE", "uint16_t"}, {"DATA_D_BF16", "1"}});
|
||||
string_to_spv("contig_cpy_bf16_f32","contig_copy.comp",{{"A_TYPE", "uint16_t"}, {"D_TYPE", "float"}, {"DATA_A_BF16", "1"}});
|
||||
string_to_spv("cpy_f32_i32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "int"}});
|
||||
string_to_spv("cpy_i32_f32", "copy.comp", {{"A_TYPE", "int"}, {"D_TYPE", "float"}});
|
||||
|
||||
|
||||
@@ -1234,6 +1234,7 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx,
|
||||
const uint32_t h = (uint32_t) src2->ne[1];
|
||||
const uint32_t n_tokens = (uint32_t) src2->ne[2];
|
||||
const uint32_t n_seqs = (uint32_t) src2->ne[3];
|
||||
const uint32_t K = (uint32_t) src5->ne[1];
|
||||
const float scale = 1.0f / sqrtf((float) s_v);
|
||||
uint32_t scale_u32;
|
||||
memcpy(&scale_u32, &scale, sizeof(scale_u32));
|
||||
@@ -1258,6 +1259,7 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx,
|
||||
|
||||
(uint32_t) src0->ne[1],
|
||||
(uint32_t) (src2->ne[3] / src0->ne[3]),
|
||||
K,
|
||||
scale_u32,
|
||||
};
|
||||
|
||||
|
||||
@@ -39,6 +39,7 @@ struct Params {
|
||||
|
||||
neq1: u32,
|
||||
rq3: u32,
|
||||
K: u32,
|
||||
scale: f32,
|
||||
};
|
||||
|
||||
@@ -62,11 +63,14 @@ fn main(
|
||||
let iq3 = seq_id / params.rq3;
|
||||
|
||||
let state_size = S_V * S_V;
|
||||
let state_base = (seq_id * params.h + head_id) * state_size;
|
||||
let state_in_base = (seq_id * params.K * params.h + head_id) * state_size;
|
||||
let state_out_base = (seq_id * params.h + head_id) * state_size;
|
||||
let state_size_per_snap = state_size * params.h * params.n_seqs;
|
||||
let shift = i32(params.n_tokens) - i32(params.K);
|
||||
|
||||
var state: array<f32, S_V>;
|
||||
for (var i = 0u; i < S_V; i++) {
|
||||
state[i] = src_state[state_base + col * S_V + i];
|
||||
state[i] = src_state[state_in_base + col * S_V + i];
|
||||
}
|
||||
|
||||
var attn_off = (seq_id * params.n_tokens * params.h + head_id) * S_V;
|
||||
@@ -123,10 +127,22 @@ fn main(
|
||||
dst[attn_off + col] = attn_col * params.scale;
|
||||
attn_off += S_V * params.h;
|
||||
|
||||
if (params.K > 1u) {
|
||||
let target_slot = i32(t) - shift;
|
||||
if (target_slot >= 0 && target_slot < i32(params.K)) {
|
||||
let slot_base = params.s_off + u32(target_slot) * state_size_per_snap + state_out_base;
|
||||
for (var i = 0u; i < S_V; i++) {
|
||||
dst[slot_base + col * S_V + i] = state[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
workgroupBarrier();
|
||||
}
|
||||
|
||||
for (var i = 0u; i < S_V; i++) {
|
||||
dst[params.s_off + state_base + col * S_V + i] = state[i];
|
||||
if (params.K == 1u) {
|
||||
for (var i = 0u; i < S_V; i++) {
|
||||
dst[params.s_off + state_out_base + col * S_V + i] = state[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -37,7 +37,7 @@ dir=$(basename $(pwd))
|
||||
# sanitize branch name for directory name (replace / with -)
|
||||
dir_suffix=$(echo "$BRANCH" | tr '/' '-')
|
||||
|
||||
git worktree add -b "$BRANCH" "../$dir-$dir_suffix" HEAD
|
||||
git worktree add "../$dir-$dir_suffix" "$BRANCH" || git worktree add -b "$BRANCH" "../$dir-$dir_suffix" HEAD
|
||||
|
||||
og_path=$(pwd)
|
||||
wt_path=$(cd "../$dir-$dir_suffix" && pwd)
|
||||
|
||||
@@ -64,8 +64,9 @@ llama_context::llama_context(
|
||||
cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor;
|
||||
cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast;
|
||||
cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? params.yarn_beta_slow : hparams.yarn_beta_slow;
|
||||
cparams.embeddings = params.embeddings;
|
||||
cparams.embeddings_pre_norm = false;
|
||||
cparams.embeddings = params.embeddings;
|
||||
cparams.embeddings_pre_norm = false;
|
||||
cparams.embeddings_pre_norm_masked = false;
|
||||
cparams.offload_kqv = params.offload_kqv;
|
||||
cparams.no_perf = params.no_perf;
|
||||
cparams.pooling_type = params.pooling_type;
|
||||
@@ -895,8 +896,17 @@ float * llama_context::get_embeddings_pre_norm_ith(int32_t i) {
|
||||
throw std::runtime_error("no pre-norm embeddings");
|
||||
}
|
||||
|
||||
const int64_t j = output_resolve_row(i);
|
||||
const uint32_t n_embd = model.hparams.n_embd;
|
||||
|
||||
if (!cparams.embeddings_pre_norm_masked) {
|
||||
// unmasked: pre-norm rows are stored densely, indexed by raw token position.
|
||||
if (i < 0 || (size_t)(i + 1) * n_embd > embd_pre_norm.size) {
|
||||
throw std::runtime_error(format("out of range [0, %zu)", embd_pre_norm.size / n_embd));
|
||||
}
|
||||
return embd_pre_norm.data + (size_t) i * n_embd;
|
||||
}
|
||||
|
||||
const int64_t j = output_resolve_row(i);
|
||||
return embd_pre_norm.data + j*n_embd;
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: invalid pre-norm embeddings id %d, reason: %s\n", __func__, i, err.what());
|
||||
@@ -1088,10 +1098,11 @@ void llama_context::set_embeddings(bool value) {
|
||||
//sched_need_reserve = true;
|
||||
}
|
||||
|
||||
void llama_context::set_embeddings_pre_norm(bool value) {
|
||||
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
|
||||
void llama_context::set_embeddings_pre_norm(bool value, bool masked) {
|
||||
LLAMA_LOG_DEBUG("%s: value = %d, masked = %d\n", __func__, value, masked);
|
||||
|
||||
cparams.embeddings_pre_norm = value;
|
||||
cparams.embeddings_pre_norm = value;
|
||||
cparams.embeddings_pre_norm_masked = masked;
|
||||
}
|
||||
|
||||
void llama_context::set_causal_attn(bool value) {
|
||||
@@ -1737,6 +1748,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
};
|
||||
|
||||
int64_t n_outputs_prev = 0;
|
||||
int64_t n_tokens_prev = 0;
|
||||
|
||||
do {
|
||||
const auto & ubatch = mctx->get_ubatch();
|
||||
@@ -1882,16 +1894,21 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
|
||||
// extract pre-norm embeddings (hidden state before the final output norm)
|
||||
// only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored.
|
||||
if (embd_pre_norm.data && t_h_pre_norm && n_outputs > 0 && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||
ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_pre_norm);
|
||||
GGML_ASSERT(backend_h != nullptr);
|
||||
{
|
||||
const bool masked = cparams.embeddings_pre_norm_masked;
|
||||
const int64_t n_rows = masked ? n_outputs : (int64_t) ubatch.n_tokens;
|
||||
const int64_t offset = masked ? n_outputs_prev : n_tokens_prev;
|
||||
|
||||
const uint32_t n_embd = hparams.n_embd;
|
||||
float * embd_pre_norm_out = embd_pre_norm.data + n_outputs_prev*n_embd;
|
||||
if (embd_pre_norm.data && t_h_pre_norm && n_rows > 0 && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||
ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_pre_norm);
|
||||
GGML_ASSERT(backend_h != nullptr);
|
||||
|
||||
GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
|
||||
GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_pre_norm.size);
|
||||
ggml_backend_tensor_get_async(backend_h, t_h_pre_norm, embd_pre_norm_out, 0, n_outputs*n_embd*sizeof(float));
|
||||
const uint32_t n_embd = hparams.n_embd;
|
||||
float * embd_pre_norm_out = embd_pre_norm.data + offset*n_embd;
|
||||
|
||||
GGML_ASSERT((offset + n_rows)*n_embd <= (int64_t) embd_pre_norm.size);
|
||||
ggml_backend_tensor_get_async(backend_h, t_h_pre_norm, embd_pre_norm_out, 0, n_rows*n_embd*sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
// Copy backend sampling output if this ubatch produced any sampling tensors.
|
||||
@@ -1908,6 +1925,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
}
|
||||
|
||||
n_outputs_prev += n_outputs;
|
||||
n_tokens_prev += ubatch.n_tokens;
|
||||
} while (mctx->next());
|
||||
|
||||
// set to total number of outputs in the batch, for use in llama_get_logits_ith
|
||||
@@ -1999,6 +2017,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
embd.size = has_embd ? n_embd_out*n_outputs_max : 0;
|
||||
embd_pre_norm.size = has_embd_pre_norm ? n_embd*n_outputs_max : 0;
|
||||
|
||||
if (has_embd_pre_norm && !cparams.embeddings_pre_norm_masked) {
|
||||
// unmasked: pre-norm row exists for every token in the batch, not just
|
||||
// those flagged via batch.logits[i] -> size by token count instead.
|
||||
embd_pre_norm.size = (size_t) n_embd * n_batch;
|
||||
}
|
||||
|
||||
// Allocate backend sampling output buffers if there are backend samplers configured.
|
||||
const bool has_sampling = !sampling.samplers.empty();
|
||||
if (has_sampling) {
|
||||
@@ -3547,8 +3571,8 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) {
|
||||
return ctx->get_embeddings_seq(seq_id);
|
||||
}
|
||||
|
||||
void llama_set_embeddings_pre_norm(llama_context * ctx, bool value) {
|
||||
ctx->set_embeddings_pre_norm(value);
|
||||
void llama_set_embeddings_pre_norm(llama_context * ctx, bool value, bool masked) {
|
||||
ctx->set_embeddings_pre_norm(value, masked);
|
||||
}
|
||||
|
||||
float * llama_get_embeddings_pre_norm(llama_context * ctx) {
|
||||
|
||||
@@ -110,7 +110,7 @@ struct llama_context {
|
||||
void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data);
|
||||
|
||||
void set_embeddings (bool value);
|
||||
void set_embeddings_pre_norm(bool value);
|
||||
void set_embeddings_pre_norm(bool value, bool masked);
|
||||
void set_causal_attn(bool value);
|
||||
void set_warmup(bool value);
|
||||
|
||||
|
||||
@@ -28,7 +28,8 @@ struct llama_cparams {
|
||||
float yarn_beta_slow;
|
||||
|
||||
bool embeddings;
|
||||
bool embeddings_pre_norm; // also extract the hidden state before the final output norm
|
||||
bool embeddings_pre_norm; // also extract the hidden state before the final output norm
|
||||
bool embeddings_pre_norm_masked; // extract for only rows where batch.logits != 0
|
||||
bool causal_attn;
|
||||
bool offload_kqv;
|
||||
bool flash_attn;
|
||||
|
||||
@@ -93,14 +93,14 @@ LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_c
|
||||
// pre-norm embeddings (hidden state before the final output norm)
|
||||
//
|
||||
|
||||
// mirrors:
|
||||
// LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
|
||||
LLAMA_API void llama_set_embeddings_pre_norm(struct llama_context * ctx, bool value);
|
||||
// Set whether the context outputs pre-norm embeddings or not
|
||||
// If masked == true, output the embeddings only for the tokens with batch.logits != 0
|
||||
// If masked == false, output the embeddings for all tokens in the batch regardless of batch.logits
|
||||
LLAMA_API void llama_set_embeddings_pre_norm(struct llama_context * ctx, bool value, bool masked);
|
||||
|
||||
// mirrors:
|
||||
// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||
LLAMA_API float * llama_get_embeddings_pre_norm(struct llama_context * ctx);
|
||||
LLAMA_API float * llama_get_embeddings_pre_norm (struct llama_context * ctx);
|
||||
|
||||
// mirrors:
|
||||
// LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
|
||||
LLAMA_API float * llama_get_embeddings_pre_norm_ith(struct llama_context * ctx, int32_t i);
|
||||
|
||||
@@ -848,6 +848,9 @@ void llm_graph_result::set_outputs() {
|
||||
if (t_embd_pooled != nullptr) {
|
||||
ggml_set_output(t_embd_pooled);
|
||||
}
|
||||
if (t_h_pre_norm != nullptr) {
|
||||
ggml_set_output(t_h_pre_norm);
|
||||
}
|
||||
for (auto & [seq_id, t] : t_sampled) {
|
||||
if (t != nullptr) {
|
||||
ggml_set_output(t);
|
||||
|
||||
@@ -581,7 +581,8 @@ struct llm_graph_params {
|
||||
ubatch.n_seqs_unq == other.ubatch.n_seqs_unq &&
|
||||
(
|
||||
(!ubatch.token && !other.ubatch.token) ||
|
||||
(!ubatch.embd && !other.ubatch.embd)
|
||||
(!ubatch.embd && !other.ubatch.embd) ||
|
||||
(ubatch.token && other.ubatch.token && ubatch.embd && other.ubatch.embd)
|
||||
);
|
||||
|
||||
// when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
|
||||
|
||||
@@ -75,9 +75,15 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr
|
||||
// if all tokens are output, split by sequence
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
|
||||
const bool unified = (mem_attn->get_base()->get_n_stream() == 1);
|
||||
ubatch = balloc.split_equal(n_ubatch, !unified);
|
||||
if (mem_recr->n_rs_seq > 0) {
|
||||
// [TAG_RECURRENT_ROLLBACK_SPLITS]
|
||||
// TODO: recurrent state rollback does not support equal splits
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
|
||||
const bool unified = (mem_attn->get_base()->get_n_stream() == 1);
|
||||
ubatch = balloc.split_equal(n_ubatch, !unified);
|
||||
}
|
||||
}
|
||||
|
||||
if (ubatch.n_tokens == 0) {
|
||||
|
||||
@@ -75,9 +75,15 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
|
||||
// if all tokens are output, split by sequence
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
|
||||
const bool unified = (mem_attn->get_n_stream() == 1);
|
||||
ubatch = balloc.split_equal(n_ubatch, !unified);
|
||||
if (mem_recr->n_rs_seq > 0) {
|
||||
// [TAG_RECURRENT_ROLLBACK_SPLITS]
|
||||
// TODO: recurrent state rollback does not support equal splits
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
|
||||
const bool unified = (mem_attn->get_n_stream() == 1);
|
||||
ubatch = balloc.split_equal(n_ubatch, !unified);
|
||||
}
|
||||
}
|
||||
|
||||
if (ubatch.n_tokens == 0) {
|
||||
|
||||
@@ -416,9 +416,15 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr &
|
||||
// if all tokens are output, split by sequence
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// TODO: non-sequential equal split can be done if using unified KV cache
|
||||
// for simplicity, we always use sequential equal split for now
|
||||
ubatch = balloc.split_equal(n_ubatch, true);
|
||||
if (n_rs_seq > 0) {
|
||||
// [TAG_RECURRENT_ROLLBACK_SPLITS]
|
||||
// TODO: recurrent state rollback does not support equal splits
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// TODO: non-sequential equal split can be done if using unified KV cache
|
||||
// for simplicity, we always use sequential equal split for now
|
||||
ubatch = balloc.split_equal(n_ubatch, true);
|
||||
}
|
||||
}
|
||||
|
||||
if (ubatch.n_tokens == 0) {
|
||||
|
||||
@@ -72,6 +72,7 @@ public:
|
||||
|
||||
// number of recurrent-state snapshots per seq for rollback; tensors are widened to (1 + n_rs_seq) groups
|
||||
uint32_t n_rs_seq = 0;
|
||||
|
||||
// per-seq rollback index
|
||||
std::vector<uint32_t> rs_idx;
|
||||
|
||||
|
||||
@@ -447,13 +447,6 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
||||
return build_delta_net_chunking(q, k, v, g, b, s, il);
|
||||
}
|
||||
|
||||
bool llm_build_delta_net_base::keep_rs() const {
|
||||
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||
return cparams.n_rs_seq > 0
|
||||
&& n_seq_tokens > 1
|
||||
&& (uint32_t) n_seq_tokens <= 1 + cparams.n_rs_seq;
|
||||
}
|
||||
|
||||
ggml_tensor * llm_build_delta_net_base::build_conv_state(
|
||||
llm_graph_input_rs * inp,
|
||||
ggml_tensor * conv_states_all,
|
||||
@@ -461,12 +454,12 @@ ggml_tensor * llm_build_delta_net_base::build_conv_state(
|
||||
int64_t conv_kernel_size,
|
||||
int64_t conv_channels,
|
||||
int il) {
|
||||
const auto * mctx_cur = inp->mctx;
|
||||
const auto kv_head = mctx_cur->get_head();
|
||||
const uint32_t mem_size = mctx_cur->get_size();
|
||||
const int64_t n_seqs = ubatch.n_seqs;
|
||||
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||
const bool keep = keep_rs();
|
||||
const auto * mctx_cur = inp->mctx;
|
||||
|
||||
const auto kv_head = mctx_cur->get_head();
|
||||
const auto mem_size = mctx_cur->get_size();
|
||||
|
||||
const int64_t n_seqs = ubatch.n_seqs;
|
||||
|
||||
ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
|
||||
cb(conv_states, "conv_states", il);
|
||||
@@ -480,32 +473,52 @@ ggml_tensor * llm_build_delta_net_base::build_conv_state(
|
||||
ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
|
||||
cb(conv_input, "conv_input", il);
|
||||
|
||||
if (!keep) {
|
||||
ggml_tensor * last_conv_states =
|
||||
ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1],
|
||||
conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input));
|
||||
cb(last_conv_states, "last_conv_states", il);
|
||||
const int64_t row_count = (conv_kernel_size - 1) * conv_channels;
|
||||
|
||||
ggml_tensor * state_update_target =
|
||||
ggml_view_2d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels, n_seqs, conv_states_all->nb[1],
|
||||
kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all));
|
||||
cb(state_update_target, "state_update_target", il);
|
||||
const size_t row_size = ggml_row_size(conv_states_all->type, row_count);
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));
|
||||
if (cparams.n_rs_seq == 0) {
|
||||
const int64_t s_idx = conv_input->ne[0] - conv_states->ne[0];
|
||||
const int64_t s_slot = 0;
|
||||
|
||||
ggml_tensor * conv_state_last =
|
||||
ggml_view_3d(ctx0, conv_input,
|
||||
conv_kernel_size - 1, conv_channels, n_seqs,
|
||||
conv_input->nb[1], conv_input->nb[2],
|
||||
ggml_row_size(conv_input->type, s_idx));
|
||||
cb(conv_state_last, "conv_state_last", il);
|
||||
|
||||
ggml_tensor * conv_state_update =
|
||||
ggml_view_2d(ctx0, conv_states_all,
|
||||
row_count, n_seqs, conv_states_all->nb[1],
|
||||
(s_slot * mem_size + kv_head) * row_size);
|
||||
cb(conv_state_update, "conv_state_update", il);
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update));
|
||||
} else {
|
||||
const int64_t row_count = (conv_kernel_size - 1) * conv_channels;
|
||||
const size_t row_size = row_count * ggml_element_size(conv_states_all);
|
||||
for (int64_t t = 1; t <= n_seq_tokens; ++t) {
|
||||
const uint32_t slot = (uint32_t)(n_seq_tokens - t);
|
||||
ggml_tensor * src =
|
||||
ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs,
|
||||
conv_input->nb[1], conv_input->nb[2],
|
||||
t * ggml_element_size(conv_input));
|
||||
ggml_tensor * dst =
|
||||
ggml_view_2d(ctx0, conv_states_all, row_count, n_seqs,
|
||||
conv_states_all->nb[1],
|
||||
((size_t) slot * mem_size + kv_head) * row_size);
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst));
|
||||
// [TAG_RECURRENT_ROLLBACK_SPLITS]
|
||||
// TODO: this logic incorrectly assumes that the last (n_rs_seq + 1) tokens of a sequence in a batch are
|
||||
// inside the same ubatch. currently with `split_equal()` this is not correct
|
||||
|
||||
const int64_t K = (int64_t) cparams.n_rs_seq + 1;
|
||||
|
||||
for (int64_t t = 1; t <= K; ++t) {
|
||||
const int64_t s_idx = std::max<int64_t>(0, conv_input->ne[0] - conv_states->ne[0] - K + t);
|
||||
const int64_t s_slot = K - t;
|
||||
|
||||
ggml_tensor * conv_state_last =
|
||||
ggml_view_3d(ctx0, conv_input,
|
||||
conv_kernel_size - 1, conv_channels, n_seqs,
|
||||
conv_input->nb[1], conv_input->nb[2],
|
||||
ggml_row_size(conv_input->type, s_idx));
|
||||
|
||||
ggml_tensor * conv_state_update =
|
||||
ggml_view_2d(ctx0,
|
||||
conv_states_all, row_count, n_seqs,
|
||||
conv_states_all->nb[1],
|
||||
(s_slot * mem_size + kv_head) * row_size);
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -531,7 +544,9 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn(
|
||||
const int64_t n_seqs = s->ne[3];
|
||||
const int64_t n_seq_tokens = q->ne[2];
|
||||
|
||||
if (!keep_rs()) {
|
||||
const bool keep = cparams.n_rs_seq > 0;
|
||||
|
||||
if (!keep) {
|
||||
auto attn_out = build_delta_net(q, k, v, g, b, s, il);
|
||||
ggml_tensor * output = attn_out.first;
|
||||
ggml_tensor * new_state = attn_out.second;
|
||||
@@ -554,7 +569,11 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn(
|
||||
ggml_tensor * state_3d = ggml_pad(ctx0, state_in_3d, 0, K - 1, 0, 0);
|
||||
|
||||
ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, state_3d);
|
||||
cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il);
|
||||
if (n_seq_tokens > 1) {
|
||||
cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il);
|
||||
} else {
|
||||
cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_AR, il);
|
||||
}
|
||||
|
||||
const int64_t attn_score_elems = S_v * H_v * n_seq_tokens * n_seqs;
|
||||
const int64_t state_size_per_snap = S_v * S_v * H_v * n_seqs;
|
||||
@@ -576,9 +595,11 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn(
|
||||
ggml_row_size(gdn_out->type, S_v * S_v),
|
||||
ggml_row_size(gdn_out->type, S_v * S_v * H_v),
|
||||
ggml_row_size(gdn_out->type, attn_score_elems + k_i * state_size_per_snap));
|
||||
|
||||
ggml_tensor * dst = ggml_view_2d(ctx0, ssm_states_all,
|
||||
hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1],
|
||||
((size_t) cache_slot * mem_size + kv_head) * row_size);
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst));
|
||||
}
|
||||
|
||||
|
||||
@@ -66,9 +66,6 @@ struct llm_build_delta_net_base : public llm_graph_context {
|
||||
ggml_tensor * s,
|
||||
int il);
|
||||
|
||||
// true when speculative rollback is enabled and the batch fits in the rs cache
|
||||
bool keep_rs() const;
|
||||
|
||||
// read conv state from cache, concat with qkv_mixed, write back (single slot or per-token)
|
||||
// qkv_mixed: (qkv_dim, n_seq_tokens, n_seqs); returns conv_input: (kernel_size + n_seq_tokens - 1, channels, n_seqs)
|
||||
ggml_tensor * build_conv_state(
|
||||
|
||||
@@ -176,7 +176,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para
|
||||
cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il);
|
||||
}
|
||||
|
||||
if (il == n_transformer_layers - 1 && inp_out_ids) {
|
||||
if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_pre_norm_masked) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
@@ -211,6 +211,10 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para
|
||||
cb(cur, "h_pre_norm", -1);
|
||||
res->t_h_pre_norm = cur;
|
||||
|
||||
if (!cparams.embeddings_pre_norm_masked && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
}
|
||||
|
||||
// Final norm
|
||||
cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
|
||||
|
||||
|
||||
@@ -199,7 +199,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p
|
||||
cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il);
|
||||
}
|
||||
|
||||
if (il == n_transformer_layers - 1 && inp_out_ids) {
|
||||
if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_pre_norm_masked) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
@@ -234,6 +234,10 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p
|
||||
cb(cur, "h_pre_norm", -1);
|
||||
res->t_h_pre_norm = cur;
|
||||
|
||||
if (!cparams.embeddings_pre_norm_masked && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
}
|
||||
|
||||
// Final norm
|
||||
cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
|
||||
|
||||
|
||||
@@ -4851,6 +4851,21 @@ struct test_rope : public test_case {
|
||||
|
||||
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
|
||||
ggml_set_name(a, "view_of_a");
|
||||
} else if (v == 2) {
|
||||
// second-half slice along dim 0 (mimics build_rope_2d in clip.cpp).
|
||||
// The non-zero view offset (ne_a[0] * elem_size) often produces a
|
||||
// non-aligned buffer offset, which exercises backends' alignment paths.
|
||||
auto ne = ne_a; ne[0] *= 2;
|
||||
a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
if (forward) {
|
||||
ggml_set_param(a);
|
||||
}
|
||||
ggml_set_name(a, "a");
|
||||
|
||||
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3],
|
||||
a->nb[1], a->nb[2], a->nb[3],
|
||||
ne_a[0] * ggml_element_size(a));
|
||||
ggml_set_name(a, "view_of_a");
|
||||
} else {
|
||||
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
||||
if (forward) {
|
||||
@@ -4913,8 +4928,6 @@ struct test_rope : public test_case {
|
||||
} else {
|
||||
out = ggml_rope_ext_back(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
|
||||
}
|
||||
|
||||
// TODO: add test with a non-contiguous view as input ; this case is needed for build_rope_2d in clip.cpp
|
||||
}
|
||||
ggml_set_name(out, "out");
|
||||
|
||||
@@ -8687,6 +8700,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
|
||||
test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, GGML_ROPE_TYPE_NEOX, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B)
|
||||
}
|
||||
|
||||
// build_rope_2d-style: ROPE on a non-contiguous view
|
||||
// that starts at a non-zero offset along dim 0
|
||||
// (e.g. gemma4v vision second-half view).
|
||||
for (int rmode : { GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX, GGML_ROPE_TYPE_MROPE, GGML_ROPE_TYPE_IMROPE, GGML_ROPE_TYPE_VISION }) {
|
||||
test_cases.emplace_back(new test_rope(type, { 36, 16, 2457, 1}, 36, rmode, 512, fs, ef, af, ff, 2, fw));
|
||||
}
|
||||
}
|
||||
|
||||
all = false;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -191,10 +191,10 @@
|
||||
| `--spec-draft-override-tensor, -otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
|
||||
| `--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) |
|
||||
| `--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) |
|
||||
| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) |
|
||||
| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 3)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) |
|
||||
| `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) |
|
||||
| `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) |
|
||||
| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) |
|
||||
| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) |
|
||||
| `--spec-draft-device, -devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
|
||||
| `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
|
||||
| `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_MODEL) |
|
||||
|
||||
@@ -183,6 +183,7 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
|
||||
| `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)<br/>(env: LLAMA_ARG_ALIAS) |
|
||||
| `--tags STRING` | set model tags, comma-separated (informational, not used for routing)<br/>(env: LLAMA_ARG_TAGS) |
|
||||
| `--embd-normalize N` | normalisation for embeddings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) |
|
||||
| `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
|
||||
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
|
||||
| `--reuse-port` | allow multiple sockets to bind to the same port (default: disabled)<br/>(env: LLAMA_ARG_REUSE_PORT) |
|
||||
@@ -244,10 +245,10 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `--spec-draft-override-tensor, -otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
|
||||
| `--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) |
|
||||
| `--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) |
|
||||
| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) |
|
||||
| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 3)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) |
|
||||
| `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) |
|
||||
| `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) |
|
||||
| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) |
|
||||
| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) |
|
||||
| `--spec-draft-device, -devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
|
||||
| `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
|
||||
| `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_MODEL) |
|
||||
|
||||
@@ -1032,23 +1032,33 @@ json oaicompat_chat_params_parse(
|
||||
auto caps = common_chat_templates_get_caps(opt.tmpls.get());
|
||||
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.messages = common_chat_msgs_parse_oaicompat(messages);
|
||||
inputs.tools = common_chat_tools_parse_oaicompat(tools);
|
||||
inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(tool_choice);
|
||||
inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
|
||||
inputs.grammar = grammar;
|
||||
inputs.use_jinja = opt.use_jinja;
|
||||
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", caps["supports_parallel_tool_calls"]);
|
||||
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
|
||||
const bool continue_final_message = json_value(body, "continue_final_message", false);
|
||||
if (continue_final_message && inputs.add_generation_prompt) {
|
||||
inputs.messages = common_chat_msgs_parse_oaicompat(messages);
|
||||
inputs.tools = common_chat_tools_parse_oaicompat(tools);
|
||||
inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(tool_choice);
|
||||
inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
|
||||
inputs.grammar = grammar;
|
||||
inputs.use_jinja = opt.use_jinja;
|
||||
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", caps["supports_parallel_tool_calls"]);
|
||||
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
|
||||
inputs.continue_final_message = body.contains("continue_final_message") ?
|
||||
common_chat_continuation_parse(body.at("continue_final_message")) :
|
||||
COMMON_CHAT_CONTINUATION_NONE;
|
||||
if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_NONE && opt.prefill_assistant
|
||||
&& !inputs.messages.empty() && inputs.messages.back().role == "assistant") {
|
||||
if (inputs.messages.size() >= 2 && inputs.messages[inputs.messages.size() - 2].role == "assistant") {
|
||||
throw std::invalid_argument("Cannot have 2 or more assistant messages at the end of the list.");
|
||||
}
|
||||
inputs.continue_final_message = COMMON_CHAT_CONTINUATION_AUTO;
|
||||
inputs.add_generation_prompt = false;
|
||||
}
|
||||
if (inputs.continue_final_message != COMMON_CHAT_CONTINUATION_NONE && inputs.add_generation_prompt) {
|
||||
throw std::invalid_argument("Cannot set both add_generation_prompt and continue_final_message to true.");
|
||||
}
|
||||
inputs.reasoning_format = opt.reasoning_format;
|
||||
inputs.reasoning_format = opt.reasoning_format;
|
||||
if (body.contains("reasoning_format")) {
|
||||
inputs.reasoning_format = common_reasoning_format_from_name(body.at("reasoning_format").get<std::string>());
|
||||
}
|
||||
inputs.enable_thinking = opt.enable_thinking;
|
||||
inputs.enable_thinking = opt.enable_thinking;
|
||||
if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
if (body.contains("grammar")) {
|
||||
throw std::invalid_argument("Cannot use custom grammar constraints with tools.");
|
||||
@@ -1073,84 +1083,11 @@ json oaicompat_chat_params_parse(
|
||||
throw std::invalid_argument("invalid type for \"enable_thinking\" (expected boolean, got string)");
|
||||
}
|
||||
|
||||
// if the assistant message appears at the end of list, we do not add end-of-turn token
|
||||
// for ex. this can be useful to modify the reasoning process in reasoning models
|
||||
// continue_final_message is the explicit opt in alias from the vLLM/transformers API,
|
||||
// equivalent to the prefill_assistant heuristic
|
||||
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant"
|
||||
&& (continue_final_message || opt.prefill_assistant);
|
||||
common_chat_msg last_message;
|
||||
if (prefill_assistant_message) {
|
||||
last_message = inputs.messages.back();
|
||||
inputs.messages.pop_back();
|
||||
|
||||
/* sanity check, max one assistant message at the end of the list */
|
||||
if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
|
||||
throw std::invalid_argument("Cannot have 2 or more assistant messages at the end of the list.");
|
||||
}
|
||||
|
||||
// reject reasoning prefill on channel based templates that do not expose explicit thinking tags
|
||||
if (!last_message.reasoning_content.empty() && inputs.enable_thinking) {
|
||||
auto probe_params = common_chat_templates_apply(opt.tmpls.get(), inputs);
|
||||
if (probe_params.supports_thinking && probe_params.thinking_end_tag.empty()) {
|
||||
throw std::invalid_argument("Assistant prefill with reasoning_content is not supported yet for this template.");
|
||||
}
|
||||
}
|
||||
|
||||
inputs.add_generation_prompt = true;
|
||||
}
|
||||
inputs.force_pure_content = opt.force_pure_content;
|
||||
|
||||
// Apply chat template to the list of messages
|
||||
auto chat_params = common_chat_templates_apply(opt.tmpls.get(), inputs);
|
||||
|
||||
/* Append assistant prefilled message */
|
||||
if (prefill_assistant_message) {
|
||||
const bool thinking_active = chat_params.supports_thinking && !chat_params.thinking_end_tag.empty();
|
||||
const bool has_reasoning = !last_message.reasoning_content.empty();
|
||||
const bool has_content = !last_message.content.empty() || !last_message.content_parts.empty();
|
||||
const bool mid_reasoning = has_reasoning && !has_content;
|
||||
|
||||
// some templates inject thinking_start in generation_prompt, others let the model emit it
|
||||
const bool gp_has_think = thinking_active
|
||||
&& chat_params.generation_prompt.find(chat_params.thinking_start_tag) != std::string::npos;
|
||||
|
||||
// open the thinking block when reasoning is present and the template did not inject it
|
||||
if (has_reasoning) {
|
||||
if (thinking_active && !gp_has_think) {
|
||||
chat_params.prompt += chat_params.thinking_start_tag;
|
||||
}
|
||||
chat_params.prompt += last_message.reasoning_content;
|
||||
}
|
||||
|
||||
if (thinking_active) {
|
||||
if (mid_reasoning) {
|
||||
// model continues inside the thinking block, keep generation_prompt open on think
|
||||
if (!gp_has_think) {
|
||||
chat_params.generation_prompt += chat_params.thinking_start_tag;
|
||||
}
|
||||
} else {
|
||||
// close thinking block when reasoning is followed by content, or when the template forced it open
|
||||
if (has_reasoning || gp_has_think) {
|
||||
chat_params.prompt += chat_params.thinking_end_tag;
|
||||
}
|
||||
// strip thinking_start from generation_prompt so the parser routes model output as content
|
||||
auto pos = chat_params.generation_prompt.rfind(chat_params.thinking_start_tag);
|
||||
if (pos != std::string::npos) {
|
||||
chat_params.generation_prompt = chat_params.generation_prompt.substr(0, pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!last_message.content_parts.empty()) {
|
||||
for (auto & p : last_message.content_parts) {
|
||||
chat_params.prompt += p.text;
|
||||
}
|
||||
} else {
|
||||
chat_params.prompt += last_message.content;
|
||||
}
|
||||
}
|
||||
|
||||
llama_params["chat_format"] = static_cast<int>(chat_params.format);
|
||||
llama_params["prompt"] = chat_params.prompt;
|
||||
if (!chat_params.grammar.empty()) {
|
||||
|
||||
@@ -243,6 +243,11 @@ struct server_slot {
|
||||
return task->need_embd() || (spec && common_speculative_need_embd(spec));
|
||||
}
|
||||
|
||||
bool need_embd_pre_norm() const {
|
||||
GGML_ASSERT(task);
|
||||
return spec && common_speculative_need_embd_pre_norm(spec);
|
||||
}
|
||||
|
||||
// if the context does not have a memory module then all embeddings have to be computed within a single ubatch
|
||||
// also we cannot split if the pooling would require any past tokens
|
||||
// (MTP supports splitting — uses task->need_embd() not need_embd())
|
||||
@@ -462,20 +467,26 @@ struct server_slot {
|
||||
const double n_gen_second = 1e3 / t_token_generation * n_decoded;
|
||||
|
||||
SLT_INF(*this,
|
||||
"\n"
|
||||
"prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
|
||||
" eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
|
||||
"prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second);
|
||||
|
||||
SLT_INF(*this,
|
||||
" eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
t_token_generation, n_decoded, t_gen, n_gen_second);
|
||||
|
||||
SLT_INF(*this,
|
||||
" total time = %10.2f ms / %5d tokens\n",
|
||||
t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
|
||||
t_token_generation, n_decoded, t_gen, n_gen_second,
|
||||
t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
|
||||
|
||||
SLT_INF(*this,
|
||||
" graphs reused = %10d\n",
|
||||
llama_perf_context(ctx_tgt).n_reused);
|
||||
|
||||
if (n_draft_total > 0) {
|
||||
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
|
||||
SLT_CNT(*this,
|
||||
"draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
|
||||
draft_ratio, n_draft_accepted, n_draft_total
|
||||
);
|
||||
SLT_INF(*this,
|
||||
"draft acceptance = %0.5f (%5d accepted / %5d generated)\n",
|
||||
draft_ratio, n_draft_accepted, n_draft_total);
|
||||
}
|
||||
|
||||
common_speculative_print_stats(spec);
|
||||
@@ -2578,9 +2589,9 @@ private:
|
||||
llama_pos pos_next = slot.prompt.tokens.pos_next(n_past);
|
||||
|
||||
// the largest pos_min required for a checkpoint to be useful
|
||||
const auto pos_min_thold = std::max(0, pos_next - n_swa);
|
||||
const auto pos_min_thold = std::max(0, pos_next - n_swa - 1);
|
||||
|
||||
if (n_past > 0 && n_past < slot.prompt.n_tokens()) {
|
||||
if (n_past > 0 && n_past <= slot.prompt.n_tokens()) {
|
||||
const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id);
|
||||
if (pos_min == -1) {
|
||||
SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min);
|
||||
@@ -3880,6 +3891,7 @@ void server_routes::init_routes() {
|
||||
{ "eos_token", meta->eos_token_str },
|
||||
{ "build_info", meta->build_info },
|
||||
{ "is_sleeping", queue_tasks.is_sleeping() },
|
||||
{ "cors_proxy_enabled", params.ui_mcp_proxy || params.webui_mcp_proxy },
|
||||
};
|
||||
if (params.use_jinja) {
|
||||
if (!tmpl_tools.empty()) {
|
||||
@@ -4527,7 +4539,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_embeddings_impl(cons
|
||||
}
|
||||
}
|
||||
|
||||
int embd_normalize = 2; // default to Euclidean/L2 norm
|
||||
int embd_normalize = params.embd_normalize;
|
||||
if (body.count("embd_normalize") != 0) {
|
||||
embd_normalize = body.at("embd_normalize");
|
||||
if (meta->pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||
|
||||
@@ -231,11 +231,10 @@ bool server_http_context::init(const common_params & params) {
|
||||
};
|
||||
|
||||
auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) {
|
||||
(void)req; // suppress unused parameter warning when LLAMA_BUILD_UI / LLAMA_BUILD_WEBUI is not defined
|
||||
(void)req; // suppress unused parameter warning when LLAMA_BUILD_UI is not defined
|
||||
bool ready = is_ready.load();
|
||||
if (!ready) {
|
||||
// Support both old and new preprocessor defines
|
||||
#if defined(LLAMA_BUILD_UI) || defined(LLAMA_BUILD_WEBUI)
|
||||
#if defined(LLAMA_BUILD_UI)
|
||||
auto tmp = string_split<std::string>(req.path, '.');
|
||||
if (req.path == "/" || (tmp.size() > 0 && tmp.back() == "html")) {
|
||||
res.status = 503;
|
||||
@@ -313,8 +312,7 @@ bool server_http_context::init(const common_params & params) {
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
// Support both old and new preprocessor defines
|
||||
#if defined(LLAMA_BUILD_UI) || defined(LLAMA_BUILD_WEBUI)
|
||||
#if defined(LLAMA_BUILD_UI)
|
||||
// using embedded static index.html
|
||||
srv->Get(params.api_prefix + "/", [](const httplib::Request & /*req*/, httplib::Response & res) {
|
||||
// COEP and COOP headers, required by pyodide (python interpreter)
|
||||
|
||||
@@ -798,9 +798,10 @@ void server_models::load(const std::string & name) {
|
||||
std::thread log_thread([&]() {
|
||||
// read stdout/stderr and forward to main server log
|
||||
// also handle status report from child process
|
||||
std::vector<char> vec_buf(128 * 1024); // large buffer for storing info
|
||||
char * buffer = vec_buf.data();
|
||||
if (stdout_file) {
|
||||
char buffer[128 * 1024]; // large buffer for storing info
|
||||
while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) {
|
||||
while (fgets(buffer, vec_buf.size(), stdout_file) != nullptr) {
|
||||
LOG("[%5d] %s", port, buffer);
|
||||
std::string str(buffer);
|
||||
if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
|
||||
@@ -1164,6 +1165,7 @@ void server_models_routes::init_routes() {
|
||||
// Deprecated: use ui_settings instead (kept for backward compat)
|
||||
{"webui_settings", webui_settings},
|
||||
{"build_info", std::string(llama_build_info())},
|
||||
{"cors_proxy_enabled", params.ui_mcp_proxy || params.webui_mcp_proxy},
|
||||
});
|
||||
return res;
|
||||
}
|
||||
|
||||
@@ -144,6 +144,17 @@ json task_params::to_json(bool only_metrics) const {
|
||||
//
|
||||
// task_result_state
|
||||
//
|
||||
task_result_state::task_result_state(const common_chat_parser_params & chat_parser_params)
|
||||
: chat_parser_params(chat_parser_params)
|
||||
, oai_resp_id("resp_" + random_string())
|
||||
, oai_resp_reasoning_id("rs_" + random_string())
|
||||
, oai_resp_message_id("msg_" + random_string()) {
|
||||
if (!chat_parser_params.echo) {
|
||||
// initialize chat_msg to avoid emitting a delta containing the assistant prefill
|
||||
chat_msg = common_chat_parse("", true, chat_parser_params);
|
||||
}
|
||||
}
|
||||
|
||||
common_chat_msg task_result_state::update_chat_msg(
|
||||
const std::string & text_added,
|
||||
bool is_partial,
|
||||
@@ -421,6 +432,7 @@ task_params server_task::params_from_json_cmpl(
|
||||
if (data.contains("chat_parser")) {
|
||||
params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
|
||||
}
|
||||
params.chat_parser_params.echo = json_value(data, "echo", false);
|
||||
}
|
||||
|
||||
{
|
||||
|
||||
@@ -112,11 +112,7 @@ struct task_result_state {
|
||||
const std::string oai_resp_message_id;
|
||||
std::string oai_resp_fc_id; // function call ID for current args delta
|
||||
|
||||
task_result_state(const common_chat_parser_params & chat_parser_params)
|
||||
: chat_parser_params(chat_parser_params)
|
||||
, oai_resp_id("resp_" + random_string())
|
||||
, oai_resp_reasoning_id("rs_" + random_string())
|
||||
, oai_resp_message_id("msg_" + random_string()) {}
|
||||
task_result_state(const common_chat_parser_params & chat_parser_params);
|
||||
|
||||
// parse partial tool calls and update the internal state
|
||||
common_chat_msg update_chat_msg(
|
||||
|
||||
@@ -86,7 +86,10 @@ int main(int argc, char ** argv) {
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
common_params_print_info(params);
|
||||
// router server never loads a model and must not touch the GPU
|
||||
// skip device enumeration so the CUDA primary context stays uncreated
|
||||
const bool is_router_server = params.model.path.empty();
|
||||
common_params_print_info(params, !is_router_server);
|
||||
|
||||
// validate batch size for embeddings
|
||||
// embeddings require all tokens to be processed in a single ubatch
|
||||
@@ -126,7 +129,6 @@ int main(int argc, char ** argv) {
|
||||
server_routes routes(params, ctx_server);
|
||||
server_tools tools;
|
||||
|
||||
bool is_router_server = params.model.path.empty();
|
||||
std::optional<server_models_routes> models_routes{};
|
||||
if (is_router_server) {
|
||||
// setup server instances manager
|
||||
|
||||
@@ -158,11 +158,12 @@ def test_chat_template():
|
||||
|
||||
@pytest.mark.parametrize("prefill,re_prefill", [
|
||||
("Whill", "Whill"),
|
||||
([{"type": "text", "text": "Wh"}, {"type": "text", "text": "ill"}], "Whill"),
|
||||
([{"type": "text", "text": "Wh"}, {"type": "text", "text": "ill"}], "Wh\n\nill"),
|
||||
])
|
||||
def test_chat_template_assistant_prefill(prefill, re_prefill):
|
||||
global server
|
||||
server.chat_template = "llama3"
|
||||
server.jinja = True
|
||||
server.chat_template_file = "../../../models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja"
|
||||
server.debug = True # to get the "__verbose" object in the response
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
@@ -175,14 +176,15 @@ def test_chat_template_assistant_prefill(prefill, re_prefill):
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert "__verbose" in res.body
|
||||
assert res.body["__verbose"]["prompt"] == f"<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{re_prefill}"
|
||||
assert res.body["__verbose"]["prompt"].endswith(f"<|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{re_prefill}")
|
||||
|
||||
|
||||
def test_chat_template_continue_final_message_vllm_compat():
|
||||
"""continue_final_message is the vLLM/transformers explicit alias for the prefill_assistant heuristic.
|
||||
Both must produce the same prompt."""
|
||||
global server
|
||||
server.chat_template = "llama3"
|
||||
server.jinja = True
|
||||
server.chat_template_file = "../../../models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja"
|
||||
server.debug = True
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
@@ -197,7 +199,7 @@ def test_chat_template_continue_final_message_vllm_compat():
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert "__verbose" in res.body
|
||||
assert res.body["__verbose"]["prompt"] == "<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nWhill"
|
||||
assert res.body["__verbose"]["prompt"].endswith("<|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nWhill")
|
||||
|
||||
|
||||
def test_chat_template_continue_final_message_mutual_exclusion():
|
||||
|
||||
2
tools/ui/.env.example
Normal file
2
tools/ui/.env.example
Normal file
@@ -0,0 +1,2 @@
|
||||
VITE_PUBLIC_APP_NAME='llama-ui'
|
||||
# VITE_DEBUG='true'
|
||||
2
tools/ui/.gitignore
vendored
2
tools/ui/.gitignore
vendored
@@ -25,4 +25,4 @@ vite.config.ts.timestamp-*
|
||||
|
||||
*storybook.log
|
||||
storybook-static
|
||||
*.code-workspace
|
||||
*.code-workspace
|
||||
|
||||
@@ -14,12 +14,7 @@ endif()
|
||||
set(TARGET_SRCS "")
|
||||
set(UI_COMPILE_DEFS "")
|
||||
|
||||
# Support both old (LLAMA_BUILD_WEBUI) and new (LLAMA_BUILD_UI) option names
|
||||
if(LLAMA_BUILD_WEBUI OR LLAMA_BUILD_UI)
|
||||
if(LLAMA_BUILD_WEBUI AND NOT LLAMA_BUILD_UI)
|
||||
message(DEPRECATION "LLAMA_BUILD_WEBUI is deprecated, use LLAMA_BUILD_UI instead")
|
||||
endif()
|
||||
|
||||
if(LLAMA_BUILD_UI)
|
||||
set(PUBLIC_ASSETS
|
||||
index.html
|
||||
bundle.js
|
||||
@@ -125,19 +120,17 @@ if(LLAMA_BUILD_WEBUI OR LLAMA_BUILD_UI)
|
||||
endforeach()
|
||||
|
||||
list(APPEND UI_COMPILE_DEFS
|
||||
LLAMA_BUILD_WEBUI # Deprecated: use LLAMA_BUILD_UI
|
||||
LLAMA_BUILD_UI
|
||||
LLAMA_WEBUI_DEFAULT_ENABLED=1 # Deprecated: use LLAMA_UI_DEFAULT_ENABLED
|
||||
LLAMA_UI_DEFAULT_ENABLED=1
|
||||
)
|
||||
message(STATUS "UI: embedded with source: ${UI_SOURCE}")
|
||||
else()
|
||||
message(WARNING "UI: no source available. Neither local build (build/tools/ui/dist/) nor HF Bucket download succeeded.")
|
||||
message(WARNING "UI: building server without embedded UI. Set LLAMA_BUILD_UI=OFF to suppress this warning.")
|
||||
list(APPEND UI_COMPILE_DEFS LLAMA_WEBUI_DEFAULT_ENABLED=0 LLAMA_UI_DEFAULT_ENABLED=0)
|
||||
list(APPEND UI_COMPILE_DEFS LLAMA_UI_DEFAULT_ENABLED=0)
|
||||
endif()
|
||||
else()
|
||||
list(APPEND UI_COMPILE_DEFS LLAMA_WEBUI_DEFAULT_ENABLED=0 LLAMA_UI_DEFAULT_ENABLED=0)
|
||||
list(APPEND UI_COMPILE_DEFS LLAMA_UI_DEFAULT_ENABLED=0)
|
||||
endif()
|
||||
|
||||
# Build the static library
|
||||
|
||||
@@ -20,9 +20,7 @@ export default ts.config(
|
||||
prettier,
|
||||
...svelte.configs.prettier,
|
||||
{
|
||||
languageOptions: {
|
||||
globals: { ...globals.browser, ...globals.node }
|
||||
},
|
||||
languageOptions: { globals: { ...globals.browser, ...globals.node } },
|
||||
rules: {
|
||||
// typescript-eslint strongly recommend that you do not use the no-undef lint rule on TypeScript projects.
|
||||
// see: https://typescript-eslint.io/troubleshooting/faqs/eslint/#i-get-errors-from-the-no-undef-rule-about-global-variables-not-being-defined-even-though-there-are-no-typescript-errors
|
||||
@@ -30,6 +28,7 @@ export default ts.config(
|
||||
'svelte/no-at-html-tags': 'off',
|
||||
// This app uses hash-based routing (#/) where resolve() from $app/paths does not apply
|
||||
'svelte/no-navigation-without-resolve': 'off',
|
||||
|
||||
// Enforce empty line at end of file
|
||||
'eol-last': 'error'
|
||||
}
|
||||
|
||||
60
tools/ui/package-lock.json
generated
60
tools/ui/package-lock.json
generated
@@ -2307,9 +2307,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@sveltejs/kit": {
|
||||
"version": "2.59.1",
|
||||
"resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.59.1.tgz",
|
||||
"integrity": "sha512-d8OON70AphLdDesuTIl//M2O6fRTIicX8aYv8vhCiYEhTTI2OboKqey0Hu1A4VFhqwgqtq0vKDmPFGkw8kKmgw==",
|
||||
"version": "2.60.1",
|
||||
"resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.60.1.tgz",
|
||||
"integrity": "sha512-mQjlkNo+rJvpln7V2IGY2j99BqhcFbS4UN0AQNKNYfhBAFZTuCDAdW3a1sgf330mvtNvsBXn3HpAhcmvdJTcIQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
@@ -2318,7 +2318,7 @@
|
||||
"@types/cookie": "^0.6.0",
|
||||
"acorn": "^8.14.1",
|
||||
"cookie": "^0.6.0",
|
||||
"devalue": "^5.6.4",
|
||||
"devalue": "^5.8.1",
|
||||
"esm-env": "^1.2.2",
|
||||
"kleur": "^4.1.5",
|
||||
"magic-string": "^0.30.5",
|
||||
@@ -4296,9 +4296,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/devalue": {
|
||||
"version": "5.6.4",
|
||||
"resolved": "https://registry.npmjs.org/devalue/-/devalue-5.6.4.tgz",
|
||||
"integrity": "sha512-Gp6rDldRsFh/7XuouDbxMH3Mx8GMCcgzIb1pDTvNyn8pZGQ22u+Wa+lGV9dQCltFQ7uVw0MhRyb8XDskNFOReA==",
|
||||
"version": "5.8.1",
|
||||
"resolved": "https://registry.npmjs.org/devalue/-/devalue-5.8.1.tgz",
|
||||
"integrity": "sha512-4CXDYRBGqN+57wVJkuXBYmpAVUSg3L6JAQa/DFqm238G73E1wuyc/JhGQJzN7vUf/CMphYau2zXbfWzDR5aTEw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/devlop": {
|
||||
@@ -4856,12 +4856,12 @@
|
||||
}
|
||||
},
|
||||
"node_modules/express-rate-limit": {
|
||||
"version": "8.5.0",
|
||||
"resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.0.tgz",
|
||||
"integrity": "sha512-XKhFohWaSBdVJNTi5TaHziqnPkv04I9UQV6q1Wy7Ui6GGQZVW12ojDFwqer14EvCXxjvPG0CyWXx7cAXpALB4Q==",
|
||||
"version": "8.5.2",
|
||||
"resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.2.tgz",
|
||||
"integrity": "sha512-5Kb34ipNX694DH48vN9irak1Qx30nb0PLYHXfJgw4YEjiC3ZEmZJhwOp+VfiCYwFzvFTdB9QkArYS5kXa2cx2A==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"ip-address": "10.1.0"
|
||||
"ip-address": "^10.2.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 16"
|
||||
@@ -4909,9 +4909,9 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/fast-uri": {
|
||||
"version": "3.1.0",
|
||||
"resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz",
|
||||
"integrity": "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==",
|
||||
"version": "3.1.2",
|
||||
"resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.2.tgz",
|
||||
"integrity": "sha512-rVjf7ArG3LTk+FS6Yw81V1DLuZl1bRbNrev6Tmd/9RaroeeRRJhAt7jg/6YFxbvAQXUCavSoZhPPj6oOx+5KjQ==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
@@ -5541,9 +5541,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/hono": {
|
||||
"version": "4.12.14",
|
||||
"resolved": "https://registry.npmjs.org/hono/-/hono-4.12.14.tgz",
|
||||
"integrity": "sha512-am5zfg3yu6sqn5yjKBNqhnTX7Cv+m00ox+7jbaKkrLMRJ4rAdldd1xPd/JzbBWspqaQv6RSTrgFN95EsfhC+7w==",
|
||||
"version": "4.12.19",
|
||||
"resolved": "https://registry.npmjs.org/hono/-/hono-4.12.19.tgz",
|
||||
"integrity": "sha512-xa3eYXYXx68XTT4hZ7dRzsXBhaq85ToSrlUJNoR0gwz/1Ap/CNwX47wfvV7pc/xWhjKVVkLT7zBJy8chhNguqQ==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=16.9.0"
|
||||
@@ -5722,9 +5722,9 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/ip-address": {
|
||||
"version": "10.1.0",
|
||||
"resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz",
|
||||
"integrity": "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==",
|
||||
"version": "10.2.0",
|
||||
"resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.2.0.tgz",
|
||||
"integrity": "sha512-/+S6j4E9AHvW9SWMSEY9Xfy66O5PWvVEJ08O0y5JGyEKQpojb0K0GKpz/v5HJ/G0vi3D2sjGK78119oXZeE0qA==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 12"
|
||||
@@ -6008,9 +6008,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/katex": {
|
||||
"version": "0.16.22",
|
||||
"resolved": "https://registry.npmjs.org/katex/-/katex-0.16.22.tgz",
|
||||
"integrity": "sha512-XCHRdUw4lf3SKBaJe4EvgqIuWwkPSo9XoeO8GjQW94Bp7TWv9hNhzZjZ+OH9yf1UmLygb7DIT5GSFQiyt16zYg==",
|
||||
"version": "0.16.47",
|
||||
"resolved": "https://registry.npmjs.org/katex/-/katex-0.16.47.tgz",
|
||||
"integrity": "sha512-Eeo8Ys1doU1z+x8AZsPpQu+p/QcZBI5PeOo7QGQdy2x2m0MU/hYagBbGOmXwr5KVbEfVuWv9LpnQWeehogurjg==",
|
||||
"dev": true,
|
||||
"funding": [
|
||||
"https://opencollective.com/katex",
|
||||
@@ -9245,9 +9245,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/svelte": {
|
||||
"version": "5.55.1",
|
||||
"resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.1.tgz",
|
||||
"integrity": "sha512-QjvU7EFemf6mRzdMGlAFttMWtAAVXrax61SZYHdkD6yoVGQ89VeyKfZD4H1JrV1WLmJBxWhFch9H6ig/87VGjw==",
|
||||
"version": "5.55.7",
|
||||
"resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.7.tgz",
|
||||
"integrity": "sha512-ymI5ykLPwIHW839E053FQbI1G+jnRFJEw3Kv5Y4njixVWywQBx+NUFpkkKyk5LIb36Fg9DVXSYpqiGekLD0hyw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@jridgewell/remapping": "^2.3.4",
|
||||
@@ -9259,7 +9259,7 @@
|
||||
"aria-query": "5.3.1",
|
||||
"axobject-query": "^4.1.0",
|
||||
"clsx": "^2.1.1",
|
||||
"devalue": "^5.6.4",
|
||||
"devalue": "^5.8.1",
|
||||
"esm-env": "^1.2.1",
|
||||
"esrap": "^2.2.4",
|
||||
"is-reference": "^3.0.3",
|
||||
@@ -10606,9 +10606,9 @@
|
||||
"license": "ISC"
|
||||
},
|
||||
"node_modules/ws": {
|
||||
"version": "8.18.3",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz",
|
||||
"integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==",
|
||||
"version": "8.20.1",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-8.20.1.tgz",
|
||||
"integrity": "sha512-It4dO0K5v//JtTXuPkfEOaI3uUN87iYPnqo/ZzqCoG3g8uhA66QUMs/SrM0YK7/NAu+r4LMh/9dq2A7k+rHs+w==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
@import 'tailwindcss';
|
||||
|
||||
@source '.';
|
||||
@plugin '@tailwindcss/forms';
|
||||
@plugin '@tailwindcss/typography';
|
||||
@import 'tw-animate-css';
|
||||
|
||||
@custom-variant dark (&:is(.dark *));
|
||||
@@ -39,6 +41,9 @@
|
||||
--sidebar-ring: oklch(0.708 0 0);
|
||||
--code-background: oklch(0.985 0 0);
|
||||
--code-foreground: oklch(0.145 0 0);
|
||||
--font-mono:
|
||||
ui-monospace, SFMono-Regular, 'SF Mono', Monaco, 'Cascadia Code', 'Roboto Mono', Consolas,
|
||||
'Liberation Mono', Menlo, monospace;
|
||||
--layer-popover: 1000000;
|
||||
|
||||
--chat-form-area-height: 8rem;
|
||||
@@ -171,6 +176,10 @@
|
||||
*::-webkit-scrollbar-thumb:hover {
|
||||
background: hsl(var(--muted-foreground) / 0.5);
|
||||
}
|
||||
|
||||
:where(code, pre, kbd, samp) {
|
||||
font-family: var(--font-mono);
|
||||
}
|
||||
}
|
||||
|
||||
@layer utilities {
|
||||
|
||||
2
tools/ui/src/app.d.ts
vendored
2
tools/ui/src/app.d.ts
vendored
@@ -39,6 +39,7 @@ import type {
|
||||
DatabaseMessage,
|
||||
DatabaseMessageExtra,
|
||||
DatabaseMessageExtraAudioFile,
|
||||
DatabaseMessageExtraVideoFile,
|
||||
DatabaseMessageExtraImageFile,
|
||||
DatabaseMessageExtraTextFile,
|
||||
DatabaseMessageExtraPdfFile,
|
||||
@@ -102,6 +103,7 @@ declare global {
|
||||
DatabaseMessage,
|
||||
DatabaseMessageExtra,
|
||||
DatabaseMessageExtraAudioFile,
|
||||
DatabaseMessageExtraVideoFile,
|
||||
DatabaseMessageExtraImageFile,
|
||||
DatabaseMessageExtraTextFile,
|
||||
DatabaseMessageExtraPdfFile,
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user