mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-12 14:03:20 +02:00
Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8f1d81a0b6 | ||
|
|
a47667cff4 | ||
|
|
ea5d7478b1 | ||
|
|
49271efbaf | ||
|
|
0ab30f8d82 | ||
|
|
cddae4884c | ||
|
|
7ea8d80d53 | ||
|
|
42c76d1358 | ||
|
|
9f7d4bcf5c | ||
|
|
1d1ccce676 | ||
|
|
9fe94ccac9 | ||
|
|
66b039a501 |
@@ -1,18 +1,16 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=11.7.1
|
||||
|
||||
ARG CUDA_VERSION=12.6.0
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
# CUDA architecture to build for (defaults to all supported archs)
|
||||
ARG CUDA_DOCKER_ARCH=default
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
|
||||
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
@@ -24,13 +22,12 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
# Enable CUDA
|
||||
ENV GGML_CUDA=1
|
||||
# Enable cURL
|
||||
ENV LLAMA_CURL=1
|
||||
|
||||
RUN make -j$(nproc)
|
||||
# Use the default CUDA archs if not specified
|
||||
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release --target llama-cli -j$(nproc) && \
|
||||
cp build/bin/* .
|
||||
|
||||
ENTRYPOINT ["/app/.devops/tools.sh"]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=11.7.1
|
||||
ARG CUDA_VERSION=12.6.0
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
# Target the CUDA runtime image
|
||||
@@ -8,28 +8,30 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
# CUDA architecture to build for (defaults to all supported archs)
|
||||
ARG CUDA_DOCKER_ARCH=default
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git
|
||||
apt-get install -y build-essential git cmake
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
# Enable CUDA
|
||||
ENV GGML_CUDA=1
|
||||
|
||||
RUN make -j$(nproc) llama-cli
|
||||
# Use the default CUDA archs if not specified
|
||||
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release --target llama-cli -j$(nproc)
|
||||
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libgomp1
|
||||
|
||||
COPY --from=build /app/llama-cli /llama-cli
|
||||
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
|
||||
COPY --from=build /app/build/src/libllama.so /libllama.so
|
||||
COPY --from=build /app/build/bin/llama-cli /llama-cli
|
||||
|
||||
ENTRYPOINT [ "/llama-cli" ]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=11.7.1
|
||||
ARG CUDA_VERSION=12.6.0
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
# Target the CUDA runtime image
|
||||
@@ -8,33 +8,34 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
# CUDA architecture to build for (defaults to all supported archs)
|
||||
ARG CUDA_DOCKER_ARCH=default
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git libcurl4-openssl-dev
|
||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
# Enable CUDA
|
||||
ENV GGML_CUDA=1
|
||||
# Enable cURL
|
||||
ENV LLAMA_CURL=1
|
||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
RUN make -j$(nproc) llama-server
|
||||
# Use the default CUDA archs if not specified
|
||||
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release --target llama-server -j$(nproc)
|
||||
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev libgomp1 curl
|
||||
|
||||
COPY --from=build /app/llama-server /llama-server
|
||||
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
|
||||
COPY --from=build /app/build/src/libllama.so /libllama.so
|
||||
COPY --from=build /app/build/bin/llama-server /llama-server
|
||||
|
||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
mpi,
|
||||
blas,
|
||||
cudaPackages,
|
||||
autoAddDriverRunpath,
|
||||
darwin,
|
||||
rocmPackages,
|
||||
vulkan-headers,
|
||||
@@ -192,10 +193,7 @@ effectiveStdenv.mkDerivation (
|
||||
]
|
||||
++ optionals useCuda [
|
||||
cudaPackages.cuda_nvcc
|
||||
|
||||
# TODO: Replace with autoAddDriverRunpath
|
||||
# once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
|
||||
cudaPackages.autoAddOpenGLRunpathHook
|
||||
autoAddDriverRunpath
|
||||
]
|
||||
++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
|
||||
glibc.static
|
||||
|
||||
15
.github/workflows/docker.yml
vendored
15
.github/workflows/docker.yml
vendored
@@ -96,21 +96,12 @@ jobs:
|
||||
env:
|
||||
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
|
||||
|
||||
- name: Build and push Docker image (versioned)
|
||||
- name: Build and push Docker image (tagged + versioned)
|
||||
if: github.event_name == 'push'
|
||||
uses: docker/build-push-action@v4
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
|
||||
- name: Build and push Docker image (tagged)
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
push: ${{ github.event_name == 'push' }}
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
|
||||
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
|
||||
@@ -251,6 +251,57 @@ int32_t cpu_get_num_math() {
|
||||
return cpu_get_num_physical_cores();
|
||||
}
|
||||
|
||||
// Helper for setting process priority
|
||||
|
||||
#if defined(_WIN32)
|
||||
|
||||
bool set_process_priority(enum ggml_sched_priority prio) {
|
||||
if (prio == GGML_SCHED_PRIO_NORMAL) {
|
||||
return true;
|
||||
}
|
||||
|
||||
DWORD p = NORMAL_PRIORITY_CLASS;
|
||||
switch (prio) {
|
||||
case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
|
||||
case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
|
||||
case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
|
||||
case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
|
||||
}
|
||||
|
||||
if (!SetPriorityClass(GetCurrentProcess(), p)) {
|
||||
fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
#else // MacOS and POSIX
|
||||
#include <sys/types.h>
|
||||
#include <sys/resource.h>
|
||||
|
||||
bool set_process_priority(enum ggml_sched_priority prio) {
|
||||
if (prio == GGML_SCHED_PRIO_NORMAL) {
|
||||
return true;
|
||||
}
|
||||
|
||||
int p = 0;
|
||||
switch (prio) {
|
||||
case GGML_SCHED_PRIO_NORMAL: p = 0; break;
|
||||
case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
|
||||
case GGML_SCHED_PRIO_HIGH: p = -10; break;
|
||||
case GGML_SCHED_PRIO_REALTIME: p = -20; break;
|
||||
}
|
||||
|
||||
if (!setpriority(PRIO_PROCESS, 0, p)) {
|
||||
fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
//
|
||||
@@ -277,6 +328,30 @@ void gpt_params_handle_model_default(gpt_params & params) {
|
||||
}
|
||||
}
|
||||
|
||||
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
|
||||
int32_t n_set = 0;
|
||||
|
||||
if (cpuparams.n_threads < 0) {
|
||||
// Assuming everything about cpuparams is invalid
|
||||
if (role_model != nullptr) {
|
||||
cpuparams = *role_model;
|
||||
} else {
|
||||
cpuparams.n_threads = cpu_get_num_math();
|
||||
}
|
||||
}
|
||||
|
||||
for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
|
||||
if (cpuparams.cpumask[i]) {
|
||||
n_set++;
|
||||
}
|
||||
}
|
||||
|
||||
if (n_set && n_set < cpuparams.n_threads) {
|
||||
// Not enough set bits, may experience performance issues.
|
||||
fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
|
||||
}
|
||||
}
|
||||
|
||||
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
bool invalid_param = false;
|
||||
std::string arg;
|
||||
@@ -296,6 +371,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
}
|
||||
}
|
||||
|
||||
postprocess_cpu_params(params.cpuparams, nullptr);
|
||||
postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams);
|
||||
postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams);
|
||||
postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch);
|
||||
|
||||
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
|
||||
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
||||
}
|
||||
@@ -331,7 +411,7 @@ void gpt_params_parse_from_env(gpt_params & params) {
|
||||
get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias);
|
||||
get_env("LLAMA_ARG_HF_REPO", params.hf_repo);
|
||||
get_env("LLAMA_ARG_HF_FILE", params.hf_file);
|
||||
get_env("LLAMA_ARG_THREADS", params.n_threads);
|
||||
get_env("LLAMA_ARG_THREADS", params.cpuparams.n_threads);
|
||||
get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
|
||||
get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
|
||||
get_env("LLAMA_ARG_BATCH", params.n_batch);
|
||||
@@ -368,6 +448,79 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
|
||||
size_t dash_loc = range.find('-');
|
||||
if (dash_loc == std::string::npos) {
|
||||
fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t start_i;
|
||||
size_t end_i;
|
||||
|
||||
if (dash_loc == 0) {
|
||||
start_i = 0;
|
||||
} else {
|
||||
start_i = std::stoull(range.substr(0, dash_loc));
|
||||
if (start_i >= GGML_MAX_N_THREADS) {
|
||||
fprintf(stderr, "Start index out of bounds!\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (dash_loc == range.length() - 1) {
|
||||
end_i = GGML_MAX_N_THREADS - 1;
|
||||
} else {
|
||||
end_i = std::stoull(range.substr(dash_loc + 1));
|
||||
if (end_i >= GGML_MAX_N_THREADS) {
|
||||
fprintf(stderr, "End index out of bounds!\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = start_i; i <= end_i; i++) {
|
||||
boolmask[i] = true;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
|
||||
// Discard potential 0x prefix
|
||||
size_t start_i = 0;
|
||||
if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
|
||||
start_i = 2;
|
||||
}
|
||||
|
||||
size_t num_digits = mask.length() - start_i;
|
||||
if (num_digits > 128) num_digits = 128;
|
||||
|
||||
size_t end_i = num_digits + start_i;
|
||||
|
||||
for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
|
||||
char c = mask.at(i);
|
||||
int8_t id = c;
|
||||
|
||||
if ((c >= '0' && c <= '9')) {
|
||||
id -= '0';
|
||||
} else if (c >= 'a' && c <= 'f') {
|
||||
id -= 'a' - 10;
|
||||
} else if (c >= 'A' && c <= 'F') {
|
||||
id -= 'A' - 10;
|
||||
} else {
|
||||
fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
|
||||
return false;
|
||||
}
|
||||
|
||||
boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
|
||||
boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
|
||||
boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
|
||||
boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
|
||||
|
||||
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
|
||||
@@ -384,36 +537,142 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
}
|
||||
if (arg == "-t" || arg == "--threads") {
|
||||
CHECK_ARG
|
||||
params.n_threads = std::stoi(argv[i]);
|
||||
if (params.n_threads <= 0) {
|
||||
params.n_threads = std::thread::hardware_concurrency();
|
||||
params.cpuparams.n_threads = std::stoi(argv[i]);
|
||||
if (params.cpuparams.n_threads <= 0) {
|
||||
params.cpuparams.n_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (arg == "-C" || arg == "--cpu-mask") {
|
||||
CHECK_ARG
|
||||
std::string mask = argv[i];
|
||||
params.cpuparams.mask_valid = true;
|
||||
invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
|
||||
return true;
|
||||
}
|
||||
if (arg == "-Cr" || arg == "--cpu-range") {
|
||||
CHECK_ARG
|
||||
std::string range = argv[i];
|
||||
params.cpuparams.mask_valid = true;
|
||||
invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--prio") {
|
||||
CHECK_ARG
|
||||
params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--cpu-strict") {
|
||||
CHECK_ARG
|
||||
params.cpuparams.strict_cpu = std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--poll") {
|
||||
CHECK_ARG
|
||||
params.cpuparams.poll = std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "-tb" || arg == "--threads-batch") {
|
||||
CHECK_ARG
|
||||
params.n_threads_batch = std::stoi(argv[i]);
|
||||
if (params.n_threads_batch <= 0) {
|
||||
params.n_threads_batch = std::thread::hardware_concurrency();
|
||||
params.cpuparams_batch.n_threads = std::stoi(argv[i]);
|
||||
if (params.cpuparams_batch.n_threads <= 0) {
|
||||
params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (arg == "-Cb" || arg == "--cpu-mask-batch") {
|
||||
CHECK_ARG
|
||||
std::string mask = argv[i];
|
||||
params.cpuparams_batch.mask_valid = true;
|
||||
invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask);
|
||||
return true;
|
||||
}
|
||||
if (arg == "-Crb" || arg == "--cpu-range_batch") {
|
||||
CHECK_ARG
|
||||
std::string range = argv[i];
|
||||
params.cpuparams_batch.mask_valid = true;
|
||||
invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--prio-batch") {
|
||||
CHECK_ARG
|
||||
params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--cpu-strict-batch") {
|
||||
params.cpuparams_batch.strict_cpu = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--poll-batch") {
|
||||
CHECK_ARG
|
||||
params.cpuparams_batch.poll = std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "-td" || arg == "--threads-draft") {
|
||||
CHECK_ARG
|
||||
params.n_threads_draft = std::stoi(argv[i]);
|
||||
if (params.n_threads_draft <= 0) {
|
||||
params.n_threads_draft = std::thread::hardware_concurrency();
|
||||
params.draft_cpuparams.n_threads = std::stoi(argv[i]);
|
||||
if (params.draft_cpuparams.n_threads <= 0) {
|
||||
params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (arg == "-Cd" || arg == "--cpu-mask-draft") {
|
||||
CHECK_ARG
|
||||
std::string mask = argv[i];
|
||||
params.draft_cpuparams.mask_valid = true;
|
||||
invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask);
|
||||
return true;
|
||||
}
|
||||
if (arg == "-Crd" || arg == "--cpu-range-draft") {
|
||||
CHECK_ARG
|
||||
std::string range = argv[i];
|
||||
params.draft_cpuparams.mask_valid = true;
|
||||
invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--prio-draft") {
|
||||
CHECK_ARG
|
||||
params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--cpu-strict-draft") {
|
||||
params.draft_cpuparams.strict_cpu = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--poll-draft") {
|
||||
CHECK_ARG
|
||||
params.draft_cpuparams.poll = std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "-tbd" || arg == "--threads-batch-draft") {
|
||||
CHECK_ARG
|
||||
params.n_threads_batch_draft = std::stoi(argv[i]);
|
||||
if (params.n_threads_batch_draft <= 0) {
|
||||
params.n_threads_batch_draft = std::thread::hardware_concurrency();
|
||||
params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]);
|
||||
if (params.draft_cpuparams_batch.n_threads <= 0) {
|
||||
params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") {
|
||||
CHECK_ARG
|
||||
std::string range = argv[i];
|
||||
params.draft_cpuparams_batch.mask_valid = true;
|
||||
invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--prio-batch-draft") {
|
||||
CHECK_ARG
|
||||
params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--cpu-strict-batch-draft") {
|
||||
params.draft_cpuparams_batch.strict_cpu = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--poll-batch-draft") {
|
||||
CHECK_ARG
|
||||
params.draft_cpuparams_batch.poll = std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "-p" || arg == "--prompt") {
|
||||
CHECK_ARG
|
||||
params.prompt = argv[i];
|
||||
@@ -1498,11 +1757,40 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
|
||||
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
|
||||
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
|
||||
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
|
||||
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads });
|
||||
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
|
||||
options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
|
||||
options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
|
||||
"number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
|
||||
options.push_back({ "speculative", "-tbd, --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
|
||||
|
||||
#ifndef GGML_USE_OPENMP
|
||||
// these options are available only with the internal threadpool
|
||||
options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"});
|
||||
options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"});
|
||||
options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
|
||||
options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
|
||||
options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
|
||||
|
||||
options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
|
||||
options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"});
|
||||
options.push_back({ "*", " --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"});
|
||||
options.push_back({ "*", " --priority-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"});
|
||||
options.push_back({ "*", " --poll-batch <0|1>", "use polling to wait for work (default: same as --poll"});
|
||||
|
||||
options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"});
|
||||
options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"});
|
||||
options.push_back({ "speculative", " --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"});
|
||||
options.push_back({ "speculative", " --priority-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"});
|
||||
options.push_back({ "speculative", " --poll-draft <0|1>", "Use polling to wait for draft model work (default: same as --poll])"});
|
||||
|
||||
options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"});
|
||||
options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi",
|
||||
"Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"});
|
||||
options.push_back({ "speculative", " --cpu-strict-batch-draft <0|1>",
|
||||
"Use strict CPU placement for draft model (default: --cpu-strict-draft)"});
|
||||
options.push_back({ "speculative", " --priority-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"});
|
||||
options.push_back({ "speculative", " --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: --poll-draft)"});
|
||||
#endif // GGML_USE_OPENMP
|
||||
|
||||
options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
|
||||
options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
|
||||
options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
|
||||
@@ -1774,7 +2062,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
|
||||
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
|
||||
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
|
||||
options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
|
||||
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
|
||||
|
||||
printf("usage: %s [options]\n", argv[0]);
|
||||
@@ -1806,9 +2093,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
std::string gpt_params_get_system_info(const gpt_params & params) {
|
||||
std::ostringstream os;
|
||||
|
||||
os << "system_info: n_threads = " << params.n_threads;
|
||||
if (params.n_threads_batch != -1) {
|
||||
os << " (n_threads_batch = " << params.n_threads_batch << ")";
|
||||
os << "system_info: n_threads = " << params.cpuparams.n_threads;
|
||||
if (params.cpuparams_batch.n_threads != -1) {
|
||||
os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
|
||||
}
|
||||
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
|
||||
// TODO: windows + arm64 + mingw64
|
||||
@@ -2332,8 +2619,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||
cparams.n_seq_max = params.n_parallel;
|
||||
cparams.n_batch = params.n_batch;
|
||||
cparams.n_ubatch = params.n_ubatch;
|
||||
cparams.n_threads = params.n_threads;
|
||||
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||
cparams.n_threads = params.cpuparams.n_threads;
|
||||
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
|
||||
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
|
||||
cparams.seed = params.seed;
|
||||
cparams.logits_all = params.logits_all;
|
||||
cparams.embeddings = params.embedding;
|
||||
@@ -2359,6 +2647,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||
return cparams;
|
||||
}
|
||||
|
||||
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
|
||||
struct ggml_threadpool_params tpp;
|
||||
|
||||
ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
|
||||
|
||||
if (params.mask_valid) {
|
||||
std::memcpy(&tpp.cpumask, ¶ms.cpumask, GGML_MAX_N_THREADS);
|
||||
}
|
||||
|
||||
tpp.prio = params.priority;
|
||||
tpp.poll = params.poll;
|
||||
tpp.strict_cpu = params.strict_cpu;
|
||||
|
||||
return tpp;
|
||||
}
|
||||
|
||||
#ifdef LLAMA_USE_CURL
|
||||
|
||||
static bool starts_with(const std::string & str, const std::string & prefix) {
|
||||
@@ -3348,7 +3652,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
||||
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
|
||||
|
||||
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
|
||||
fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
|
||||
fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
|
||||
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
||||
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
||||
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
||||
|
||||
@@ -67,13 +67,18 @@ enum dimre_method {
|
||||
DIMRE_METHOD_MEAN,
|
||||
};
|
||||
|
||||
struct cpu_params {
|
||||
int n_threads = -1;
|
||||
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
|
||||
bool mask_valid = false; // Default: any CPU
|
||||
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
|
||||
bool strict_cpu = false; // Use strict CPU placement
|
||||
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
|
||||
};
|
||||
|
||||
struct gpt_params {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
||||
|
||||
int32_t n_threads = cpu_get_num_math();
|
||||
int32_t n_threads_draft = -1;
|
||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||
int32_t n_threads_batch_draft = -1;
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 0; // context size
|
||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||
@@ -100,6 +105,11 @@ struct gpt_params {
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
||||
|
||||
struct cpu_params cpuparams;
|
||||
struct cpu_params cpuparams_batch;
|
||||
struct cpu_params draft_cpuparams;
|
||||
struct cpu_params draft_cpuparams_batch;
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
||||
void * cb_eval_user_data = nullptr;
|
||||
|
||||
@@ -204,7 +214,7 @@ struct gpt_params {
|
||||
int32_t port = 8080; // server listens on this network port
|
||||
int32_t timeout_read = 600; // http read timeout in seconds
|
||||
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests
|
||||
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::string public_path = "";
|
||||
@@ -277,6 +287,11 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
|
||||
|
||||
std::string gpt_params_get_system_info(const gpt_params & params);
|
||||
|
||||
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
|
||||
bool set_process_priority(enum ggml_sched_priority prio);
|
||||
|
||||
//
|
||||
// String utils
|
||||
//
|
||||
@@ -327,8 +342,9 @@ struct llama_init_result {
|
||||
|
||||
struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
|
||||
|
||||
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
|
||||
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
|
||||
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
|
||||
struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
|
||||
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
|
||||
|
||||
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import ast
|
||||
import logging
|
||||
import argparse
|
||||
import contextlib
|
||||
@@ -298,9 +299,12 @@ class Model:
|
||||
gguf.MODEL_TENSOR.POS_EMBD,
|
||||
gguf.MODEL_TENSOR.TOKEN_TYPES,
|
||||
gguf.MODEL_TENSOR.SSM_CONV1D,
|
||||
gguf.MODEL_TENSOR.TIME_MIX_FIRST,
|
||||
gguf.MODEL_TENSOR.TIME_MIX_W1,
|
||||
gguf.MODEL_TENSOR.TIME_MIX_W2,
|
||||
)
|
||||
)
|
||||
or not name.endswith(".weight")
|
||||
or not new_name.endswith(".weight")
|
||||
):
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
|
||||
@@ -2716,6 +2720,84 @@ class StarCoder2Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.STARCODER2
|
||||
|
||||
|
||||
@Model.register("Rwkv6ForCausalLM")
|
||||
class Rwkv6Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.RWKV6
|
||||
|
||||
def set_vocab(self):
|
||||
assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
|
||||
vocab_size = self.hparams.get("vocab_size", 65536)
|
||||
|
||||
tokens: list[bytes] = ['<s>'.encode("utf-8")]
|
||||
toktypes: list[int] = [gguf.TokenType.CONTROL]
|
||||
|
||||
with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
|
||||
lines = f.readlines()
|
||||
for line in lines:
|
||||
parts = line.split(' ')
|
||||
assert len(parts) >= 3
|
||||
token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
|
||||
token = token.encode("utf-8") if isinstance(token, str) else token
|
||||
assert isinstance(token, bytes)
|
||||
assert len(token) == token_len
|
||||
token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff"
|
||||
tokens.append(token_text.encode("utf-8"))
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
remainder = vocab_size - len(tokens)
|
||||
assert remainder >= 0
|
||||
for i in range(len(tokens), vocab_size):
|
||||
tokens.append(f"[PAD{i}]".encode("utf-8"))
|
||||
toktypes.append(gguf.TokenType.UNUSED)
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("rwkv")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["num_hidden_layers"]
|
||||
head_size = self.hparams["head_size"]
|
||||
hidden_size = self.hparams["hidden_size"]
|
||||
layer_norm_eps = self.hparams["layer_norm_epsilon"]
|
||||
rescale_every_n_layers = self.hparams["rescale_every"]
|
||||
intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
|
||||
time_mix_extra_dim = 64 if hidden_size == 4096 else 32
|
||||
time_decay_extra_dim = 128 if hidden_size == 4096 else 64
|
||||
|
||||
# RWKV isn't context limited
|
||||
self.gguf_writer.add_context_length(1048576)
|
||||
self.gguf_writer.add_embedding_length(hidden_size)
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
|
||||
self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
|
||||
self.gguf_writer.add_wkv_head_size(head_size)
|
||||
self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
|
||||
self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
|
||||
self.gguf_writer.add_feed_forward_length(intermediate_size)
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
# required by llama.cpp, unused
|
||||
self.gguf_writer.add_head_count(0)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
new_name = self.map_tensor_name(name)
|
||||
|
||||
if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
|
||||
new_name += ".weight"
|
||||
|
||||
if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
|
||||
data_torch = data_torch.transpose(0, 1)
|
||||
|
||||
if new_name.endswith("time_mix_w2.weight"):
|
||||
data_torch = data_torch.permute(0, 2, 1)
|
||||
|
||||
rescale_every_n_layers = self.hparams["rescale_every"]
|
||||
if rescale_every_n_layers > 0:
|
||||
if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
|
||||
data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
|
||||
|
||||
yield (new_name, data_torch)
|
||||
|
||||
|
||||
@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
|
||||
class MambaModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.MAMBA
|
||||
|
||||
@@ -336,12 +336,12 @@ Choose one of following methods to run.
|
||||
- Use device 0:
|
||||
|
||||
```sh
|
||||
./examples/sycl/run_llama2.sh 0
|
||||
./examples/sycl/run-llama2.sh 0
|
||||
```
|
||||
- Use multiple devices:
|
||||
|
||||
```sh
|
||||
./examples/sycl/run_llama2.sh
|
||||
./examples/sycl/run-llama2.sh
|
||||
```
|
||||
|
||||
2. Command line
|
||||
|
||||
@@ -66,8 +66,8 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment
|
||||
|
||||
The defaults are:
|
||||
|
||||
- `CUDA_VERSION` set to `11.7.1`
|
||||
- `CUDA_DOCKER_ARCH` set to `all`
|
||||
- `CUDA_VERSION` set to `12.6.0`
|
||||
- `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures
|
||||
|
||||
The resulting images, are essentially the same as the non-CUDA images:
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
|
||||
#endif
|
||||
|
||||
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
|
||||
|
||||
if (plan.work_size > 0) {
|
||||
buf.resize(plan.work_size);
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
#endif
|
||||
|
||||
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
|
||||
|
||||
if (plan.work_size > 0) {
|
||||
buf.resize(plan.work_size);
|
||||
@@ -54,7 +54,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) {
|
||||
#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
|
||||
|
||||
struct benchmark_params_struct {
|
||||
int32_t n_threads = 1;
|
||||
int n_threads = 1;
|
||||
int32_t n_iterations = 10;
|
||||
};
|
||||
|
||||
|
||||
@@ -486,8 +486,8 @@ int main(int argc, char ** argv) {
|
||||
if (use_pca) {
|
||||
// run PCA
|
||||
PCA::pca_params pca_params;
|
||||
pca_params.n_threads = params.n_threads;
|
||||
pca_params.n_batch = params.n_pca_batch;
|
||||
pca_params.n_threads = params.cpuparams.n_threads;
|
||||
pca_params.n_batch = params.n_pca_batch;
|
||||
pca_params.n_iterations = params.n_pca_iterations;
|
||||
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
|
||||
} else {
|
||||
|
||||
@@ -410,7 +410,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
g_verbose = (params.verbosity == 1);
|
||||
try {
|
||||
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
|
||||
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
|
||||
ctx.run_merge();
|
||||
} catch (const std::exception & err) {
|
||||
fprintf(stderr, "%s\n", err.what());
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <thread>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
@@ -225,6 +226,9 @@ struct cmd_params {
|
||||
std::vector<ggml_type> type_k;
|
||||
std::vector<ggml_type> type_v;
|
||||
std::vector<int> n_threads;
|
||||
std::vector<std::string> cpu_mask;
|
||||
std::vector<bool> cpu_strict;
|
||||
std::vector<int> poll;
|
||||
std::vector<int> n_gpu_layers;
|
||||
std::vector<std::string> rpc_servers;
|
||||
std::vector<llama_split_mode> split_mode;
|
||||
@@ -236,6 +240,8 @@ struct cmd_params {
|
||||
std::vector<bool> embeddings;
|
||||
ggml_numa_strategy numa;
|
||||
int reps;
|
||||
ggml_sched_priority prio;
|
||||
int delay;
|
||||
bool verbose;
|
||||
output_formats output_format;
|
||||
output_formats output_format_stderr;
|
||||
@@ -251,6 +257,9 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* type_k */ {GGML_TYPE_F16},
|
||||
/* type_v */ {GGML_TYPE_F16},
|
||||
/* n_threads */ {cpu_get_num_math()},
|
||||
/* cpu_mask */ {"0x0"},
|
||||
/* cpu_strict */ {false},
|
||||
/* poll */ {50},
|
||||
/* n_gpu_layers */ {99},
|
||||
/* rpc_servers */ {""},
|
||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
||||
@@ -262,6 +271,8 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* embeddings */ {false},
|
||||
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
||||
/* reps */ 5,
|
||||
/* prio */ GGML_SCHED_PRIO_NORMAL,
|
||||
/* delay */ 0,
|
||||
/* verbose */ false,
|
||||
/* output_format */ MARKDOWN,
|
||||
/* output_format_stderr */ NONE,
|
||||
@@ -281,6 +292,9 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
||||
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
||||
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
||||
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
|
||||
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
|
||||
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
|
||||
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
|
||||
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
||||
@@ -292,6 +306,8 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
||||
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
|
||||
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
|
||||
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
||||
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
|
||||
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||
@@ -338,6 +354,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
|
||||
params.reps = cmd_params_defaults.reps;
|
||||
params.numa = cmd_params_defaults.numa;
|
||||
params.prio = cmd_params_defaults.prio;
|
||||
params.delay = cmd_params_defaults.delay;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
@@ -433,6 +451,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
|
||||
} else if (arg == "-C" || arg == "--cpu-mask") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<std::string>(argv[i], split_delim);
|
||||
params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
|
||||
} else if (arg == "--cpu-strict") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<bool>(argv[i], split_delim);
|
||||
params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
|
||||
} else if (arg == "--poll") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.poll.insert(params.poll.end(), p.begin(), p.end());
|
||||
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -541,6 +580,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
params.reps = std::stoi(argv[i]);
|
||||
} else if (arg == "--prio") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
|
||||
} else if (arg == "--delay") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.delay = std::stoi(argv[i]);
|
||||
} else if (arg == "-o" || arg == "--output") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -585,6 +636,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
||||
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
|
||||
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
||||
if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
|
||||
if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
|
||||
if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
|
||||
|
||||
return params;
|
||||
}
|
||||
@@ -598,6 +652,9 @@ struct cmd_params_instance {
|
||||
ggml_type type_k;
|
||||
ggml_type type_v;
|
||||
int n_threads;
|
||||
std::string cpu_mask;
|
||||
bool cpu_strict;
|
||||
int poll;
|
||||
int n_gpu_layers;
|
||||
std::string rpc_servers;
|
||||
llama_split_mode split_mode;
|
||||
@@ -667,7 +724,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
for (const auto & tv : params.type_v)
|
||||
for (const auto & nkvo : params.no_kv_offload)
|
||||
for (const auto & fa : params.flash_attn)
|
||||
for (const auto & nt : params.n_threads) {
|
||||
for (const auto & nt : params.n_threads)
|
||||
for (const auto & cm : params.cpu_mask)
|
||||
for (const auto & cs : params.cpu_strict)
|
||||
for (const auto & pl : params.poll) {
|
||||
for (const auto & n_prompt : params.n_prompt) {
|
||||
if (n_prompt == 0) {
|
||||
continue;
|
||||
@@ -681,6 +741,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .type_k = */ tk,
|
||||
/* .type_v = */ tv,
|
||||
/* .n_threads = */ nt,
|
||||
/* .cpu_mask = */ cm,
|
||||
/* .cpu_strict = */ cs,
|
||||
/* .poll = */ pl,
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .rpc_servers = */ rpc,
|
||||
/* .split_mode = */ sm,
|
||||
@@ -707,6 +770,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .type_k = */ tk,
|
||||
/* .type_v = */ tv,
|
||||
/* .n_threads = */ nt,
|
||||
/* .cpu_mask = */ cm,
|
||||
/* .cpu_strict = */ cs,
|
||||
/* .poll = */ pl,
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .rpc_servers = */ rpc,
|
||||
/* .split_mode = */ sm,
|
||||
@@ -733,6 +799,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .type_k = */ tk,
|
||||
/* .type_v = */ tv,
|
||||
/* .n_threads = */ nt,
|
||||
/* .cpu_mask = */ cm,
|
||||
/* .cpu_strict = */ cs,
|
||||
/* .poll = */ pl,
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .rpc_servers = */ rpc,
|
||||
/* .split_mode = */ sm,
|
||||
@@ -769,6 +838,9 @@ struct test {
|
||||
int n_batch;
|
||||
int n_ubatch;
|
||||
int n_threads;
|
||||
std::string cpu_mask;
|
||||
bool cpu_strict;
|
||||
int poll;
|
||||
bool has_rpc;
|
||||
ggml_type type_k;
|
||||
ggml_type type_v;
|
||||
@@ -795,6 +867,9 @@ struct test {
|
||||
n_batch = inst.n_batch;
|
||||
n_ubatch = inst.n_ubatch;
|
||||
n_threads = inst.n_threads;
|
||||
cpu_mask = inst.cpu_mask;
|
||||
cpu_strict = inst.cpu_strict;
|
||||
poll = inst.poll;
|
||||
has_rpc = !inst.rpc_servers.empty();
|
||||
type_k = inst.type_k;
|
||||
type_v = inst.type_v;
|
||||
@@ -872,13 +947,14 @@ struct test {
|
||||
"cpu_info", "gpu_info",
|
||||
"model_filename", "model_type", "model_size", "model_n_params",
|
||||
"n_batch", "n_ubatch",
|
||||
"n_threads", "type_k", "type_v",
|
||||
"n_threads", "cpu_mask", "cpu_strict", "poll",
|
||||
"type_k", "type_v",
|
||||
"n_gpu_layers", "split_mode",
|
||||
"main_gpu", "no_kv_offload", "flash_attn",
|
||||
"tensor_split", "use_mmap", "embeddings",
|
||||
"n_prompt", "n_gen", "test_time",
|
||||
"avg_ns", "stddev_ns",
|
||||
"avg_ts", "stddev_ts"
|
||||
"avg_ts", "stddev_ts",
|
||||
};
|
||||
return fields;
|
||||
}
|
||||
@@ -887,7 +963,7 @@ struct test {
|
||||
|
||||
static field_type get_field_type(const std::string & field) {
|
||||
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
|
||||
field == "n_threads" ||
|
||||
field == "n_threads" || field == "poll" ||
|
||||
field == "model_size" || field == "model_n_params" ||
|
||||
field == "n_gpu_layers" || field == "main_gpu" ||
|
||||
field == "n_prompt" || field == "n_gen" ||
|
||||
@@ -896,6 +972,7 @@ struct test {
|
||||
}
|
||||
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
||||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
|
||||
field == "cpu_strict" ||
|
||||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
|
||||
return BOOL;
|
||||
}
|
||||
@@ -928,7 +1005,8 @@ struct test {
|
||||
cpu_info, gpu_info,
|
||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||
std::to_string(n_batch), std::to_string(n_ubatch),
|
||||
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
||||
std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
|
||||
ggml_type_name(type_k), ggml_type_name(type_v),
|
||||
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
||||
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
|
||||
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
|
||||
@@ -1067,7 +1145,7 @@ struct markdown_printer : public printer {
|
||||
return -30;
|
||||
}
|
||||
if (field == "t/s") {
|
||||
return 16;
|
||||
return 20;
|
||||
}
|
||||
if (field == "size" || field == "params") {
|
||||
return 10;
|
||||
@@ -1149,6 +1227,15 @@ struct markdown_printer : public printer {
|
||||
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
|
||||
fields.emplace_back("n_threads");
|
||||
}
|
||||
if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
|
||||
fields.emplace_back("cpu_mask");
|
||||
}
|
||||
if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
|
||||
fields.emplace_back("cpu_strict");
|
||||
}
|
||||
if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
|
||||
fields.emplace_back("poll");
|
||||
}
|
||||
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
|
||||
fields.emplace_back("n_batch");
|
||||
}
|
||||
@@ -1383,6 +1470,8 @@ int main(int argc, char ** argv) {
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
set_process_priority(params.prio);
|
||||
|
||||
// initialize printer
|
||||
std::unique_ptr<printer> p = create_printer(params.output_format);
|
||||
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
|
||||
@@ -1428,6 +1517,28 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
// cool off before the test
|
||||
if (params.delay) {
|
||||
std::this_thread::sleep_for(std::chrono::seconds(params.delay));
|
||||
}
|
||||
|
||||
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
|
||||
if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
|
||||
LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
|
||||
exit(1);
|
||||
}
|
||||
tpp.strict_cpu = t.cpu_strict;
|
||||
tpp.poll = t.poll;
|
||||
tpp.prio = params.prio;
|
||||
|
||||
struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
|
||||
if (!threadpool) {
|
||||
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
llama_attach_threadpool(ctx, threadpool, NULL);
|
||||
|
||||
// warmup run
|
||||
if (t.n_prompt > 0) {
|
||||
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
|
||||
@@ -1466,6 +1577,8 @@ int main(int argc, char ** argv) {
|
||||
llama_print_timings(ctx);
|
||||
|
||||
llama_free(ctx);
|
||||
|
||||
ggml_threadpool_free(threadpool);
|
||||
}
|
||||
|
||||
llama_free_model(lmodel);
|
||||
|
||||
@@ -71,8 +71,8 @@ actor LlamaContext {
|
||||
var ctx_params = llama_context_default_params()
|
||||
ctx_params.seed = 1234
|
||||
ctx_params.n_ctx = 2048
|
||||
ctx_params.n_threads = UInt32(n_threads)
|
||||
ctx_params.n_threads_batch = UInt32(n_threads)
|
||||
ctx_params.n_threads = Int32(n_threads)
|
||||
ctx_params.n_threads_batch = Int32(n_threads)
|
||||
|
||||
let context = llama_new_context_with_model(model, ctx_params)
|
||||
guard let context else {
|
||||
|
||||
@@ -1623,7 +1623,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
|
||||
}
|
||||
}
|
||||
|
||||
inline float clip(float x, float lower, float upper) {
|
||||
inline int clip(int x, int lower, int upper) {
|
||||
return std::max(lower, std::min(x, upper));
|
||||
}
|
||||
|
||||
@@ -1827,10 +1827,6 @@ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size
|
||||
return refine_size;
|
||||
}
|
||||
|
||||
inline int clip(int x, int lower, int upper) {
|
||||
return std::max(lower, std::min(x, upper));
|
||||
}
|
||||
|
||||
static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
|
||||
std::vector<int> candidate_split_grids_nums;
|
||||
for (int i : {multiple - 1, multiple, multiple + 1}) {
|
||||
|
||||
@@ -129,14 +129,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
|
||||
if (!params->image.empty()) {
|
||||
LOG_TEE("using base64 encoded image instead of command line image path\n");
|
||||
}
|
||||
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
|
||||
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
|
||||
if (!embed) {
|
||||
LOG_TEE("%s: can't load image from prompt\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
params->prompt = remove_image_from_prompt(prompt);
|
||||
} else {
|
||||
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
|
||||
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
||||
if (!embed) {
|
||||
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
|
||||
return NULL;
|
||||
|
||||
@@ -180,7 +180,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
|
||||
|
||||
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
|
||||
auto ctx_clip = clip_init_context(params);
|
||||
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
|
||||
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
||||
if (!embeds) {
|
||||
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
|
||||
return NULL;
|
||||
|
||||
@@ -221,6 +221,40 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
LOG("%s: llama threadpool init = n_threads = %d\n",
|
||||
__func__,
|
||||
(int) params.cpuparams.n_threads
|
||||
);
|
||||
struct ggml_threadpool_params tpp_batch =
|
||||
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
|
||||
struct ggml_threadpool_params tpp =
|
||||
ggml_threadpool_params_from_cpu_params(params.cpuparams);
|
||||
|
||||
set_process_priority(params.cpuparams.priority);
|
||||
|
||||
struct ggml_threadpool * threadpool_batch = NULL;
|
||||
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
|
||||
threadpool_batch = ggml_threadpool_new(&tpp_batch);
|
||||
if (!threadpool_batch) {
|
||||
LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Start the non-batch threadpool in the paused state
|
||||
tpp.paused = true;
|
||||
}
|
||||
|
||||
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
|
||||
if (!threadpool) {
|
||||
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
|
||||
if (ctx_guidance) {
|
||||
llama_attach_threadpool(ctx_guidance, threadpool, threadpool_batch);
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
LOG("n_ctx: %d\n", n_ctx);
|
||||
@@ -989,6 +1023,9 @@ int main(int argc, char ** argv) {
|
||||
llama_sampling_free(ctx_sampling);
|
||||
llama_backend_free();
|
||||
|
||||
ggml_threadpool_free(threadpool);
|
||||
ggml_threadpool_free(threadpool_batch);
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
LOG_TEE("Log end\n");
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
@@ -2534,8 +2534,8 @@ int main(int argc, char ** argv) {
|
||||
});
|
||||
|
||||
LOG_INFO("system info", {
|
||||
{"n_threads", params.n_threads},
|
||||
{"n_threads_batch", params.n_threads_batch},
|
||||
{"n_threads", params.cpuparams.n_threads},
|
||||
{"n_threads_batch", params.cpuparams_batch.n_threads},
|
||||
{"total_threads", std::thread::hardware_concurrency()},
|
||||
{"system_info", llama_print_system_info()},
|
||||
});
|
||||
@@ -2572,7 +2572,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
auto res_error = [](httplib::Response & res, json error_data) {
|
||||
json final_response {{"error", error_data}};
|
||||
res.set_content(final_response.dump(), MIMETYPE_JSON);
|
||||
res.set_content(final_response.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
|
||||
res.status = json_value(error_data, "code", 500);
|
||||
};
|
||||
|
||||
|
||||
@@ -73,10 +73,11 @@ int main(int argc, char ** argv) {
|
||||
// load the draft model
|
||||
params.model = params.model_draft;
|
||||
params.n_gpu_layers = params.n_gpu_layers_draft;
|
||||
if (params.n_threads_draft > 0) {
|
||||
params.n_threads = params.n_threads_draft;
|
||||
if (params.draft_cpuparams.n_threads > 0) {
|
||||
params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
|
||||
}
|
||||
params.n_threads_batch = params.n_threads_batch_draft;
|
||||
|
||||
params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
|
||||
llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
|
||||
model_dft = llama_init_dft.model;
|
||||
ctx_dft = llama_init_dft.context;
|
||||
|
||||
6
flake.lock
generated
6
flake.lock
generated
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1723637854,
|
||||
"narHash": "sha256-med8+5DSWa2UnOqtdICndjDAEjxr5D7zaIiK4pn0Q7c=",
|
||||
"lastModified": 1724224976,
|
||||
"narHash": "sha256-Z/ELQhrSd7bMzTO8r7NZgi9g5emh+aRKoCdaAv5fiO0=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "c3aa7b8938b17aebd2deecf7be0636000d62a2b9",
|
||||
"rev": "c374d94f1536013ca8e92341b540eba4c22f9c62",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
@@ -7,8 +7,8 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
|
||||
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
|
||||
typedef struct ggml_backend * ggml_backend_t;
|
||||
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
|
||||
typedef struct ggml_backend * ggml_backend_t;
|
||||
|
||||
// Tensor allocator
|
||||
struct ggml_tallocr {
|
||||
|
||||
@@ -103,6 +103,7 @@ extern "C" {
|
||||
|
||||
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
||||
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
||||
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||
|
||||
// Create a backend buffer from an existing pointer
|
||||
|
||||
@@ -231,6 +231,8 @@
|
||||
#define GGML_MAX_SRC 10
|
||||
#ifndef GGML_MAX_NAME
|
||||
#define GGML_MAX_NAME 64
|
||||
#define GGML_MAX_N_THREADS 512
|
||||
|
||||
#endif
|
||||
#define GGML_MAX_OP_PARAMS 64
|
||||
#define GGML_DEFAULT_N_THREADS 4
|
||||
@@ -512,6 +514,7 @@ extern "C" {
|
||||
GGML_OP_WIN_UNPART,
|
||||
GGML_OP_GET_REL_POS,
|
||||
GGML_OP_ADD_REL_POS,
|
||||
GGML_OP_RWKV_WKV,
|
||||
|
||||
GGML_OP_UNARY,
|
||||
|
||||
@@ -546,6 +549,7 @@ extern "C" {
|
||||
GGML_UNARY_OP_SILU,
|
||||
GGML_UNARY_OP_HARDSWISH,
|
||||
GGML_UNARY_OP_HARDSIGMOID,
|
||||
GGML_UNARY_OP_EXP,
|
||||
|
||||
GGML_UNARY_OP_COUNT,
|
||||
};
|
||||
@@ -628,6 +632,29 @@ extern "C" {
|
||||
// If it returns true, the computation is aborted
|
||||
typedef bool (*ggml_abort_callback)(void * data);
|
||||
|
||||
// Scheduling priorities
|
||||
enum ggml_sched_priority {
|
||||
GGML_SCHED_PRIO_NORMAL,
|
||||
GGML_SCHED_PRIO_MEDIUM,
|
||||
GGML_SCHED_PRIO_HIGH,
|
||||
GGML_SCHED_PRIO_REALTIME
|
||||
};
|
||||
|
||||
// Threadpool params
|
||||
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
|
||||
struct ggml_threadpool_params {
|
||||
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
|
||||
int n_threads; // number of threads
|
||||
enum ggml_sched_priority prio; // thread priority
|
||||
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
|
||||
bool strict_cpu; // strict cpu placement
|
||||
bool paused; // start in paused state
|
||||
};
|
||||
|
||||
struct ggml_threadpool; // forward declaration, see ggml.c
|
||||
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
|
||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||
// since https://github.com/ggerganov/ggml/issues/287
|
||||
struct ggml_cplan {
|
||||
@@ -635,6 +662,7 @@ extern "C" {
|
||||
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
||||
|
||||
int n_threads;
|
||||
struct ggml_threadpool * threadpool;
|
||||
|
||||
// abort ggml_graph_compute when true
|
||||
ggml_abort_callback abort_callback;
|
||||
@@ -1139,6 +1167,14 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_exp(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_exp_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// normalize along rows
|
||||
GGML_API struct ggml_tensor * ggml_norm(
|
||||
struct ggml_context * ctx,
|
||||
@@ -1887,6 +1923,15 @@ extern "C" {
|
||||
struct ggml_tensor * pw,
|
||||
struct ggml_tensor * ph);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_rwkv_wkv(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * k,
|
||||
struct ggml_tensor * v,
|
||||
struct ggml_tensor * r,
|
||||
struct ggml_tensor * tf,
|
||||
struct ggml_tensor * td,
|
||||
struct ggml_tensor * state);
|
||||
|
||||
// custom operators
|
||||
|
||||
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
|
||||
@@ -2057,10 +2102,23 @@ extern "C" {
|
||||
GGML_API size_t ggml_graph_overhead(void);
|
||||
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
||||
|
||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads);
|
||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
|
||||
GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
|
||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
||||
GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||
GGML_API struct ggml_cplan ggml_graph_plan(
|
||||
const struct ggml_cgraph * cgraph,
|
||||
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
||||
struct ggml_threadpool * threadpool /* = NULL */ );
|
||||
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||
|
||||
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
||||
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
||||
|
||||
@@ -1247,7 +1247,7 @@ endif()
|
||||
|
||||
# Data types, macros and functions related to controlling CPU affinity and
|
||||
# some memory allocation are available on Linux through GNU extensions in libc
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
|
||||
add_compile_definitions(_GNU_SOURCE)
|
||||
endif()
|
||||
|
||||
|
||||
@@ -722,9 +722,11 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
||||
#endif
|
||||
|
||||
struct ggml_backend_cpu_context {
|
||||
int n_threads;
|
||||
void * work_data;
|
||||
size_t work_size;
|
||||
int n_threads;
|
||||
ggml_threadpool_t threadpool;
|
||||
|
||||
void * work_data;
|
||||
size_t work_size;
|
||||
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_data;
|
||||
@@ -759,7 +761,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
|
||||
|
||||
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
||||
|
||||
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
||||
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
||||
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
|
||||
|
||||
if (cpu_plan->cplan.work_size > 0) {
|
||||
@@ -796,7 +798,7 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
|
||||
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
||||
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
||||
|
||||
if (cpu_ctx->work_size < cplan.work_size) {
|
||||
free(cpu_ctx->work_data);
|
||||
@@ -873,6 +875,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
||||
}
|
||||
|
||||
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
||||
ctx->threadpool = NULL;
|
||||
ctx->work_data = NULL;
|
||||
ctx->work_size = 0;
|
||||
ctx->abort_callback = NULL;
|
||||
@@ -903,6 +906,18 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
||||
ctx->n_threads = n_threads;
|
||||
}
|
||||
|
||||
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
|
||||
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
||||
|
||||
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
||||
|
||||
if (ctx->threadpool && ctx->threadpool != threadpool) {
|
||||
// already had a different threadpool, pause/suspend it before switching
|
||||
ggml_threadpool_pause(ctx->threadpool);
|
||||
}
|
||||
ctx->threadpool = threadpool;
|
||||
}
|
||||
|
||||
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
|
||||
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
||||
|
||||
|
||||
1078
ggml/src/ggml.c
1078
ggml/src/ggml.c
File diff suppressed because it is too large
Load Diff
@@ -606,17 +606,29 @@ class tinyBLAS_Q0_AVX {
|
||||
case 0x44:
|
||||
mc = 4;
|
||||
nc = 4;
|
||||
#if defined(__AVX2__) && defined(__F16C__)
|
||||
gemm4xN<4>(m0, m, n0, n);
|
||||
#else
|
||||
gemm<4, 4>(m0, m, n0, n);
|
||||
#endif
|
||||
break;
|
||||
case 0x43:
|
||||
mc = 4;
|
||||
nc = 3;
|
||||
#if defined(__AVX2__) && defined(__F16C__)
|
||||
gemm4xN<3>(m0, m, n0, n);
|
||||
#else
|
||||
gemm<4, 3>(m0, m, n0, n);
|
||||
#endif
|
||||
break;
|
||||
case 0x34:
|
||||
mc = 3;
|
||||
nc = 4;
|
||||
#if defined(__AVX2__) && defined(__F16C__)
|
||||
gemmMx4<3>(m0, m, n0, n);
|
||||
#else
|
||||
gemm<3, 4>(m0, m, n0, n);
|
||||
#endif
|
||||
break;
|
||||
case 0x33:
|
||||
mc = 3;
|
||||
@@ -626,12 +638,20 @@ class tinyBLAS_Q0_AVX {
|
||||
case 0x42:
|
||||
mc = 4;
|
||||
nc = 2;
|
||||
#if defined(__AVX2__) && defined(__F16C__)
|
||||
gemm4xN<2>(m0, m, n0, n);
|
||||
#else
|
||||
gemm<4, 2>(m0, m, n0, n);
|
||||
#endif
|
||||
break;
|
||||
case 0x24:
|
||||
mc = 2;
|
||||
nc = 4;
|
||||
#if defined(__AVX2__) && defined(__F16C__)
|
||||
gemmMx4<2>(m0, m, n0, n);
|
||||
#else
|
||||
gemm<2, 4>(m0, m, n0, n);
|
||||
#endif
|
||||
break;
|
||||
#else
|
||||
case 0x44:
|
||||
@@ -639,13 +659,21 @@ class tinyBLAS_Q0_AVX {
|
||||
case 0x42:
|
||||
mc = 4;
|
||||
nc = 2;
|
||||
#if defined(__AVX2__) && defined(__F16C__)
|
||||
gemm4xN<2>(m0, m, n0, n);
|
||||
#else
|
||||
gemm<4, 2>(m0, m, n0, n);
|
||||
#endif
|
||||
break;
|
||||
case 0x34:
|
||||
case 0x24:
|
||||
mc = 2;
|
||||
nc = 4;
|
||||
#if defined(__AVX2__) && defined(__F16C__)
|
||||
gemmMx4<2>(m0, m, n0, n);
|
||||
#else
|
||||
gemm<2, 4>(m0, m, n0, n);
|
||||
#endif
|
||||
break;
|
||||
case 0x33:
|
||||
#endif
|
||||
@@ -662,7 +690,11 @@ class tinyBLAS_Q0_AVX {
|
||||
case 0x41:
|
||||
mc = 4;
|
||||
nc = 1;
|
||||
#if defined(__AVX2__) && defined(__F16C__)
|
||||
gemm4xN<1>(m0, m, n0, n);
|
||||
#else
|
||||
gemm<4, 1>(m0, m, n0, n);
|
||||
#endif
|
||||
break;
|
||||
case 0x22:
|
||||
mc = 2;
|
||||
@@ -672,7 +704,11 @@ class tinyBLAS_Q0_AVX {
|
||||
case 0x14:
|
||||
mc = 1;
|
||||
nc = 4;
|
||||
#if defined(__AVX2__) && defined(__F16C__)
|
||||
gemmMx4<1>(m0, m, n0, n);
|
||||
#else
|
||||
gemm<1, 4>(m0, m, n0, n);
|
||||
#endif
|
||||
break;
|
||||
case 0x31:
|
||||
mc = 3;
|
||||
@@ -708,6 +744,119 @@ class tinyBLAS_Q0_AVX {
|
||||
mnpack(m0, m, np, n);
|
||||
}
|
||||
|
||||
#if defined(__AVX2__) && defined(__F16C__)
|
||||
// Templated functions for gemm of dimensions 4xN
|
||||
template <int RN>
|
||||
NOINLINE void gemm4xN(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t ytiles = (m - m0) / 4;
|
||||
int64_t xtiles = (n - n0) / RN;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
if (end > tiles)
|
||||
end = tiles;
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * 4;
|
||||
int64_t jj = n0 + job % xtiles * RN;
|
||||
__m256 Cv[RN][4] = {};
|
||||
for (int64_t l = 0; l < k; ++l) {
|
||||
uint64_t a_delta = ((uint64_t)A[lda * (ii + 3) + l].d << 48) | ((uint64_t)A[lda * (ii + 2) + l].d << 32) | ((uint64_t)A[lda * (ii + 1) + l].d << 16) | (A[lda * (ii + 0) + l].d);
|
||||
// Convert delta values for four blocks to float values
|
||||
__m128 da = _mm_cvtph_ps(_mm_set_epi64x(0, a_delta));
|
||||
__m256i avec0 = load(A + lda * (ii + 0) + l);
|
||||
__m256i avec1 = load(A + lda * (ii + 1) + l);
|
||||
__m256i avec2 = load(A + lda * (ii + 2) + l);
|
||||
__m256i avec3 = load(A + lda * (ii + 3) + l);
|
||||
for (int64_t j = 0; j < RN; ++j) {
|
||||
__m128 db = _mm_set1_ps(unhalf(B[ldb * (jj + j) + l].d));
|
||||
// Computation of product of delta values for four blocks and replicate it across 256 bit lane
|
||||
__m256 dvec = _mm256_castps128_ps256(_mm_mul_ps(da, db));
|
||||
dvec = _mm256_permute2f128_ps(dvec ,dvec, 0);
|
||||
// Computation of dot product and multiplication with appropriate delta value products
|
||||
Cv[j][0] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
|
||||
updot(_mm256_sign_epi8(avec0, avec0),
|
||||
_mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec0)),
|
||||
Cv[j][0]);
|
||||
Cv[j][1] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
|
||||
updot(_mm256_sign_epi8(avec1, avec1),
|
||||
_mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec1)),
|
||||
Cv[j][1]);
|
||||
Cv[j][2] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
|
||||
updot(_mm256_sign_epi8(avec2, avec2),
|
||||
_mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec2)),
|
||||
Cv[j][2]);
|
||||
Cv[j][3] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
|
||||
updot(_mm256_sign_epi8(avec3, avec3),
|
||||
_mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec3)),
|
||||
Cv[j][3]);
|
||||
}
|
||||
}
|
||||
|
||||
for (int64_t j = 0; j < RN; ++j)
|
||||
for (int64_t i = 0; i < 4; ++i)
|
||||
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Templated functions for gemm of dimensions Mx4
|
||||
template <int RM>
|
||||
NOINLINE void gemmMx4(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
int64_t xtiles = (n - n0) / 4;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
if (end > tiles)
|
||||
end = tiles;
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * RM;
|
||||
int64_t jj = n0 + job % xtiles * 4;
|
||||
__m256 Cv[4][RM] = {};
|
||||
for (int64_t l = 0; l < k; ++l) {
|
||||
uint64_t b_delta = ((uint64_t)B[ldb * (jj + 3) + l].d << 48) | ((uint64_t)B[ldb * (jj + 2) + l].d << 32) | ((uint64_t)B[ldb * (jj + 1) + l].d << 16) | (B[ldb * (jj + 0) + l].d);
|
||||
// Convert delta values for four blocks to float values
|
||||
__m128 db = _mm_cvtph_ps(_mm_set_epi64x(0, b_delta));
|
||||
__m256i bvec0 = load(B + ldb * (jj + 0) + l);
|
||||
__m256i bvec1 = load(B + ldb * (jj + 1) + l);
|
||||
__m256i bvec2 = load(B + ldb * (jj + 2) + l);
|
||||
__m256i bvec3 = load(B + ldb * (jj + 3) + l);
|
||||
for (int64_t i = 0; i < RM; ++i) {
|
||||
__m128 da = _mm_set1_ps(unhalf((A[lda * (ii + i) + l].d)));
|
||||
// Computation of product of delta values for four blocks and replicate it across 256 bit lane
|
||||
__m256 dvec = _mm256_castps128_ps256(_mm_mul_ps(da, db));
|
||||
dvec = _mm256_permute2f128_ps(dvec ,dvec, 0);
|
||||
// Computation of dot product and multiplication with appropriate delta value products
|
||||
Cv[0][i] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
|
||||
updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
|
||||
load(A + lda * (ii + i) + l)),
|
||||
_mm256_sign_epi8(bvec0, load(A + lda * (ii + i) + l))),
|
||||
Cv[0][i]);
|
||||
Cv[1][i] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
|
||||
updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
|
||||
load(A + lda * (ii + i) + l)),
|
||||
_mm256_sign_epi8(bvec1, load(A + lda * (ii + i) + l))),
|
||||
Cv[1][i]);
|
||||
Cv[2][i] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
|
||||
updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
|
||||
load(A + lda * (ii + i) + l)),
|
||||
_mm256_sign_epi8(bvec2, load(A + lda * (ii + i) + l))),
|
||||
Cv[2][i]);
|
||||
Cv[3][i] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
|
||||
updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
|
||||
load(A + lda * (ii + i) + l)),
|
||||
_mm256_sign_epi8(bvec3, load(A + lda * (ii + i) + l))),
|
||||
Cv[3][i]);
|
||||
}
|
||||
}
|
||||
for (int64_t j = 0; j < 4; ++j)
|
||||
for (int64_t i = 0; i < RM; ++i)
|
||||
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template <int RM, int RN>
|
||||
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
|
||||
@@ -94,6 +94,9 @@ class Keys:
|
||||
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
|
||||
ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
|
||||
FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
|
||||
RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers"
|
||||
TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim"
|
||||
TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "{arch}.attention.head_count"
|
||||
@@ -132,6 +135,9 @@ class Keys:
|
||||
TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
|
||||
DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"
|
||||
|
||||
class WKV:
|
||||
HEAD_SIZE = "{arch}.wkv.head_size"
|
||||
|
||||
class Tokenizer:
|
||||
MODEL = "tokenizer.ggml.model"
|
||||
PRE = "tokenizer.ggml.pre"
|
||||
@@ -207,6 +213,7 @@ class MODEL_ARCH(IntEnum):
|
||||
GEMMA = auto()
|
||||
GEMMA2 = auto()
|
||||
STARCODER2 = auto()
|
||||
RWKV6 = auto()
|
||||
MAMBA = auto()
|
||||
XVERSE = auto()
|
||||
COMMAND_R = auto()
|
||||
@@ -270,6 +277,29 @@ class MODEL_TENSOR(IntEnum):
|
||||
SSM_A = auto()
|
||||
SSM_D = auto()
|
||||
SSM_OUT = auto()
|
||||
TIME_MIX_W1 = auto()
|
||||
TIME_MIX_W2 = auto()
|
||||
TIME_MIX_LERP_X = auto()
|
||||
TIME_MIX_LERP_K = auto()
|
||||
TIME_MIX_LERP_V = auto()
|
||||
TIME_MIX_LERP_R = auto()
|
||||
TIME_MIX_LERP_G = auto()
|
||||
TIME_MIX_LERP_W = auto()
|
||||
TIME_MIX_FIRST = auto()
|
||||
TIME_MIX_DECAY = auto()
|
||||
TIME_MIX_DECAY_W1 = auto()
|
||||
TIME_MIX_DECAY_W2 = auto()
|
||||
TIME_MIX_KEY = auto()
|
||||
TIME_MIX_VALUE = auto()
|
||||
TIME_MIX_RECEPTANCE = auto()
|
||||
TIME_MIX_GATE = auto()
|
||||
TIME_MIX_LN = auto()
|
||||
TIME_MIX_OUTPUT = auto()
|
||||
CHANNEL_MIX_LERP_K = auto()
|
||||
CHANNEL_MIX_LERP_R = auto()
|
||||
CHANNEL_MIX_KEY = auto()
|
||||
CHANNEL_MIX_RECEPTANCE = auto()
|
||||
CHANNEL_MIX_VALUE = auto()
|
||||
ATTN_Q_A = auto()
|
||||
ATTN_Q_B = auto()
|
||||
ATTN_KV_A_MQA = auto()
|
||||
@@ -337,6 +367,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.GEMMA: "gemma",
|
||||
MODEL_ARCH.GEMMA2: "gemma2",
|
||||
MODEL_ARCH.STARCODER2: "starcoder2",
|
||||
MODEL_ARCH.RWKV6: "rwkv6",
|
||||
MODEL_ARCH.MAMBA: "mamba",
|
||||
MODEL_ARCH.XVERSE: "xverse",
|
||||
MODEL_ARCH.COMMAND_R: "command-r",
|
||||
@@ -355,87 +386,110 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
|
||||
MODEL_TENSOR.TOKEN_TYPES: "token_types",
|
||||
MODEL_TENSOR.POS_EMBD: "position_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
||||
MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
|
||||
MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
|
||||
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
||||
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
||||
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
|
||||
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
||||
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
|
||||
MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
|
||||
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
|
||||
MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm",
|
||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
|
||||
MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
|
||||
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
|
||||
MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
|
||||
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
|
||||
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
|
||||
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
|
||||
MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
|
||||
MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
|
||||
MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
|
||||
MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
|
||||
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
|
||||
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
|
||||
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
|
||||
MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
|
||||
MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
|
||||
MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
|
||||
MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
|
||||
MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
|
||||
MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
|
||||
MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
|
||||
MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
|
||||
MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o",
|
||||
MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
|
||||
MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm",
|
||||
MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o",
|
||||
MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b",
|
||||
MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
|
||||
MODEL_TENSOR.TOKEN_TYPES: "token_types",
|
||||
MODEL_TENSOR.POS_EMBD: "position_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
||||
MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
|
||||
MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
|
||||
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
||||
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
||||
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
|
||||
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
||||
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
|
||||
MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
|
||||
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
|
||||
MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm",
|
||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
|
||||
MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
|
||||
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
|
||||
MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
|
||||
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
|
||||
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
|
||||
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
|
||||
MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
|
||||
MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
|
||||
MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
|
||||
MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
|
||||
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
|
||||
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
|
||||
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
|
||||
MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1",
|
||||
MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2",
|
||||
MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x",
|
||||
MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k",
|
||||
MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v",
|
||||
MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r",
|
||||
MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g",
|
||||
MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w",
|
||||
MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first",
|
||||
MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay",
|
||||
MODEL_TENSOR.TIME_MIX_DECAY_W1: "blk.{bid}.time_mix_decay_w1",
|
||||
MODEL_TENSOR.TIME_MIX_DECAY_W2: "blk.{bid}.time_mix_decay_w2",
|
||||
MODEL_TENSOR.TIME_MIX_KEY: "blk.{bid}.time_mix_key",
|
||||
MODEL_TENSOR.TIME_MIX_VALUE: "blk.{bid}.time_mix_value",
|
||||
MODEL_TENSOR.TIME_MIX_RECEPTANCE: "blk.{bid}.time_mix_receptance",
|
||||
MODEL_TENSOR.TIME_MIX_GATE: "blk.{bid}.time_mix_gate",
|
||||
MODEL_TENSOR.TIME_MIX_LN: "blk.{bid}.time_mix_ln",
|
||||
MODEL_TENSOR.TIME_MIX_OUTPUT: "blk.{bid}.time_mix_output",
|
||||
MODEL_TENSOR.CHANNEL_MIX_LERP_K: "blk.{bid}.channel_mix_lerp_k",
|
||||
MODEL_TENSOR.CHANNEL_MIX_LERP_R: "blk.{bid}.channel_mix_lerp_r",
|
||||
MODEL_TENSOR.CHANNEL_MIX_KEY: "blk.{bid}.channel_mix_key",
|
||||
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: "blk.{bid}.channel_mix_receptance",
|
||||
MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value",
|
||||
MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
|
||||
MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
|
||||
MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
|
||||
MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
|
||||
MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
|
||||
MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
|
||||
MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
|
||||
MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
|
||||
MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o",
|
||||
MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
|
||||
MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm",
|
||||
MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o",
|
||||
MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b",
|
||||
MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
|
||||
}
|
||||
|
||||
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
@@ -856,6 +910,37 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.RWKV6: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_NORM_2,
|
||||
MODEL_TENSOR.TIME_MIX_W1,
|
||||
MODEL_TENSOR.TIME_MIX_W2,
|
||||
MODEL_TENSOR.TIME_MIX_LERP_X,
|
||||
MODEL_TENSOR.TIME_MIX_LERP_K,
|
||||
MODEL_TENSOR.TIME_MIX_LERP_V,
|
||||
MODEL_TENSOR.TIME_MIX_LERP_R,
|
||||
MODEL_TENSOR.TIME_MIX_LERP_G,
|
||||
MODEL_TENSOR.TIME_MIX_LERP_W,
|
||||
MODEL_TENSOR.TIME_MIX_FIRST,
|
||||
MODEL_TENSOR.TIME_MIX_DECAY,
|
||||
MODEL_TENSOR.TIME_MIX_DECAY_W1,
|
||||
MODEL_TENSOR.TIME_MIX_DECAY_W2,
|
||||
MODEL_TENSOR.TIME_MIX_KEY,
|
||||
MODEL_TENSOR.TIME_MIX_VALUE,
|
||||
MODEL_TENSOR.TIME_MIX_RECEPTANCE,
|
||||
MODEL_TENSOR.TIME_MIX_GATE,
|
||||
MODEL_TENSOR.TIME_MIX_LN,
|
||||
MODEL_TENSOR.TIME_MIX_OUTPUT,
|
||||
MODEL_TENSOR.CHANNEL_MIX_LERP_K,
|
||||
MODEL_TENSOR.CHANNEL_MIX_LERP_R,
|
||||
MODEL_TENSOR.CHANNEL_MIX_KEY,
|
||||
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE,
|
||||
MODEL_TENSOR.CHANNEL_MIX_VALUE,
|
||||
],
|
||||
MODEL_ARCH.MAMBA: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
|
||||
@@ -670,6 +670,18 @@ class GGUFWriter:
|
||||
def add_expert_weights_scale(self, value: float) -> None:
|
||||
self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
|
||||
|
||||
def add_rescale_every_n_layers(self, count: int) -> None:
|
||||
self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)
|
||||
|
||||
def add_time_mix_extra_dim(self, dim: int) -> None:
|
||||
self.add_uint32(Keys.LLM.TIME_MIX_EXTRA_DIM.format(arch=self.arch), dim)
|
||||
|
||||
def add_time_decay_extra_dim(self, dim: int) -> None:
|
||||
self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim)
|
||||
|
||||
def add_wkv_head_size(self, size: int) -> None:
|
||||
self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
|
||||
|
||||
def add_layer_norm_eps(self, value: float) -> None:
|
||||
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
|
||||
|
||||
|
||||
@@ -27,6 +27,7 @@ class TensorNameMap:
|
||||
"embedding.word_embeddings", # chatglm
|
||||
"transformer.token_embeddings", # openelm
|
||||
"shared", # t5
|
||||
"rwkv.embeddings", # rwkv
|
||||
),
|
||||
|
||||
# Token type embeddings
|
||||
@@ -40,6 +41,7 @@ class TensorNameMap:
|
||||
"embeddings.LayerNorm", # bert
|
||||
"emb_ln", # nomic-bert
|
||||
"transformer.norm", # openelm
|
||||
"rwkv.blocks.0.pre_ln", # rwkv
|
||||
),
|
||||
|
||||
# Position embeddings
|
||||
@@ -57,6 +59,7 @@ class TensorNameMap:
|
||||
"word_embeddings_for_head", # persimmon
|
||||
"lm_head.linear", # phi2
|
||||
"output_layer", # chatglm
|
||||
"head", # rwkv
|
||||
),
|
||||
|
||||
# Output norm
|
||||
@@ -76,6 +79,7 @@ class TensorNameMap:
|
||||
"encoder.final_layernorm", # chatglm
|
||||
"transformer.norm", # openelm
|
||||
"model.norm", # nemotron
|
||||
"rwkv.ln_out", # rwkv
|
||||
),
|
||||
|
||||
# Rope frequencies
|
||||
@@ -108,12 +112,14 @@ class TensorNameMap:
|
||||
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
|
||||
"encoder.layers.{bid}.input_layernorm", # chatglm
|
||||
"transformer.layers.{bid}.attn_norm", # openelm
|
||||
"rwkv.blocks.{bid}.ln1", # rwkv
|
||||
),
|
||||
|
||||
# Attention norm 2
|
||||
MODEL_TENSOR.ATTN_NORM_2: (
|
||||
"transformer.h.{bid}.ln_attn", # falcon40b
|
||||
"transformer.h.{bid}.ln_attn", # falcon40b
|
||||
"encoder.layer.{bid}.layer_norm_1", # jina-v2-code
|
||||
"rwkv.blocks.{bid}.ln2", # rwkv
|
||||
),
|
||||
|
||||
# Attention query-key-value
|
||||
@@ -434,6 +440,98 @@ class TensorNameMap:
|
||||
"backbone.layers.{bid}.mixer.out_proj",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_W1: (
|
||||
"rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_W2: (
|
||||
"rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_LERP_X: (
|
||||
"rwkv.blocks.{bid}.attention.time_maa_x", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_LERP_K: (
|
||||
"rwkv.blocks.{bid}.attention.time_maa_k", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_LERP_V: (
|
||||
"rwkv.blocks.{bid}.attention.time_maa_v", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_LERP_R: (
|
||||
"rwkv.blocks.{bid}.attention.time_maa_r", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_LERP_G: (
|
||||
"rwkv.blocks.{bid}.attention.time_maa_g", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_LERP_W: (
|
||||
"rwkv.blocks.{bid}.attention.time_maa_w", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_FIRST: (
|
||||
"rwkv.blocks.{bid}.attention.time_faaaa", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_DECAY: (
|
||||
"rwkv.blocks.{bid}.attention.time_decay", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_DECAY_W1: (
|
||||
"rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_DECAY_W2: (
|
||||
"rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_KEY: (
|
||||
"rwkv.blocks.{bid}.attention.key", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_VALUE: (
|
||||
"rwkv.blocks.{bid}.attention.value", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
|
||||
"rwkv.blocks.{bid}.attention.receptance", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_GATE: (
|
||||
"rwkv.blocks.{bid}.attention.gate", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_LN: (
|
||||
"rwkv.blocks.{bid}.attention.ln_x", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_OUTPUT: (
|
||||
"rwkv.blocks.{bid}.attention.output", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
|
||||
"rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
|
||||
"rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CHANNEL_MIX_KEY: (
|
||||
"rwkv.blocks.{bid}.feed_forward.key", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
|
||||
"rwkv.blocks.{bid}.feed_forward.receptance", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CHANNEL_MIX_VALUE: (
|
||||
"rwkv.blocks.{bid}.feed_forward.value", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_Q_A: (
|
||||
"model.layers.{bid}.self_attn.q_a_proj", # deepseek2
|
||||
),
|
||||
|
||||
@@ -66,6 +66,7 @@ extern "C" {
|
||||
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
|
||||
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
||||
LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
|
||||
LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
|
||||
};
|
||||
|
||||
// pre-tokenization types
|
||||
@@ -267,9 +268,9 @@ extern "C" {
|
||||
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
|
||||
|
||||
// main_gpu interpretation depends on split_mode:
|
||||
// LLAMA_SPLIT_NONE: the GPU that is used for the entire model
|
||||
// LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
|
||||
// LLAMA_SPLIT_LAYER: ignored
|
||||
// LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
|
||||
// LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
|
||||
// LLAMA_SPLIT_MODE_LAYER: ignored
|
||||
int32_t main_gpu;
|
||||
|
||||
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
|
||||
@@ -304,8 +305,8 @@ extern "C" {
|
||||
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
|
||||
uint32_t n_ubatch; // physical maximum batch size
|
||||
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
|
||||
uint32_t n_threads; // number of threads to use for generation
|
||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||
int32_t n_threads; // number of threads to use for generation
|
||||
int32_t n_threads_batch; // number of threads to use for batch processing
|
||||
|
||||
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
||||
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
|
||||
@@ -428,6 +429,13 @@ extern "C" {
|
||||
//optional:
|
||||
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
|
||||
|
||||
// Optional: an auto threadpool gets created in ggml if not passed explicitly
|
||||
LLAMA_API void llama_attach_threadpool(
|
||||
struct llama_context * ctx,
|
||||
ggml_threadpool_t threadpool,
|
||||
ggml_threadpool_t threadpool_batch);
|
||||
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
|
||||
|
||||
// Call once at the end of the program - currently only used for MPI
|
||||
LLAMA_API void llama_backend_free(void);
|
||||
|
||||
@@ -837,13 +845,13 @@ extern "C" {
|
||||
// Set the number of threads used for decoding
|
||||
// n_threads is the number of threads used for generation (single token)
|
||||
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
|
||||
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
|
||||
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch);
|
||||
|
||||
// Get the number of threads used for generation of a single token.
|
||||
LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
|
||||
LLAMA_API int32_t llama_n_threads(struct llama_context * ctx);
|
||||
|
||||
// Get the number of threads used for prompt and batch processing (multiple token).
|
||||
LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
|
||||
LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
|
||||
|
||||
// Set whether the model is in embeddings mode or not
|
||||
// If true, embeddings will be returned but logits will not
|
||||
|
||||
@@ -58,17 +58,17 @@ struct naive_trie {
|
||||
auto res = children.find(c);
|
||||
if (res != children.end()) {
|
||||
return res->second.get_longest_prefix(key, len, offset + 1);
|
||||
} else {
|
||||
return std::make_pair(key, offset);
|
||||
}
|
||||
|
||||
return std::make_pair(key, offset);
|
||||
}
|
||||
struct naive_trie * traverse(const char c) {
|
||||
const struct naive_trie * traverse(const char c) const {
|
||||
auto res = children.find(c);
|
||||
if (res != children.end()) {
|
||||
return &res->second;
|
||||
} else {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
std::map<char, struct naive_trie> children;
|
||||
bool has_value;
|
||||
@@ -843,7 +843,7 @@ struct llm_tokenizer_ugm {
|
||||
// traverse the token matcher trie to find a matching token
|
||||
bool single_codepoint_token_found = false;
|
||||
const struct best_tokenization & current_best = tokenization_results[input_offset];
|
||||
struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]);
|
||||
const struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]);
|
||||
|
||||
while (prefix_offset <= input_len && node != NULL) {
|
||||
// check if we found valid token in prefix
|
||||
@@ -963,7 +963,7 @@ private:
|
||||
/*
|
||||
* This structure is a view wrapper for XOR-compressed double array (XCDA)
|
||||
* See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
|
||||
* Eeach bit-packed entry contains:
|
||||
* Each bit-packed entry contains:
|
||||
* - BASE array value in bits 10-30
|
||||
* - LCHECK array value in bits 0-7
|
||||
* - LEAF array value in bit 9
|
||||
@@ -1097,6 +1097,111 @@ private:
|
||||
struct naive_trie token_matcher;
|
||||
};
|
||||
|
||||
//
|
||||
// RWKV tokenizer
|
||||
//
|
||||
|
||||
static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escaped) {
|
||||
std::vector<uint8_t> output;
|
||||
output.reserve(escaped.size());
|
||||
|
||||
// Parser state
|
||||
bool escaping = false;
|
||||
uint8_t hex_remaining = 0;
|
||||
uint8_t hex_acc = 0;
|
||||
|
||||
// Step through characters, performing parsing
|
||||
for (const char & c : escaped) {
|
||||
// If we're parsing a hex code, interpret the next character
|
||||
if (hex_remaining != 0) {
|
||||
uint8_t value = (c >= 'a') ? (c - 'a' + 10) : (c - '0');
|
||||
hex_acc = (hex_acc << 4) + value;
|
||||
|
||||
hex_remaining -= 1;
|
||||
if (hex_remaining == 0) {
|
||||
output.push_back(hex_acc);
|
||||
hex_acc = 0;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// If we got an escape character, interpret it
|
||||
if (escaping) {
|
||||
if (c == 't') {
|
||||
output.push_back('\t');
|
||||
} else if (c == 'n') {
|
||||
output.push_back('\n');
|
||||
} else if (c == 'r') {
|
||||
output.push_back('\r');
|
||||
} else if (c == 'x') {
|
||||
hex_remaining = 2;
|
||||
} else {
|
||||
output.push_back(c);
|
||||
}
|
||||
|
||||
escaping = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (c == '\\') {
|
||||
escaping = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
output.push_back(c);
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
struct llm_tokenizer_rwkv {
|
||||
llm_tokenizer_rwkv(const llama_vocab & vocab): vocab(vocab) {
|
||||
// RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
|
||||
// For now, we decode the vocab here into the lookup we'll use for tokenization.
|
||||
|
||||
// build trie
|
||||
for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
|
||||
const auto & token = vocab.id_to_token[id];
|
||||
const auto data = llama_unescape_rwkv_token(token.text);
|
||||
token_matcher.insert((const char *) data.data(), data.size(), id);
|
||||
}
|
||||
}
|
||||
|
||||
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
||||
uint32_t position = 0;
|
||||
|
||||
while (position < text.size()) {
|
||||
const struct naive_trie * node = token_matcher.traverse(text[position]);
|
||||
if (node == NULL) {
|
||||
// no matching token found, add unknown token
|
||||
output.push_back(vocab.special_unk_id);
|
||||
position += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
// traverse the trie to find the longest matching token
|
||||
uint32_t token_id = 0;
|
||||
uint32_t token_length = 0;
|
||||
while (node != NULL) {
|
||||
if (node->has_value) {
|
||||
token_id = node->value;
|
||||
token_length = position + 1;
|
||||
}
|
||||
node = node->traverse(text[++position]);
|
||||
}
|
||||
|
||||
// add the longest matching token
|
||||
output.push_back(token_id);
|
||||
position = token_length;
|
||||
}
|
||||
}
|
||||
|
||||
const llama_vocab & vocab;
|
||||
|
||||
struct naive_trie token_matcher;
|
||||
};
|
||||
|
||||
//
|
||||
// (de-) tokenize
|
||||
//
|
||||
@@ -1401,6 +1506,23 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
|
||||
output.push_back(vocab.special_eos_id);
|
||||
}
|
||||
} break;
|
||||
case LLAMA_VOCAB_TYPE_RWKV:
|
||||
{
|
||||
for (const auto & fragment : fragment_buffer) {
|
||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||
|
||||
#ifdef PRETOKENIZERDEBUG
|
||||
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
|
||||
#endif
|
||||
|
||||
llm_tokenizer_rwkv tokenizer(vocab);
|
||||
tokenizer.tokenize(raw_text, output);
|
||||
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
||||
output.push_back(fragment.token);
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLAMA_VOCAB_TYPE_NONE:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
@@ -1616,6 +1738,17 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
|
||||
}
|
||||
break;
|
||||
}
|
||||
case LLAMA_VOCAB_TYPE_RWKV: {
|
||||
std::vector<uint8_t> result = llama_unescape_rwkv_token(token_text);
|
||||
|
||||
// If we don't have enough space, return an error
|
||||
if (result.size() > (size_t)length) {
|
||||
return -(int)result.size();
|
||||
}
|
||||
|
||||
memcpy(buf, result.data(), result.size());
|
||||
return (int)result.size();
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
581
src/llama.cpp
581
src/llama.cpp
@@ -212,6 +212,7 @@ enum llm_arch {
|
||||
LLM_ARCH_JAIS,
|
||||
LLM_ARCH_NEMOTRON,
|
||||
LLM_ARCH_EXAONE,
|
||||
LLM_ARCH_RWKV6,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -259,6 +260,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_JAIS, "jais" },
|
||||
{ LLM_ARCH_NEMOTRON, "nemotron" },
|
||||
{ LLM_ARCH_EXAONE, "exaone" },
|
||||
{ LLM_ARCH_RWKV6, "rwkv6" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
@@ -295,6 +297,9 @@ enum llm_kv {
|
||||
LLM_KV_DECODER_START_TOKEN_ID,
|
||||
LLM_KV_ATTN_LOGIT_SOFTCAPPING,
|
||||
LLM_KV_FINAL_LOGIT_SOFTCAPPING,
|
||||
LLM_KV_RESCALE_EVERY_N_LAYERS,
|
||||
LLM_KV_TIME_MIX_EXTRA_DIM,
|
||||
LLM_KV_TIME_DECAY_EXTRA_DIM,
|
||||
|
||||
LLM_KV_ATTENTION_HEAD_COUNT,
|
||||
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
||||
@@ -330,6 +335,8 @@ enum llm_kv {
|
||||
LLM_KV_SSM_TIME_STEP_RANK,
|
||||
LLM_KV_SSM_DT_B_C_RMS,
|
||||
|
||||
LLM_KV_WKV_HEAD_SIZE,
|
||||
|
||||
LLM_KV_TOKENIZER_MODEL,
|
||||
LLM_KV_TOKENIZER_PRE,
|
||||
LLM_KV_TOKENIZER_LIST,
|
||||
@@ -389,11 +396,14 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
|
||||
{ LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
|
||||
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
|
||||
{ LLM_KV_POOLING_TYPE , "%s.pooling_type" },
|
||||
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
|
||||
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
||||
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
|
||||
{ LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
|
||||
{ LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
|
||||
{ LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
|
||||
{ LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
|
||||
{ LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
|
||||
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
||||
@@ -429,6 +439,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
|
||||
{ LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
|
||||
|
||||
{ LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
|
||||
|
||||
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
|
||||
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
|
||||
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
|
||||
@@ -518,6 +530,29 @@ enum llm_tensor {
|
||||
LLM_TENSOR_SSM_A,
|
||||
LLM_TENSOR_SSM_D,
|
||||
LLM_TENSOR_SSM_OUT,
|
||||
LLM_TENSOR_TIME_MIX_W1,
|
||||
LLM_TENSOR_TIME_MIX_W2,
|
||||
LLM_TENSOR_TIME_MIX_LERP_X,
|
||||
LLM_TENSOR_TIME_MIX_LERP_W,
|
||||
LLM_TENSOR_TIME_MIX_LERP_K,
|
||||
LLM_TENSOR_TIME_MIX_LERP_V,
|
||||
LLM_TENSOR_TIME_MIX_LERP_R,
|
||||
LLM_TENSOR_TIME_MIX_LERP_G,
|
||||
LLM_TENSOR_TIME_MIX_FIRST,
|
||||
LLM_TENSOR_TIME_MIX_DECAY,
|
||||
LLM_TENSOR_TIME_MIX_DECAY_W1,
|
||||
LLM_TENSOR_TIME_MIX_DECAY_W2,
|
||||
LLM_TENSOR_TIME_MIX_KEY,
|
||||
LLM_TENSOR_TIME_MIX_VALUE,
|
||||
LLM_TENSOR_TIME_MIX_RECEPTANCE,
|
||||
LLM_TENSOR_TIME_MIX_GATE,
|
||||
LLM_TENSOR_TIME_MIX_LN,
|
||||
LLM_TENSOR_TIME_MIX_OUTPUT,
|
||||
LLM_TENSOR_CHANNEL_MIX_LERP_K,
|
||||
LLM_TENSOR_CHANNEL_MIX_LERP_R,
|
||||
LLM_TENSOR_CHANNEL_MIX_KEY,
|
||||
LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,
|
||||
LLM_TENSOR_CHANNEL_MIX_VALUE,
|
||||
LLM_TENSOR_ATTN_Q_A,
|
||||
LLM_TENSOR_ATTN_Q_B,
|
||||
LLM_TENSOR_ATTN_KV_A_MQA,
|
||||
@@ -1339,6 +1374,40 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_RWKV6,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
|
||||
{ LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
|
||||
{ LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" },
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_W, "blk.%d.time_mix_lerp_w" },
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_K, "blk.%d.time_mix_lerp_k" },
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" },
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" },
|
||||
{ LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" },
|
||||
{ LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
|
||||
{ LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
|
||||
{ LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
|
||||
{ LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" },
|
||||
{ LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
|
||||
{ LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
|
||||
{ LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
|
||||
{ LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" },
|
||||
{ LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" },
|
||||
{ LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
|
||||
{ LLM_TENSOR_CHANNEL_MIX_LERP_K, "blk.%d.channel_mix_lerp_k" },
|
||||
{ LLM_TENSOR_CHANNEL_MIX_LERP_R, "blk.%d.channel_mix_lerp_r" },
|
||||
{ LLM_TENSOR_CHANNEL_MIX_KEY, "blk.%d.channel_mix_key" },
|
||||
{ LLM_TENSOR_CHANNEL_MIX_VALUE, "blk.%d.channel_mix_value" },
|
||||
{ LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
@@ -2151,6 +2220,7 @@ enum e_model {
|
||||
MODEL_1B,
|
||||
MODEL_1_3B,
|
||||
MODEL_1_4B,
|
||||
MODEL_1_6B,
|
||||
MODEL_2B,
|
||||
MODEL_2_8B,
|
||||
MODEL_3B,
|
||||
@@ -2228,6 +2298,12 @@ struct llama_hparams {
|
||||
float f_attn_logit_softcapping = 50.0f;
|
||||
float f_final_logit_softcapping = 30.0f;
|
||||
|
||||
// for RWKV
|
||||
uint32_t rescale_every_n_layers = 0;
|
||||
uint32_t time_mix_extra_dim = 0;
|
||||
uint32_t time_decay_extra_dim = 0;
|
||||
uint32_t wkv_head_size = 0;
|
||||
|
||||
float rope_attn_factor = 1.0f;
|
||||
float rope_freq_base_train;
|
||||
float rope_freq_scale_train;
|
||||
@@ -2291,6 +2367,11 @@ struct llama_hparams {
|
||||
if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
|
||||
if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
|
||||
|
||||
if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
|
||||
if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true;
|
||||
if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true;
|
||||
if (this->wkv_head_size != other.wkv_head_size) return true;
|
||||
|
||||
if (this->dec_start_token_id != other.dec_start_token_id) return true;
|
||||
|
||||
const float EPSILON = 1e-9f;
|
||||
@@ -2354,15 +2435,25 @@ struct llama_hparams {
|
||||
}
|
||||
|
||||
uint32_t n_embd_k_s() const { // dimension of the rolling state embeddings
|
||||
// corresponds to Mamba's conv_states size
|
||||
// TODO: maybe support other convolution strides than 1
|
||||
// NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
|
||||
return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
|
||||
// corresponds to Mamba's conv_states size or RWKV's token_shift states size
|
||||
if (wkv_head_size != 0) {
|
||||
// for RWKV models
|
||||
return 2 * n_embd;
|
||||
} else {
|
||||
// TODO: maybe support other convolution strides than 1
|
||||
// NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
|
||||
return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
|
||||
// corresponds to Mamba's ssm_states size
|
||||
return ssm_d_state * ssm_d_inner;
|
||||
if (wkv_head_size != 0) {
|
||||
// corresponds to RWKV's wkv_states size
|
||||
return n_embd * wkv_head_size;
|
||||
} else {
|
||||
// corresponds to Mamba's ssm_states size
|
||||
return ssm_d_state * ssm_d_inner;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -2373,8 +2464,8 @@ struct llama_cparams {
|
||||
uint32_t n_batch;
|
||||
uint32_t n_ubatch;
|
||||
uint32_t n_seq_max;
|
||||
uint32_t n_threads; // number of threads to use for generation
|
||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||
int n_threads; // number of threads to use for generation
|
||||
int n_threads_batch; // number of threads to use for batch processing
|
||||
|
||||
float rope_freq_base;
|
||||
float rope_freq_scale;
|
||||
@@ -2501,6 +2592,36 @@ struct llama_layer {
|
||||
struct ggml_tensor * ssm_conv1d_b;
|
||||
struct ggml_tensor * ssm_dt_b;
|
||||
|
||||
// rwkv
|
||||
struct ggml_tensor * time_mix_w1;
|
||||
struct ggml_tensor * time_mix_w2;
|
||||
struct ggml_tensor * time_mix_lerp_x;
|
||||
struct ggml_tensor * time_mix_lerp_w;
|
||||
struct ggml_tensor * time_mix_lerp_k;
|
||||
struct ggml_tensor * time_mix_lerp_v;
|
||||
struct ggml_tensor * time_mix_lerp_r;
|
||||
struct ggml_tensor * time_mix_lerp_g;
|
||||
|
||||
struct ggml_tensor * time_mix_first;
|
||||
struct ggml_tensor * time_mix_decay;
|
||||
struct ggml_tensor * time_mix_decay_w1;
|
||||
struct ggml_tensor * time_mix_decay_w2;
|
||||
struct ggml_tensor * time_mix_key;
|
||||
struct ggml_tensor * time_mix_value;
|
||||
struct ggml_tensor * time_mix_receptance;
|
||||
struct ggml_tensor * time_mix_gate;
|
||||
|
||||
struct ggml_tensor * time_mix_ln;
|
||||
struct ggml_tensor * time_mix_ln_b;
|
||||
struct ggml_tensor * time_mix_output;
|
||||
|
||||
struct ggml_tensor * channel_mix_lerp_k;
|
||||
struct ggml_tensor * channel_mix_lerp_r;
|
||||
|
||||
struct ggml_tensor * channel_mix_key;
|
||||
struct ggml_tensor * channel_mix_receptance;
|
||||
struct ggml_tensor * channel_mix_value;
|
||||
|
||||
// long rope factors
|
||||
struct ggml_tensor * rope_long = nullptr;
|
||||
struct ggml_tensor * rope_short = nullptr;
|
||||
@@ -3091,6 +3212,9 @@ struct llama_context {
|
||||
#endif
|
||||
ggml_backend_t backend_cpu = nullptr;
|
||||
|
||||
ggml_threadpool_t threadpool = nullptr;
|
||||
ggml_threadpool_t threadpool_batch = nullptr;
|
||||
|
||||
bool has_evaluated_once = false;
|
||||
|
||||
int64_t t_start_us;
|
||||
@@ -3423,7 +3547,7 @@ static bool llama_kv_cache_find_slot(
|
||||
const uint32_t n_seq_tokens = batch.n_seq_tokens;
|
||||
|
||||
if (cache.recurrent) {
|
||||
// For recurrent state architectures (like Mamba),
|
||||
// For recurrent state architectures (like Mamba or RWKV),
|
||||
// each cache cell can store the state for a whole sequence.
|
||||
// A slot should be always be contiguous.
|
||||
|
||||
@@ -3672,7 +3796,7 @@ static bool llama_kv_cache_seq_rm(
|
||||
if (p0 < 0) p0 = 0;
|
||||
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
|
||||
|
||||
// models like Mamba can't have a state partially erased
|
||||
// models like Mamba or RWKV can't have a state partially erased
|
||||
if (cache.recurrent) {
|
||||
if (seq_id >= (int64_t) cache.size) {
|
||||
// could be fatal
|
||||
@@ -3808,7 +3932,7 @@ static void llama_kv_cache_seq_add(
|
||||
if (p0 == p1) return;
|
||||
|
||||
if (cache.recurrent) {
|
||||
// for Mamba-like models, only the pos needs to be shifted
|
||||
// for Mamba-like or RWKV models, only the pos needs to be shifted
|
||||
if (0 <= seq_id && seq_id < (int64_t) cache.size) {
|
||||
const int32_t tail_id = cache.cells[seq_id].tail;
|
||||
if (tail_id >= 0) {
|
||||
@@ -3857,7 +3981,7 @@ static void llama_kv_cache_seq_div(
|
||||
if (p0 == p1) return;
|
||||
|
||||
if (cache.recurrent) {
|
||||
// for Mamba-like models, only the pos needs to be changed
|
||||
// for Mamba-like or RWKV models, only the pos needs to be changed
|
||||
if (0 <= seq_id && seq_id < (int64_t) cache.size) {
|
||||
const int32_t tail_id = cache.cells[seq_id].tail;
|
||||
if (tail_id >= 0) {
|
||||
@@ -5048,6 +5172,7 @@ static const char * llama_model_type_name(e_model type) {
|
||||
case MODEL_1B: return "1B";
|
||||
case MODEL_1_3B: return "1.3B";
|
||||
case MODEL_1_4B: return "1.4B";
|
||||
case MODEL_1_6B: return "1.6B";
|
||||
case MODEL_2B: return "2B";
|
||||
case MODEL_2_8B: return "2.8B";
|
||||
case MODEL_3B: return "3B";
|
||||
@@ -5094,6 +5219,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
|
||||
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
||||
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
|
||||
case LLAMA_VOCAB_TYPE_UGM: return "UGM";
|
||||
case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
@@ -5790,6 +5916,26 @@ static void llm_load_hparams(
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_RWKV6:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
|
||||
ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
|
||||
ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
|
||||
ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 24: model.type = e_model::MODEL_1_6B; break;
|
||||
case 32:
|
||||
switch (hparams.n_embd) {
|
||||
case 2560: model.type = e_model::MODEL_3B; break;
|
||||
case 4096: model.type = e_model::MODEL_7B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
} break;
|
||||
case 61: model.type = e_model::MODEL_14B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
default: (void)0;
|
||||
}
|
||||
|
||||
@@ -5919,6 +6065,15 @@ static void llm_load_vocab(
|
||||
}
|
||||
#endif
|
||||
}
|
||||
} else if (tokenizer_model == "rwkv") {
|
||||
vocab.type = LLAMA_VOCAB_TYPE_RWKV;
|
||||
|
||||
// default special tokens
|
||||
vocab.special_bos_id = -1;
|
||||
vocab.special_eos_id = -1;
|
||||
vocab.special_unk_id = -1;
|
||||
vocab.special_sep_id = -1;
|
||||
vocab.special_pad_id = -1;
|
||||
} else {
|
||||
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
|
||||
}
|
||||
@@ -6050,6 +6205,12 @@ static void llm_load_vocab(
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
vocab.tokenizer_add_bos = false;
|
||||
vocab.tokenizer_add_eos = true;
|
||||
} else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
vocab.tokenizer_add_space_prefix = false;
|
||||
vocab.tokenizer_clean_spaces = false;
|
||||
vocab.tokenizer_add_bos = false;
|
||||
vocab.tokenizer_add_eos = false;
|
||||
} else {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
}
|
||||
@@ -6154,6 +6315,10 @@ static void llm_load_vocab(
|
||||
}
|
||||
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
|
||||
vocab.linefeed_id = vocab.special_pad_id;
|
||||
} else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
|
||||
const std::vector<int> ids = llama_tokenize_internal(vocab, "\n", false);
|
||||
GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
|
||||
vocab.linefeed_id = ids[0];
|
||||
} else {
|
||||
const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
|
||||
GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
|
||||
@@ -8200,6 +8365,68 @@ static bool llm_load_tensors(
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_RWKV6:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
|
||||
// Block 0, LN0
|
||||
model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
|
||||
model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
|
||||
|
||||
// output
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
||||
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
||||
|
||||
const int time_mix_extra_dim = hparams.time_mix_extra_dim;
|
||||
const int time_decay_extra_dim = hparams.time_decay_extra_dim;
|
||||
const int head_size = hparams.wkv_head_size;
|
||||
const int attn_hidden_size = n_embd;
|
||||
const int ffn_size = hparams.n_ff_arr[0];
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
ggml_context * ctx_layer = ctx_for_layer(i);
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
||||
|
||||
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
|
||||
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
|
||||
|
||||
layer.time_mix_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5});
|
||||
layer.time_mix_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5});
|
||||
|
||||
layer.time_mix_lerp_x = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1});
|
||||
layer.time_mix_lerp_w = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1});
|
||||
layer.time_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
|
||||
layer.time_mix_lerp_v = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1});
|
||||
layer.time_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
|
||||
layer.time_mix_lerp_g = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1});
|
||||
|
||||
layer.time_mix_first = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size});
|
||||
layer.time_mix_decay = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd});
|
||||
layer.time_mix_decay_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim});
|
||||
layer.time_mix_decay_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size});
|
||||
layer.time_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd});
|
||||
layer.time_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd});
|
||||
layer.time_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd});
|
||||
layer.time_mix_gate = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd});
|
||||
|
||||
layer.time_mix_ln = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd});
|
||||
layer.time_mix_ln_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd});
|
||||
layer.time_mix_output = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size});
|
||||
|
||||
layer.channel_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
|
||||
layer.channel_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
|
||||
|
||||
layer.channel_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size});
|
||||
layer.channel_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd});
|
||||
layer.channel_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd});
|
||||
}
|
||||
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
@@ -9159,6 +9386,171 @@ static struct ggml_tensor * llm_build_mamba(
|
||||
return cur;
|
||||
}
|
||||
|
||||
static struct ggml_tensor * llm_build_rwkv6_time_mix(
|
||||
struct llama_context & lctx,
|
||||
struct ggml_context * ctx,
|
||||
const struct llama_layer * layer,
|
||||
struct ggml_tensor * cur,
|
||||
struct ggml_tensor * x_prev,
|
||||
struct ggml_tensor ** wkv_state) {
|
||||
size_t n_embed = cur->ne[0];
|
||||
size_t n_seq_tokens = cur->ne[1];
|
||||
size_t n_seqs = cur->ne[2];
|
||||
|
||||
size_t head_size = layer->time_mix_first->ne[0];
|
||||
size_t head_count = layer->time_mix_first->ne[1];
|
||||
|
||||
size_t n_tokens = n_seqs * n_seq_tokens;
|
||||
|
||||
struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur);
|
||||
|
||||
sx = ggml_reshape_2d(ctx, sx, n_embed, n_tokens);
|
||||
cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
|
||||
|
||||
struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur);
|
||||
|
||||
xxx = ggml_reshape_4d(
|
||||
ctx,
|
||||
ggml_tanh(
|
||||
ctx,
|
||||
ggml_mul_mat(ctx, layer->time_mix_w1, xxx)
|
||||
),
|
||||
layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens
|
||||
);
|
||||
|
||||
xxx = ggml_cont(ctx, ggml_permute(ctx, xxx, 0, 1, 3, 2));
|
||||
|
||||
xxx = ggml_mul_mat(
|
||||
ctx,
|
||||
ggml_reshape_4d(
|
||||
ctx,
|
||||
layer->time_mix_w2,
|
||||
layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5
|
||||
),
|
||||
xxx
|
||||
);
|
||||
|
||||
struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0);
|
||||
struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float));
|
||||
struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float));
|
||||
struct ggml_tensor *mr = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 3 * sizeof(float));
|
||||
struct ggml_tensor *mg = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 4 * sizeof(float));
|
||||
|
||||
struct ggml_tensor * xw = ggml_add(
|
||||
ctx,
|
||||
ggml_mul(
|
||||
ctx,
|
||||
ggml_add(ctx, mw, layer->time_mix_lerp_w),
|
||||
sx
|
||||
),
|
||||
cur
|
||||
);
|
||||
|
||||
struct ggml_tensor * xk = ggml_add(
|
||||
ctx,
|
||||
ggml_mul(
|
||||
ctx,
|
||||
ggml_add(ctx, mk, layer->time_mix_lerp_k),
|
||||
sx
|
||||
),
|
||||
cur
|
||||
);
|
||||
|
||||
struct ggml_tensor * xv = ggml_add(
|
||||
ctx,
|
||||
ggml_mul(
|
||||
ctx,
|
||||
ggml_add(ctx, mv, layer->time_mix_lerp_v),
|
||||
sx
|
||||
),
|
||||
cur
|
||||
);
|
||||
|
||||
struct ggml_tensor * xr = ggml_add(
|
||||
ctx,
|
||||
ggml_mul(
|
||||
ctx,
|
||||
ggml_add(ctx, mr, layer->time_mix_lerp_r),
|
||||
sx
|
||||
),
|
||||
cur
|
||||
);
|
||||
|
||||
struct ggml_tensor * xg = ggml_add(
|
||||
ctx,
|
||||
ggml_mul(
|
||||
ctx,
|
||||
ggml_add(ctx, mg, layer->time_mix_lerp_g),
|
||||
sx
|
||||
),
|
||||
cur
|
||||
);
|
||||
|
||||
struct ggml_tensor * r = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr), head_size, 1, head_count, n_tokens);
|
||||
struct ggml_tensor * k = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk), 1, head_size, head_count, n_tokens);
|
||||
struct ggml_tensor * v = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv), head_size, 1, head_count, n_tokens);
|
||||
struct ggml_tensor * g = ggml_silu(
|
||||
ctx,
|
||||
llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg)
|
||||
);
|
||||
|
||||
struct ggml_tensor * w = ggml_mul_mat(
|
||||
ctx,
|
||||
layer->time_mix_decay_w2,
|
||||
ggml_tanh(
|
||||
ctx,
|
||||
ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw)
|
||||
)
|
||||
);
|
||||
|
||||
w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed));
|
||||
w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w)));
|
||||
w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);
|
||||
|
||||
k = ggml_transpose(ctx, k);
|
||||
v = ggml_transpose(ctx, v);
|
||||
r = ggml_transpose(ctx, r);
|
||||
|
||||
struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
|
||||
cur = ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0);
|
||||
*wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float));
|
||||
|
||||
// group norm with head_count groups
|
||||
cur = ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens);
|
||||
cur = ggml_norm(ctx, cur, 64e-5f);
|
||||
|
||||
// Convert back to regular vectors.
|
||||
cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
|
||||
cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
|
||||
|
||||
cur = ggml_mul(ctx, cur, g);
|
||||
cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);
|
||||
|
||||
return ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs);
|
||||
}
|
||||
|
||||
static struct ggml_tensor * llm_build_rwkv6_channel_mix(
|
||||
struct llama_context & lctx,
|
||||
struct ggml_context * ctx,
|
||||
const struct llama_layer * layer,
|
||||
struct ggml_tensor * cur,
|
||||
struct ggml_tensor * x_prev) {
|
||||
struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur);
|
||||
struct ggml_tensor * xk = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur);
|
||||
struct ggml_tensor * xr = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur);
|
||||
|
||||
struct ggml_tensor * r = ggml_sigmoid(ctx, llm_build_lora_mm(lctx, ctx, layer->channel_mix_receptance, xr));
|
||||
struct ggml_tensor * k = ggml_sqr(
|
||||
ctx,
|
||||
ggml_relu(
|
||||
ctx,
|
||||
llm_build_lora_mm(lctx, ctx, layer->channel_mix_key, xk)
|
||||
)
|
||||
);
|
||||
|
||||
return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
|
||||
}
|
||||
|
||||
struct llm_build_context {
|
||||
const llama_model & model;
|
||||
llama_context & lctx;
|
||||
@@ -14680,6 +15072,117 @@ struct llm_build_context {
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
ggml_cgraph * build_rwkv6() {
|
||||
ggml_cgraph *gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
// Token shift state dimensions should be 2 * n_emb
|
||||
GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
|
||||
|
||||
const int64_t n_seqs = batch.n_seqs;
|
||||
const int64_t n_seq_tokens = batch.n_seq_tokens;
|
||||
const int64_t n_tokens = batch.n_tokens;
|
||||
GGML_ASSERT(n_seqs != 0);
|
||||
GGML_ASSERT(batch.equal_seqs);
|
||||
GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
struct ggml_tensor * state_copy = build_inp_s_copy();
|
||||
struct ggml_tensor * state_mask = build_inp_s_mask();
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||
inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
const llama_layer * layer = &model.layers[il];
|
||||
|
||||
// (ab)using the KV cache to store the states
|
||||
struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0,
|
||||
gf, kv_self.k_l[il], state_copy, state_mask,
|
||||
hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs);
|
||||
struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0,
|
||||
gf, kv_self.v_l[il], state_copy, state_mask,
|
||||
hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs);
|
||||
|
||||
cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
||||
token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs);
|
||||
|
||||
struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
|
||||
struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
|
||||
|
||||
struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il);
|
||||
struct ggml_tensor * x_prev = ggml_concat(
|
||||
ctx0,
|
||||
att_shift,
|
||||
ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0),
|
||||
1
|
||||
);
|
||||
|
||||
cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states));
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
ggml_build_forward_expand(
|
||||
gf,
|
||||
ggml_cpy(
|
||||
ctx0,
|
||||
wkv_states,
|
||||
ggml_view_1d(
|
||||
ctx0,
|
||||
kv_self.v_l[il],
|
||||
hparams.n_embd_v_s() * n_seqs,
|
||||
hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il])
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
struct ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il);
|
||||
x_prev = ggml_concat(
|
||||
ctx0,
|
||||
ffn_shift,
|
||||
ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0),
|
||||
1
|
||||
);
|
||||
cur = ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev));
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att));
|
||||
struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn));
|
||||
|
||||
token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1);
|
||||
|
||||
ggml_build_forward_expand(
|
||||
gf,
|
||||
ggml_cpy(
|
||||
ctx0,
|
||||
ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0),
|
||||
ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il]))
|
||||
)
|
||||
);
|
||||
|
||||
if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
|
||||
cur = ggml_scale(ctx0, cur, 0.5F);
|
||||
}
|
||||
|
||||
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
};
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
||||
@@ -14926,6 +15429,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_exaone();
|
||||
} break;
|
||||
case LLM_ARCH_RWKV6:
|
||||
{
|
||||
result = llm.build_rwkv6();
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
@@ -15494,9 +16001,10 @@ static void llama_output_reorder(struct llama_context * ctx) {
|
||||
}
|
||||
|
||||
static void llama_graph_compute(
|
||||
llama_context & lctx,
|
||||
ggml_cgraph * gf,
|
||||
int n_threads) {
|
||||
llama_context & lctx,
|
||||
ggml_cgraph * gf,
|
||||
int n_threads,
|
||||
ggml_threadpool * threadpool) {
|
||||
#ifdef GGML_USE_METAL
|
||||
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
||||
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
||||
@@ -15505,6 +16013,7 @@ static void llama_graph_compute(
|
||||
|
||||
if (lctx.backend_cpu != nullptr) {
|
||||
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
||||
ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
|
||||
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
|
||||
}
|
||||
#ifdef GGML_USE_BLAS
|
||||
@@ -15625,6 +16134,8 @@ static int llama_decode_internal(
|
||||
}
|
||||
|
||||
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
||||
ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
|
||||
|
||||
GGML_ASSERT(n_threads > 0);
|
||||
|
||||
// non-causal masks do not use the KV cache
|
||||
@@ -15686,7 +16197,7 @@ static int llama_decode_internal(
|
||||
|
||||
llama_set_inputs(lctx, ubatch);
|
||||
|
||||
llama_graph_compute(lctx, gf, n_threads);
|
||||
llama_graph_compute(lctx, gf, n_threads, threadpool);
|
||||
|
||||
// update the kv ring buffer
|
||||
{
|
||||
@@ -15863,7 +16374,9 @@ static int llama_encode_internal(
|
||||
lctx.inp_embd_enc = NULL;
|
||||
lctx.n_outputs = n_tokens;
|
||||
|
||||
const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
||||
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
||||
ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
|
||||
|
||||
GGML_ASSERT(n_threads > 0);
|
||||
|
||||
ggml_backend_sched_reset(lctx.sched);
|
||||
@@ -15895,7 +16408,7 @@ static int llama_encode_internal(
|
||||
|
||||
llama_set_inputs(lctx, ubatch);
|
||||
|
||||
llama_graph_compute(lctx, gf, n_threads);
|
||||
llama_graph_compute(lctx, gf, n_threads, threadpool);
|
||||
|
||||
// extract embeddings
|
||||
if (embd) {
|
||||
@@ -16177,7 +16690,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
||||
|
||||
ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
|
||||
|
||||
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
|
||||
llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
|
||||
#endif
|
||||
|
||||
//const int64_t t_end = ggml_time_us();
|
||||
@@ -16203,7 +16716,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
|
||||
|
||||
llama_set_k_shift(lctx);
|
||||
|
||||
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
|
||||
llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
|
||||
|
||||
need_reserve = true;
|
||||
}
|
||||
@@ -16964,6 +17477,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
// NOTE: can't use LLM_TN here because the layer number is not known
|
||||
quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
|
||||
|
||||
// do not quantize RWKV's time_mix_first tensors
|
||||
quantize &= name.find("time_mix_first.weight") == std::string::npos;
|
||||
quantize &= name.find("time_mix_w1.weight") == std::string::npos;
|
||||
quantize &= name.find("time_mix_w2.weight") == std::string::npos;
|
||||
|
||||
// do not quantize relative position bias (T5)
|
||||
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
|
||||
|
||||
@@ -17451,6 +17969,19 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
|
||||
}
|
||||
}
|
||||
|
||||
void llama_attach_threadpool(
|
||||
struct llama_context * ctx,
|
||||
ggml_threadpool_t threadpool,
|
||||
ggml_threadpool_t threadpool_batch) {
|
||||
ctx->threadpool = threadpool;
|
||||
ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
|
||||
}
|
||||
|
||||
void llama_detach_threadpool(struct llama_context * ctx) {
|
||||
ctx->threadpool = nullptr;
|
||||
ctx->threadpool_batch = nullptr;
|
||||
}
|
||||
|
||||
void llama_backend_free(void) {
|
||||
ggml_quantize_free();
|
||||
}
|
||||
@@ -17955,6 +18486,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_T5:
|
||||
case LLM_ARCH_T5ENCODER:
|
||||
case LLM_ARCH_JAIS:
|
||||
case LLM_ARCH_RWKV6:
|
||||
return LLAMA_ROPE_TYPE_NONE;
|
||||
|
||||
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
||||
@@ -18123,6 +18655,7 @@ llama_token llama_model_decoder_start_token(const struct llama_model * model) {
|
||||
bool llama_model_is_recurrent(const struct llama_model * model) {
|
||||
switch (model->arch) {
|
||||
case LLM_ARCH_MAMBA: return true;
|
||||
case LLM_ARCH_RWKV6: return true;
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
@@ -19367,16 +19900,16 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
|
||||
}
|
||||
}
|
||||
|
||||
void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
|
||||
void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
|
||||
ctx->cparams.n_threads = n_threads;
|
||||
ctx->cparams.n_threads_batch = n_threads_batch;
|
||||
}
|
||||
|
||||
uint32_t llama_n_threads(struct llama_context * ctx) {
|
||||
int32_t llama_n_threads(struct llama_context * ctx) {
|
||||
return ctx->cparams.n_threads;
|
||||
}
|
||||
|
||||
uint32_t llama_n_threads_batch(struct llama_context * ctx) {
|
||||
int32_t llama_n_threads_batch(struct llama_context * ctx) {
|
||||
return ctx->cparams.n_threads_batch;
|
||||
}
|
||||
|
||||
|
||||
@@ -113,7 +113,7 @@ static struct ggml_tensor * get_random_tensor_f32(
|
||||
}
|
||||
|
||||
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
|
||||
|
||||
if (plan.work_size > 0) {
|
||||
buf.resize(plan.work_size);
|
||||
|
||||
Reference in New Issue
Block a user